# Data Game

## I. Beginning

In [1]:
SESSION_USER = 'agalashov'
IS_REUSE = False

### Include libraries

In [2]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')
# ---

%matplotlib inline
import pandas as pd
pd.options.display.max_columns = 100
from matplotlib import pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
# import seaborn as sns
# sns.set(color_codes=True)
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Performing grid search

pd.options.display.max_rows = 100

import time
import datetime

# Feature Importance
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# I/O tools
from lib import io_tools
from lib import preprocessing_tools
from lib import analysis_tools



### Data stats

In [3]:
def calculate_global_freq(dataframe, col_name):
    col_global_freq_name = str(col_name) + "_global_freq"
    col_global_flow_freq_name = str(col_name) + "_global_freq_flow"
    col_global_noflow_freq_name = str(col_name) + "_global_freq_noflow"
    col_global_listened_freq_name = str(col_name) + "_global_freq_listened"
    col_global_flow_listened_freq_name = str(col_name) + "_global_freq_flow_listened"    
    col_global_noflow_listened_freq_name = str(col_name) + "_global_freq_nopflow_listened"        
    
    cols = [col_name,
            col_global_freq_name,
            col_global_flow_freq_name,
            col_global_noflow_freq_name,
            col_global_listened_freq_name,
            col_global_flow_listened_freq_name,
            col_global_noflow_listened_freq_name]
    
    training_size = len(dataframe.index)
    
    print 'Executing calculations for ', col_name
    print 'Extracting listened type information'
    
    dataframe_flow = dataframe[dataframe['listen_type'] == 1]
    dataframe_noflow = dataframe[dataframe['listen_type'] == 0]
    dataframe_listened = dataframe[dataframe['is_listened'] == 1]
    dataframe_listened_flow = dataframe_listened[dataframe_listened['listen_type'] == 1]
    dataframe_listened_noflow = dataframe_listened[dataframe_listened['listen_type'] == 0]
    
    col_global_freq = dataframe[col_name].value_counts() / training_size
    col_global_flow_freq = dataframe_flow[col_name].value_counts() / training_size
    col_global_noflow_freq = dataframe_noflow[col_name].value_counts() / training_size
    col_global_listened_freq = dataframe_listened[col_name].value_counts() / training_size
    col_global_flow_listened_freq = dataframe_listened_flow[col_name].value_counts() / training_size
    col_global_noflow_listened_freq = dataframe_listened_noflow[col_name].value_counts() / training_size
    
    columns_dict = {col_global_freq_name : col_global_freq,
                    col_global_flow_freq_name: col_global_flow_freq,
                    col_global_noflow_freq_name: col_global_noflow_freq, 
                    col_global_listened_freq_name: col_global_listened_freq,
                    col_global_flow_listened_freq_name: col_global_flow_listened_freq,
                    col_global_noflow_listened_freq_name:  col_global_noflow_listened_freq
                   }
    result_dataframe = pd.DataFrame(columns_dict).reset_index().rename(columns={'index' : col_name}).fillna(value=0)
        
    return  result_dataframe
def add_global_freq(dataframe, global_freq_columns):
    result_dataframe = dataframe
    for col_name in global_freq_columns:        
        result_dataframe = pd.merge(result_dataframe, calculate_global_freq(result_dataframe, col_name), on=[col_name], how='left')
    return result_dataframe

In [4]:
def calculate_user_level_column(dataframe, col_name):
    user_ids = dataframe['user_id'].unique()
    col_freq_name = col_name + '_user_freq'
    col_freq_flow_name = col_name + '_user_freq_flow'
    col_freq_noflow_name = col_name + '_user_freq_noflow'
    col_freq_listened_name = col_name + '_user_freq_listened'
    col_freq_flow_listened_name = col_name + '_user_freq_flow_listened'
    col_freq_noflow_listened_name = col_name + '_user_freq_noflow_listened'
    cols = ['user_id', col_name, col_freq_name, col_freq_flow_name,
            col_freq_noflow_name, col_freq_listened_name, col_freq_flow_listened_name,
            col_freq_noflow_listened_name]
    result_dataframe = pd.DataFrame(columns=cols)
    print 'calculations for ', col_name, 'begin...'
    counter = 0
    for user_id in user_ids:
        dataframe_user = dataframe[dataframe['user_id'] == user_id]
        
        training_size = len(dataframe_user.index)
        
        dataframe_user_flow = dataframe_user[dataframe_user['listen_type'] == 1]
        dataframe_user_noflow = dataframe_user[dataframe_user['listen_type'] == 0]
        dataframe_user_listened = dataframe_user[dataframe_user['is_listened'] == 1]
        
        dataframe_user_flow_listened = dataframe_user_flow[dataframe_user_flow['is_listened'] == 1]
        dataframe_user_noflow_listened = dataframe_user_noflow[dataframe_user_noflow['is_listened'] == 1]
        
        col_freq_value = dataframe_user[col_name].value_counts() / training_size
        
        col_freq_flow_value = dataframe_user_flow[col_name].value_counts() / training_size
        
        col_freq_noflow_value = dataframe_user_noflow[col_name].value_counts() / training_size
        
        col_freq_listened_value = dataframe_user_listened[col_name].value_counts() / training_size

        col_freq_flow_listened_value = dataframe_user_flow_listened[col_name].value_counts() / training_size
        
        col_freq_noflow_listened_value = dataframe_user_noflow_listened[col_name].value_counts() / training_size
        
        columns_dict = {
            col_freq_name : col_freq_value,
            col_freq_flow_name : col_freq_flow_value,
            col_freq_noflow_name : col_freq_noflow_value,
            col_freq_listened_name : col_freq_listened_value,
            col_freq_flow_listened_name : col_freq_flow_listened_value,
            col_freq_noflow_listened_name : col_freq_noflow_listened_value,
        }
        
        df2 = pd.DataFrame(columns_dict).reset_index().rename(columns={'index' : col_name}).fillna(value=0)
        
#         df2 = pd.DataFrame([])
        
#         df2 = col_value_for_user.reset_index(name=col_freq_name).rename(columns={'index' : col_name})
        df2['user_id'] = user_id
        result_dataframe = result_dataframe.append(df2[cols])    
        counter+= 1
        
        if counter % 1000 == 0:
            print counter, user_id
    return result_dataframe
def add_user_level_frequencies(dataframe, user_frequency_columns):
    result_dataframe = dataframe
    for col_name in user_frequency_columns:
        result_dataframe = pd.merge(result_dataframe, calculate_user_level_column(result_dataframe, col_name), on=['user_id', col_name], how='left')
    return result_dataframe

In [5]:
def add_categorical_variables(dataframe, categorical_variables_list):
    def dummify(dataframe, categorical_variable):
        local_dummies = pd.get_dummies(dataframe[categorical_variable],prefix=categorical_variable)
        dataframe = pd.concat([dataframe,local_dummies],axis=1)
        return dataframe.drop(categorical_variable,axis=1)
    
    for categorical_variable in categorical_variables_list:
        dataframe = dummify(dataframe, categorical_variable)
    
    return dataframe

In [6]:
def aggregation_functions(dataframe, train_dataframe, name):
    combined = merged_datasets
    train = preprocessed_train_dataset

    avrg = train.groupby([name + '_id'])['is_listened'].mean()
    avrg_flow = train[train['listen_type'] == 1].groupby([name + '_id'])['is_listened'].mean()
    avrg_noflow = train[train['listen_type'] == 0].groupby([name + '_id'])['is_listened'].mean()

    avrg.name = 'avrg_listened_' + name
    avrg_flow.name = 'avrg_listened_' + name + '_flow'
    avrg_noflow.name = 'avrg_listened_' + name + '_noflow'

    combined = combined.join(avrg, name + '_id', 'left')
    combined = combined.join(avrg_flow, name + '_id', 'left')
    combined = combined.join(avrg_noflow, name + '_id', 'left')

    combined['avrg_listened_' + name].fillna(np.median(combined[~np.isnan(combined['avrg_listened_' + name])]['avrg_listened_' + name]), inplace=True)

    combined['avrg_listened_' + name + '_flow'].fillna(combined['avrg_listened_' + name], inplace=True)
    combined['avrg_listened_' + name + '_noflow'].fillna(combined['avrg_listened_' + name], inplace=True)

    combined['avrg_listened_' + name + '_current'] = combined['avrg_listened_' + name + '_flow']*combined['listen_type'] + combined['avrg_listened_' + name + '_noflow']*(1-combined['listen_type'])

    count_ = combined.groupby([name + '_id'])[name + '_id'].count()
    count_.name = 'count_' + name
    combined = combined.join(count_, name + '_id', 'left')
    
    return combined

In [7]:
def transform_dates(dataframe):
    conv_release_date_to_year = lambda x: datetime.datetime.strptime(str(x), '%Y%m%d').strftime('%Y')
    conv_timestamp_to_hour = lambda x: datetime.datetime.utcfromtimestamp(x).hour
    conv_timestamp_to_weekday = lambda x: datetime.datetime.utcfromtimestamp(x).weekday()
    dataframe['release_year'] = dataframe['release_date'].apply(conv_release_date_to_year)
    dataframe['timestamp_hour'] = dataframe['ts_listen'].apply(conv_timestamp_to_hour)
    dataframe['timestamp_weekday'] = dataframe['ts_listen'].apply(conv_timestamp_to_weekday)
    
    return dataframe

In [8]:
def add_time_moments(dataframe):
    dataframe['is_morning'] = ((dataframe['timestamp_hour'] >= 6) & (dataframe['timestamp_hour'] < 12)).astype(int)
    dataframe['is_day'] = ((dataframe['timestamp_hour'] >= 12) & (dataframe['timestamp_hour'] < 18)).astype(int)
    dataframe['is_evening'] = ((dataframe['timestamp_hour'] >= 18) & (dataframe['timestamp_hour'] < 22)).astype(int)
    dataframe['is_night'] = ((dataframe['timestamp_hour'] >= 22) | (dataframe['timestamp_hour'] < 6)).astype(int)                                                                 
    return dataframe

In [9]:
def transform_userage(dataframe):
    dataframe['18-21'] = ((dataframe['user_age'] >= 18) & (dataframe['user_age'] <= 21)).astype(int)
    dataframe['22-25'] = ((dataframe['user_age'] >= 22) & (dataframe['user_age'] <= 25)).astype(int)
    dataframe['26-30'] = ((dataframe['user_age'] >= 26) & (dataframe['user_age'] <= 30)).astype(int)
    return dataframe

In [10]:
def add_contex_type(dataframe):
    dataframe['context_type_1'] = (dataframe['context_type'] == 1).astype(int)
    dataframe['context_type_5'] = (dataframe['context_type'] == 5).astype(int)
    dataframe['context_type_20'] = (dataframe['context_type'] == 20).astype(int)
    dataframe['context_type_23'] = (dataframe['context_type'] == 23).astype(int)
    dataframe['context_type_ot'] = ((dataframe['context_type'] != 1) & (dataframe['context_type'] != 5) &
                                    (dataframe['context_type'] != 20) & (dataframe['context_type'] != 23)).astype(int)
    return dataframe

In [11]:
def add_features_to_test_set(train_set, test_set):
    columns = ['media_id', 'album_id', 'artist_id', 'genre_id']
    global_columns = []
    local_columns = ['user_id']
    for col in columns:    
        local_columns.append(col + '_user_freq')
        local_columns.append(col + '_user_freq_flow')
        local_columns.append(col + '_user_freq_noflow')
        local_columns.append(col + '_user_freq_listened')
        local_columns.append(col + '_user_freq_flow_listened')
        local_columns.append(col + '_user_freq_noflow_listened')    
        local_columns.append(col)

        global_columns.append(col)
        global_columns.append(col + '_global_freq')
        global_columns.append(col + '_global_freq_flow')
        global_columns.append(col + '_global_freq_noflow')
        global_columns.append(col + '_global_freq_listened')
        global_columns.append(col + '_global_freq_flow_listened')
        global_columns.append(col + '_global_freq_nopflow_listened')
        
    test_set = pd.merge(test_set, train_set[global_columns].drop_duplicates(), on=columns, how='left')    
    columns.append('user_id')
    test_set = pd.merge(test_set, train_set[local_columns].drop_duplicates(), on=columns, how='left')
    test_set = test_set.fillna(value=0)
    return test_set

### Import Data

In [None]:
if IS_REUSE == False:
    print 'Beginning features calculation'
    #LOAD DATA
    raw_train_dataset, raw_test_dataset, SAMPLE_ID_TEST = io_tools.read_data(train_filename='./data/train.csv',
                                                         test_filename='./data/test.csv')
    columns_to_calculacte = ['media_id', 'genre_id', 'album_id', 'user_id', 'artist_id']
    for col in columns_to_calculacte:
        print 'For column=', col,' train has ', len(raw_train_dataset[col].unique()), ' and test has ', len(raw_test_dataset[col].unique()),' values'
        
    preprocessed_train_dataset, preprocessed_test_dataset = preprocessing_tools.preprocess_data(raw_train_dataset, raw_test_dataset)    
    
    # Transform data formats
    preprocessed_train_dataset = transform_dates(preprocessed_train_dataset)
    preprocessed_test_dataset = transform_dates(preprocessed_test_dataset)
    
    # Add time moments
    preprocessed_train_dataset = add_time_moments(preprocessed_train_dataset)
    preprocessed_test_dataset = add_time_moments(preprocessed_test_dataset)
    
    # Transform userage
    preprocessed_train_dataset = transform_userage(preprocessed_train_dataset)
    preprocessed_test_dataset = transform_userage(preprocessed_test_dataset)
    
    # Add context type
    preprocessed_train_dataset = add_contex_type(preprocessed_train_dataset)
    preprocessed_test_dataset = add_contex_type(preprocessed_test_dataset)
    
    columns = ['media_id', 'album_id', 'artist_id', 'genre_id']
    preprocessed_train_dataset = add_global_freq(preprocessed_train_dataset, columns)
    preprocessed_train_dataset = add_user_level_frequencies(preprocessed_train_dataset, columns)
    
    preprocessed_test_dataset = add_features_to_test_set(preprocessed_train_dataset, preprocessed_test_dataset)
    
    date_str = datetime.datetime.now().strftime("%d_%m_%y_%H_%M")
    train_set_name = str(date_str) + "_"+ SESSION_USER + "_train.csv"
    test_set_name = str(date_str) + "_"+ SESSION_USER + "_test.csv"
    
    preprocessed_train_dataset.to_csv('./preprocessed_features/'+str(train_set_name))
    preprocessed_test_dataset.to_csv('./preprocessed_features/'+str(test_set_name))
    del raw_train_dataset
    del raw_test_dataset
else:
    print 'Reusing precalculated features'


Beginning features calculation
For column= media_id  train has  452975  and test has  9732  values
For column= genre_id  train has  2922  and test has  455  values
For column= album_id  train has  151471  and test has  7015  values
For column= user_id  train has  19918  and test has  19918  values
For column= artist_id  train has  67142  and test has  3950  values
17000 19120
18000 14268
19000 19202
5000 4254
6000 1616
7000 8151
8000 11632
9000 12992
10000 15942
11000 17694
12000 12283


In [None]:
print 'da'

In [None]:
preprocessed_train_dataset = pd.read_csv('important_preprocessed_train.csv')
preprocessed_train_dataset.drop(['Unnamed: 0'], axis=1, inplace=True)

In [None]:
raw_train_dataset, raw_test_dataset, SAMPLE_ID_TEST = io_tools.read_data(train_filename='./data/train.csv',
                                                         test_filename='./data/test.csv')

In [None]:

columns = ['media_id', 'album_id', 'artist_id', 'genre_id']
global_columns = []
local_columns = ['user_id']
for col in columns:    
    local_columns.append(col + '_user_freq')
    local_columns.append(col + '_user_freq_flow')
    local_columns.append(col + '_user_freq_noflow')
    local_columns.append(col + '_user_freq_listened')
    local_columns.append(col + '_user_freq_flow_listened')
    local_columns.append(col + '_user_freq_noflow_listened')    
    local_columns.append(col)
    
    global_columns.append(col)
    global_columns.append(col + '_global_freq')
    global_columns.append(col + '_global_freq_flow')
    global_columns.append(col + '_global_freq_noflow')
    global_columns.append(col + '_global_freq_listened')
    global_columns.append(col + '_global_freq_flow_listened')
    global_columns.append(col + '_global_freq_nopflow_listened')
    
preprocessed_test_dataset = pd.merge(preprocessed_test_dataset, preprocessed_train_dataset[columns_to_extract].drop_duplicates(), on=columns, how='left')    
columns.append('user_id')
preprocessed_test_dataset = pd.merge(preprocessed_test_dataset, preprocessed_train_dataset[local_columns].drop_duplicates(), on=columns, how='left')


In [None]:
preprocessed_test_dataset = preprocessed_test_dataset.fillna(value=0)

In [None]:
preprocessed_test_dataset.to_csv('./important_preprocessed_testset.csv')

In [None]:
del raw_train_dataset
del raw_test_dataset

In [None]:
def clean_before(dataframe):
    dataframe.drop(['genre_id', 'ts_listen', 'media_id',
                   'album_id', 'release_date', 'user_id', 'artist_id', 'release_date'], axis=1, inplace=True)
    dataframe.drop(['release_year', 'timestamp_weekday'], axis=1, inplace=True)
    return dataframe

In [None]:
preprocessed_train_dataset.columns

In [None]:
col_name = 'media_id'
print preprocessed_train_dataset[[col_name + str('_user_freq'), 'is_listened']].corr()
print preprocessed_train_dataset[[col_name + str('_user_freq_flow'), 'is_listened']].corr()
print preprocessed_train_dataset[[col_name + str('_user_freq_noflow'), 'is_listened']].corr()
print preprocessed_train_dataset[[col_name + str('_user_freq_listened'), 'is_listened']].corr()
print preprocessed_train_dataset[[col_name + str('_user_freq_flow_listened'), 'is_listened']].corr()
print preprocessed_train_dataset[[col_name + str('_user_freq_noflow_listened'), 'is_listened']].corr()

print preprocessed_train_dataset[[col_name + str('_global_freq'), 'is_listened']].corr()
print preprocessed_train_dataset[[col_name + str('_global_freq_flow'), 'is_listened']].corr()
print preprocessed_train_dataset[[col_name + str('_global_freq_noflow'), 'is_listened']].corr()
print preprocessed_train_dataset[[col_name + str('_global_freq_listened'), 'is_listened']].corr()
print preprocessed_train_dataset[[col_name + str('_global_freq_flow_listened'), 'is_listened']].corr()
print preprocessed_train_dataset[[col_name + str('_global_noflow_listened'), 'is_listened']].corr()

In [None]:
def postprocess(dataframe):
    dataframe.drop(['genre_id', 'ts_listen', 'media_id',
                   'album_id', 'release_date', 'user_id', 'artist_id', 'release_date'], axis=1, inplace=True)
    dataframe.drop(['release_year', 'timestamp_weekday'], axis=1, inplace=True)
    dataframe = add_time_moments(dataframe)
    dataframe.drop(['timestamp_hour'], axis=1, inplace=True)
    dataframe = transform_userage(dataframe)
    dataframe.drop(['user_age'], axis=1, inplace=True)
    dataframe = add_contex_type(dataframe)
    dataframe.drop(['context_type'], axis=1, inplace=True)    
#     dataframe = pd.get_dummies(dataframe, columns=['platform_name', 'platform_family'])
    return dataframe

In [None]:
# preprocessed_test_dataset = postprocess(preprocessed_test_dataset)
# preprocessed_test_dataset = pd.get_dummies(preprocessed_test_dataset, columns=['platform_name'])
# preprocessed_test_dataset = pd.get_dummies(preprocessed_test_dataset, columns=['platform_family'])

In [None]:
# preprocessed_test_dataset.to_csv('postprocessed_new_test.csv')

In [None]:
# preprocessed_train_dataset = postprocess(preprocessed_train_dataset)
# preprocessed_train_dataset = pd.get_dummies(preprocessed_train_dataset, columns=['platform_name'])
# preprocessed_train_dataset = pd.get_dummies(preprocessed_train_dataset, columns=['platform_family'])
# preprocessed_test_dataset = postprocess(preprocessed_test_dataset)

In [None]:
# preprocessed_train_dataset.to_csv('postprocessed_new_train.csv')

In [None]:
pd.get_dummies(preprocessed_train_dataset, columns=['platform_name'])

In [None]:
del preprocessed_test_dataset

In [None]:
preprocessed_test_dataset = postprocess(preprocessed_test_dataset)

In [None]:
preprocessed_train_dataset = preprocessed_train_dataset.fillna(value=0)
preprocessed_test_dataset = preprocessed_test_dataset.fillna(value=0)

In [None]:
# preprocessed_train_dataset.to_csv('postprocessed_new_train.csv')
# preprocessed_test_dataset.to_csv('postprocessed_new_test.csv')

In [None]:
del preprocessed_test_dataset

In [None]:
train_labels = preprocessed_train_dataset['is_listened'].as_matrix()
# preprocessed_train_dataset.drop(['is_listened'], axis=1, inplace=True)
train_set = preprocessed_train_dataset.as_matrix()
del preprocessed_train_dataset

In [None]:
amount_of_data = 1.0
n = np.shape(train_set)[0]
permutation = np.array(range(n))
np.random.shuffle(permutation)
part_of_data = train_set[:int(n * amount_of_data), :]
part_of_labels = train_labels[:int(n * amount_of_data)]

In [None]:
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

def crossvalidate_me(data, labels, depths, estimators, pca_components):
    n_folds = 3
    for depth in depths:
        for n_estimators in estimators:
            for n_pca in pca_components:
                print depth, n_estimators, n_pca
                kf = KFold(n_splits=n_folds)
                final_accuracy = 0

                for train, test in kf.split(data):
                    training_data, training_labels = data[train], labels[train]
                    test_data, test_labels = data[test], labels[test] 
                
                    pca = PCA(n_components=n_pca)
                    train_pca = pca.fit_transform(training_data)
                    scaler = StandardScaler()
                    train_scaled = scaler.fit_transform(train_pca)
                    
                    xgb = XGBClassifier(max_depth = depth, n_estimators=n_estimators)
    
                    print 'start training'
                    xgb.fit(train_scaled, training_labels)

                    test_pca = pca.transform(test_data)
                    test_pca_scaled = scaler.transform(test_pca)        

                    current_acc = accuracy_score(test_labels, xgb.predict(test_pca_scaled))

                    final_accuracy += current_acc

                    print 'done', current_acc
                print depth, n_estimators, n_pca, final_accuracy / n_folds

In [None]:
crossvalidate_me(part_of_data, part_of_labels, [3,4,5], estimators=[80,100], pca_components=[50,40])

In [None]:
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
kf = KFold(n_splits=5)
final_accuracy = 0
for train, test in kf.split(part_of_data):
    training_data, training_labels = part_of_data[train], part_of_labels[train]
    test_data, test_labels = part_of_data[test], part_of_labels[test] 
    pca = PCA(n_components=50)
    train_pca = pca.fit_transform(training_data)
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_pca)
    
    xgb = XGBClassifier(max_depth = 4, n_estimators=100)
    
    print 'start training'
    
    xgb.fit(train_scaled, training_labels)

    test_pca = pca.transform(test_data)
    test_pca_scaled = scaler.transform(test_pca)        
    
    current_acc = accuracy_score(test_labels, xgb.predict(test_pca_scaled))
    
    final_accuracy += current_acc
    
    print 'done', current_acc
    
final_accuracy /= 5
print final_accuracy    

In [None]:
train_set = pd.read_csv('postprocessed_new_train.csv')
# train_labels = train_set['is_listened'].as_matrix()
# train_set.drop(['is_listened', 'Unnamed: 0'], axis=1, inplace=True)

In [None]:
columns_to_drop = ['media_id_global_freq_flow',
                    'media_id_global_freq',
                    'media_id_global_freq_noflow',
                    'media_id_user_freq_flow', 
                    'media_id_user_freq_noflow',  
                   
                    'genre_id_global_freq',
                    'genre_id_global_freq_flow',
                    'genre_id_global_freq_flow_listened',
                    'genre_id_global_freq_listened',
                    'genre_id_global_freq_noflow',
                    'genre_id_global_freq_nopflow_listened',                    
                    'genre_id_user_freq_flow', 
                    'genre_id_user_freq_noflow',
                    
                    'artist_id_global_freq_flow',
                    'artist_id_global_freq',
                    'artist_id_global_freq_noflow',
                    'artist_id_user_freq_flow', 
                    'artist_id_user_freq_noflow',
                    
                    'album_id_global_freq_flow',
                    'album_id_global_freq',
                    'album_id_global_freq_noflow',
                    'album_id_user_freq_flow', 
                    'album_id_user_freq_noflow',
                   ]
#'genre_id', 
# colnames = ['media_id', 'album_id', 'artist_id']
# columns_to_drop = []

# for col in colnames:
#     for col_to_drop in colnames_to_drop:
#         columns_to_drop.append(str(col)+str(col_to_drop))
                    
                    
    

In [None]:
train_set.drop(columns_to_drop, axis=1, inplace=True)
train_set.drop(['platform_name_0', 'platform_name_1', 'platform_name_2', 'platform_family_0', 'platform_family_1', 'platform_family_2'], axis=1, inplace=True)
train_set.drop(['Unnamed: 0'], axis=1, inplace=True)

In [None]:
train_labels = train_set['is_listened'].as_matrix()
train_set.drop(['is_listened'], axis=1, inplace=True)
train_set = train_set.as_matrix()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_set, train_labels, test_size=0.2)

In [None]:
del train_set, train_labels

In [None]:
from sklearn.decomposition import IncrementalPCA
ipca = IncrementalPCA(n_components=35, batch_size=1024)

In [None]:
X_train = ipca.fit_transform(X_train)
X_test = ipca.transform(X_test)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
3 100 begin
train_acc:  0.8586797882
test_acc: 0.85

In [None]:
print 'da'

In [None]:
depths = [3,4]
n_estimators = [100, 300, 500]
for depth in depths:
    for n_estimator in n_estimators:
        if depth == 3 and n_estimator == 100:
            continue
        else:
            print depth, n_estimator, 'begin'
            xgb = XGBClassifier(max_depth = depth, n_estimators=n_estimator)
            xgb.fit(X_train, y_train)
            predicted = xgb.predict(X_test)
            print 'train_acc: ', accuracy_score(y_train, xgb.predict(X_train))
            print 'test_acc: ', accuracy_score(y_test, predicted)

In [None]:
depths = [5, 6, 7]
n_estimators = [100, 300, 500, 1000]
for depth in depths:
    for n_estimator in n_estimators:
        print depth, n_estimator, 'begin'
        xgb = XGBClassifier(max_depth = depth, n_estimators=n_estimator)
        xgb.fit(X_train, y_train)
        predicted = xgb.predict(X_test)
        print 'train_acc: ', accuracy_score(y_train, xgb.predict(X_train))
        print 'test_acc: ', accuracy_score(y_test, predicted)

In [None]:
from sklearn.decomposition import IncrementalPCA
ipca = IncrementalPCA(n_components=35, batch_size=1024)
train_set = ipca.fit_transform(train_set)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_set = scaler.fit_transform(train_set)

In [None]:
xgb = XGBClassifier(max_depth = 5, n_estimators=1000)
xgb.fit(train_set, train_labels)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_set = scaler.fit_transform(train_set)

In [None]:
xgb = XGBClassifier(max_depth = 4, n_estimators=100)
xgb.fit(train_set, train_labels)

In [None]:
test_set = pd.read_csv('postprocessed_new_test.csv')

In [None]:
test_set.drop(columns_to_drop, axis=1, inplace=True)
test_set.drop(['platform_name_0', 'platform_name_1', 'platform_name_2', 'platform_family_0', 'platform_family_1', 'platform_family_2'], axis=1, inplace=True)
test_set.drop(['Unnamed: 0'], axis=1, inplace=True)

In [None]:
test_set = test_set.as_matrix()

In [None]:
test_set = ipca.transform(test_set)
test_set = scaler.transform(test_set)

In [None]:
predicted = xgb.predict(test_set)

In [None]:
_, _, SAMPLE_ID_TEST = io_tools.read_data(train_filename='./data/train.csv',
                                                         test_filename='./data/test.csv')

In [None]:
df_output = pd.DataFrame()
df_output['sample_id'] = SAMPLE_ID_TEST
df_output['is_listened'] = predicted
df_output[['sample_id','is_listened']].to_csv('./predictions_SANYA.csv', sep = ",", index=False)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(train_labels, xgb.predict(train_set))

In [None]:
amount_of_data = 0.75
n = np.shape(train_set)[0]
permutation = np.array(range(n))
np.random.shuffle(permutation)
part_of_data = train_set[:int(n * amount_of_data), :]
part_of_labels = train_labels[:int(n * amount_of_data)]

In [None]:
del train_set, train_labels

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=35)
pca.fit(part_of_data)

In [None]:
# for i in range(np.shape(train_set)[1]):
#     train_set[:,i] -= np.mean(train_set[:,i])
#     train_set[:,i] /= np.std(train_set[:,i])
# from sklearn import preprocessing
# train_set = preprocessing.scale(train_set)
# np.std(train_set[:,0])
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train_set)
# part_of_data = scaler.fit_transform(part_of_data)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(train_labels, xgb.predict(train_set))

In [None]:
# del ExtraTreesClassifier, GridSearchCV, SelectFromModel, cross_validation
# del x, k, xgb, plt
# del XGBClassifier
# del getsizeof
# del scaler
# del train_labels
# del analysis_tools
# del datasets
# del io_tools
# del metrics
# del preprocessing_tools

In [None]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

In [None]:
xgb = XGBClassifier(max_depth = 4, n_estimators=100)

In [None]:
part_of_labels

In [None]:
xgb.fit(train_set, train_labels)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(part_of_labels, xgb.predict(part_of_data))

In [None]:
del part_of_data, part_of_labels

In [None]:
test_set = pd.read_csv('postprocessed_new_test.csv')
test_set

In [None]:
test_set = pd.read_csv('postprocessed_new_test.csv')
test_set.drop(['Unnamed: 0'], axis=1, inplace=True)

In [None]:
test_set = test_set.as_matrix()

In [None]:
test_set = pca.transform(test_set)
test_set = scaler.transform(test_set)

In [None]:
del pca, scaler

In [None]:
predicted = xgb.predict(test_set)

In [None]:
del xgb

In [None]:
_, _, SAMPLE_ID_TEST = io_tools.read_data(train_filename='./data/train.csv',
                                                         test_filename='./data/test.csv')

In [None]:
df_output = pd.DataFrame()
df_output['sample_id'] = SAMPLE_ID_TEST
df_output['is_listened'] = predicted
df_output[['sample_id','is_listened']].to_csv('./predictions_LEXA.csv', sep = ",", index=False)

In [None]:
print xgb

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(train_labels, xgb.predict(train_set))

In [None]:
del train_labels
del train_set

In [None]:
print 'lexa'

In [None]:
preprocessed_test_dataset = pd.read_csv('preprocessed_test_set_da.csv')

In [None]:
del train_set
del train_labels

In [None]:
# Fit the model
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=-1,
 scale_pos_weight=1,
 seed=27)
xgb1.fit(part_of_data, part_of_labels)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(part_of_labels, xgb1.predict(part_of_data))

In [None]:
print 'da'

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
train_set

In [None]:
scaler.fit(train_set)

In [None]:
del train_labels

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=80)
pca.fit(train_set)
# train_set = pca.fit_transform(train_set)
# test_set = pca.transform(test_set)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train_set)
# train_set = 
# test_set = scaler.transform(train_set)

In [None]:
preprocessed_train_dataset.drop(['context_type', 'release_year'], axis=1, inplace=True)

In [None]:
preprocessed_train_dataset

In [None]:
# just_for_fun = preprocessed_train_dataset[]
# preprocessed_train_dataset[['release_year', 'is_listened']].corr()

In [None]:
preprocessed_train_dataset = pd.get_dummies(preprocessed_train_dataset, columns=['timestamp_hour'])

In [None]:
# ['timestamp_hour', 'release_year', 'context_type']

In [None]:
pd.get_dummies(preprocessed_train_dataset['timestamp_hour'])

In [None]:
# merged_datasets = preprocessing_tools.get_merged_datasets(preprocessed_train_dataset, preprocessed_test_dataset)

In [None]:
columns_to_check = ['timestamp_hour', 'timestamp_weekday', 'release_year', 'context_type', 'user_age', 'platform_name', 'platform_family', 'user_gender', 'listen_type']
for col in columns_to_check:
    print col, 'train: ', len(preprocessed_train_dataset[col].unique()), ' test: ', len(preprocessed_test_dataset[col].unique())

In [None]:
preprocessed_train_dataset.drop(['genre_id','ts_listen','media_id',
                                 'album_id', 
                                 'release_date',
                                 'user_id',
                                 'artist_id',
                                 'release_date'], axis=1, inplace=True)

In [None]:
preprocessed_train_dataset

In [None]:
pd.get_dummies(preprocessed_train_dataset, columns=['listen_type'])

In [None]:
def add_categorical_variable(train, test, variable):
    return add_categorical_variables(train, [variable]), add_categorical_variables(test, [variable])

In [None]:
add_categorical_variable(preprocessed_train_dataset, preprocessed_test_dataset, 'listen_type')

In [None]:
preprocessed_train_dataset = add_categorical_variables(preprocessed_train_dataset, ['timestamp_hour', 'timestamp_weekday', 'user_age', 'platform_name', 'platform_family', 'user_gender', 'listen_type'])
preprocessed_test_dataset = add_categorical_variables(preprocessed_test_dataset, ['timestamp_hour', 'timestamp_weekday', 'user_age', 'platform_name', 'platform_family', 'user_gender', 'listen_type'])

In [None]:
merged_datasets = add_categorical_variables(merged_datasets, ['timestamp_hour', 'timestamp_weekday', 'release_year', 'context_type', 'user_age', 'platform_name', 'platform_family', 'user_gender', 'listen_type'])
# merged_datasets = merged_datasets.drop(['ts_listen', 'genre_id', 'media_id', 'album_id', 'user_id', 'artist_id', 'release_date'], axis=1)

In [None]:
training_size = len(preprocessed_train_dataset.index)

In [None]:
train_set = (merged_datasets[0:training_size]).as_matrix()
test_set = (merged_datasets[training_size:]).as_matrix()
train_labels = preprocessed_train_dataset['is_listened'].values

In [None]:
del preprocessed_train_dataset
del preprocessed_test_dataset

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_set = scaler.fit_transform(train_set)
test_set = scaler.transform(train_set)

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
train_set = pca.fit_transform(train_set)
test_set = pca.transform(test_set)

In [None]:
# Fit the model
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=-1,
 scale_pos_weight=1,
 seed=27)
xgb1.fit(train_set, train_labels)

In [None]:
gbrpred = xgb1.predict(test_set)

In [None]:
df_output = pd.DataFrame()
df_output['sample_id'] = IDtest
df_output['is_listened'] = gbrpred
df_output[['sample_id','is_listened']].to_csv('./new_output.csv', sep = ",", index=False)

<h1> III. Feature Engineering </h1>

## Genre_id, media_id, album_id, user_id, artist_id -> aggregate (e.g. count)

### Mean of is_listened by ___ on flow or not

In [None]:
merged_datasets = aggregation_functions(merged_datasets, preprocessed_train_dataset, 'user')
merged_datasets = aggregation_functions(merged_datasets, preprocessed_train_dataset, 'artist')
merged_datasets = aggregation_functions(merged_datasets, preprocessed_train_dataset, 'media')
merged_datasets = aggregation_functions(merged_datasets, preprocessed_train_dataset, 'album')
merged_datasets = aggregation_functions(merged_datasets, preprocessed_train_dataset, 'genre')

In [None]:
merged_datasets['ts_listen']

### Compute age of song at the moment of listening (still some weird things on very few points...)

In [None]:
combined['age_song'] = combined['ts_listen'] - combined['release_date']

In [None]:
combined

## Other ideas: compute mean length for an album, an artist, a genre, mean of is_listened for each user, each artist, etc using the date 

## Using the date, we can compute the number of songs he listened in a row

### Time since previous song (what to do with the first value?)

In [None]:
combined_sorted = combined[['ts_listen', 'user_id']].sort_values(['user_id', 'ts_listen'])

In [None]:
combined_sorted['time_to_prev'] = combined_sorted['user_id']
combined_sorted['time_to_next'] = combined_sorted['user_id']

# for usr in combined_sorted['user_id'].unique():
#     lts = np.array([0] + list(combined_sorted[combined_sorted['user_id'] == usr]['ts_listen']) + [time.time()])
#     combined_sorted[combined_sorted['user_id'] == usr]['time_to_prev'] = (lts[1:] - lts[:-1])[:-1]
#     combined_sorted[combined_sorted['user_id'] == usr]['time_to_next'] = (lts[1:] - lts[:-1])[1:]

In [None]:
combined = combined.join(combined_sorted[['time_to_prev','time_to_next']], None, 'left', 'l')

In [None]:
combined

## CENTER DATA

In [None]:
for _ in combined.columns:
    if np.std(combined[_][:len(train)]) != 0:
        combined[_] = (combined[_] - np.mean(combined[_][:len(train)]))/np.std(combined[_][:len(train)])

<h1> IV. Modeling </h1>

## Separate the modified train and test sets

In [None]:
combined.drop('index',inplace=True,axis=1)
newtrain = combined[:len(train)]
newtest = combined[len(train):]

## A function to visualize the importance of the features

In [None]:
def importanceVisualisation(feature_importance, predictors, firstN = 40):
    
    plt.rcParams["figure.figsize"] = [40,10]
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.subplot(1, 2, 2)
    plt.bar(pos[-firstN:], feature_importance[sorted_idx][-firstN:], align='center')
    plt.xticks(pos[-firstN:], predictors[sorted_idx][-firstN:], rotation='vertical')
    plt.ylabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()

<h1> IV - 1. Gradient Boosting </h1>

In [None]:
from sklearn import ensemble
params = {'n_estimators': 1000, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'huber', 'verbose':1}
gbr = ensemble.GradientBoostingRegressor(**params)

In [None]:
gbr.fit(train_set, train_labels)

In [None]:
gbr.predict(test_set)

In [None]:
importanceVisualisation(gbr.feature_importances_, newtrain.columns)

<h1> IV - 2. XGBoost </h1>

In [None]:
from sklearn.model_selection import cross_val_predict

def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50, plot = False):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=target.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], target, eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(target.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(target, dtrain_predprob))
    predicted_cv = cross_val_predict(alg, dtrain, target, cv = 5, n_jobs = -1)
    print('AUC Score (CV):',metrics.roc_auc_score(target, predicted_cv))
    
    if plot:
        importanceVisualisation(pd.Series(alg.booster().get_fscore()), predictors)

In [None]:
predictors = newtrain.columns

## FIRST MODEL

In [None]:
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=-1,
 scale_pos_weight=1,
 seed=27)
xgb1.fit(train_set, train_labels)
gbrpred = xgb1.predict(test_set)
df_output = pd.DataFrame()
df_output['sample_id'] = IDtest
df_output['is_listened'] = gbrpred
df_output[['sample_id','is_listened']].to_csv('./predictions/GBRoutput.csv', sep = ",", index=False)

## GRID SEARCH ON XGBOOST TO FIND THE BEST PARAMETERS (VERY LONG)

## Grid search on 'max_depth' and 'min_child_weight'

In [None]:
# param_test1 = {
#  'max_depth':np.array(range(3,10,2)),
#  'min_child_weight':np.array(range(1,6,2))
# }
# gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
#  min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
#  objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
#  param_grid = param_test1, scoring='roc_auc',n_jobs=-1,iid=False, cv=5)
# gsearch1.fit(newtrain[predictors],target)
# gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

## Finer grid search on 'max_depth' and 'min_child_weight'

In [None]:
# param_test2 = {
#  'max_depth':[2,3,4],
#  'min_child_weight':[2,3,4]
# }
# gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=5,
#  min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
#  objective= 'binary:logistic', nthread=-1, scale_pos_weight=1,seed=27), 
#  param_grid = param_test2, scoring='roc_auc',n_jobs=-1,iid=False, cv=5)
# gsearch2.fit(newtrain[predictors],target)
# gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

## Grid search on 'gamma'

In [None]:
# param_test3 = {
#  'gamma':[i/10.0 for i in range(0,5)]
# }
# gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=4,
#  min_child_weight=4, gamma=0, subsample=0.8, colsample_bytree=0.8,
#  objective= 'binary:logistic', nthread=-1, scale_pos_weight=1,seed=27), 
#  param_grid = param_test3, scoring='roc_auc',n_jobs=-1,iid=False, cv=5)
# gsearch3.fit(newtrain[predictors],target)
# gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

## Grid search on 'subsample' and 'colsample_bytree'

In [None]:
# param_test4 = { 
#  'subsample':[i/10.0 for i in range(6,11)],
#  'colsample_bytree':[i/10.0 for i in range(6,11)]
# }
# gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=4,
#  min_child_weight=4, gamma=0, subsample=0.8, colsample_bytree=0.8,
#  objective= 'binary:logistic', nthread=-1, scale_pos_weight=1,seed=27), 
#  param_grid = param_test4, scoring='roc_auc',n_jobs=-1,iid=False, cv=5)
# gsearch4.fit(newtrain[predictors],target)
# gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

## Grid search on 'reg_alpha'

In [None]:
# param_test6 = {
#  'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
# }
# gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=4,
#  min_child_weight=4, gamma=0, subsample=1.0, colsample_bytree=0.6,
#  objective= 'binary:logistic', nthread=-1, scale_pos_weight=1,seed=27), 
#  param_grid = param_test6, scoring='roc_auc',n_jobs=-1,iid=False, cv=5)
# gsearch6.fit(newtrain,target)
# gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

In [None]:
# param_test7 = {
#  'reg_alpha':[0.5,1,2,5,10]
# }
# gsearch7 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=4,
#  min_child_weight=4, gamma=0, subsample=1.0, colsample_bytree=0.6,
#  objective= 'binary:logistic', nthread=-1, scale_pos_weight=1,seed=27), 
#  param_grid = param_test7, scoring='roc_auc',n_jobs=-1,iid=False, cv=5)
# gsearch7.fit(newtrain,target)
# gsearch7.grid_scores_, gsearch7.best_params_, gsearch7.best_score_

## Grid search on 'learning_rate' and 'n_estimators'

In [None]:
param_test8 = {
 'learning_rate': [0.1,0.01,0.001],
 'n_estimators': [1000,5000,10000]
}
gsearch8 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=0, gamma=0.3, subsample=0.9, reg_alpha = 0.2, colsample_bytree=0.9,
 objective= 'binary:logistic', nthread=-1, scale_pos_weight=1,seed=40), 
 param_grid = param_test8, scoring='roc_auc',n_jobs=-1,iid=False, cv=5)
gsearch8.fit(newtrain,target)
gsearch8.grid_scores_, gsearch8.best_params_, gsearch8.best_score_

## FINAL MODEL

In [None]:
# XgbParams = {'learning_rate': 0.001, 'n_estimators':10000, 'max_depth':6, 'min_child_weight':3, 'gamma':0,
#             'subsample':0.9, 'colsample_bytree':0.6, 'reg_alpha':1e-5, 'objective': 'binary:logistic',
#              'nthread':-1, 'scale_pos_weight':1}

XgbParams = {'learning_rate': 0.001, 'n_estimators':1000, 'max_depth':5, 'min_child_weight':0, 'gamma':0.3,
            'subsample':0.9, 'colsample_bytree':0.9, 'reg_alpha':0.2, 'objective': 'binary:logistic',
             'nthread':-1, 'scale_pos_weight':1}

# XgbParams = {'learning_rate': 0.01, 'n_estimators':1000, 'max_depth':4, 'min_child_weight':4, 'gamma':0,
#             'subsample':1, 'colsample_bytree':0.6, 'reg_alpha':1, 'objective': 'binary:logistic',
#              'nthread':-1, 'scale_pos_weight':1}

In [None]:
xgb3 = XGBClassifier(**XgbParams, seed=27)
modelfit(xgb3, newtrain, predictors, plot = False)

In [None]:
xgb3 = XGBClassifier(**XgbParams, seed=40)
modelfit(xgb3, newtrain, predictors, plot = False)

<h1> VI. Final Prediction </h1> (We take the best xgboost model here)

In [None]:
gbrpred = gbr.predict(newtest)
df_output = pd.DataFrame()
df_output['sample_id'] = IDtest
df_output['is_listened'] = gbrpred
df_output[['sample_id','is_listened']].to_csv('./predictions/GBRoutput.csv', sep = ",", index=False)