<h1> I. Beginning </h1>

In [1]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')
# ---

%matplotlib inline
import pandas as pd
pd.options.display.max_columns = 100
from matplotlib import pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
import seaborn as sns
sns.set(color_codes=True)
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Performing grid search

pd.options.display.max_rows = 100



In [2]:
# Feature Importance
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

## Import Data

In [16]:
train = pd.read_csv('./data/train.csv', sep = ",")
test = pd.read_csv('./data/test.csv', sep = ",")

In [17]:
IDtest = test['sample_id']
test.drop('sample_id',inplace=True,axis=1)

In [18]:
test.head()

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age
0,50,1478104371,683078,82356,1,20021008,0,0,542,1,0,17698,2076,30
1,2744,1479317140,876497,99692,1,19851231,0,0,307,1,0,10525,26,28
2,2744,1479546361,876497,99692,1,19851231,0,0,307,1,0,8716,26,27
3,2744,1478457729,876500,99692,1,19851231,2,1,265,1,0,5443,26,30
4,2744,1480448560,876504,99692,1,19851231,2,1,356,1,0,7600,26,29


In [19]:
train.head()

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,is_listened
0,25471,1480597215,222606,41774,12,20040704,1,0,223,0,0,9241,55164,29,0
1,25571,1480544735,250467,43941,0,20060301,2,1,171,0,0,16547,55830,30,1
2,16,1479563953,305197,48078,1,20140714,2,1,149,1,1,7665,2704,29,1
3,7,1480152098,900502,71521,0,20001030,0,0,240,0,1,1580,938,30,0
4,7,1478368974,542335,71718,0,20080215,0,0,150,0,1,1812,2939,24,1


### Take out absurd data

In [20]:
train.drop(train.index[(train['media_duration'] <= 30)*(train['is_listened'] == 1)], axis = 0, inplace = True)

In [21]:
import time
import datetime
conv_date = lambda x: time.mktime(datetime.datetime.strptime(str(x), '%Y%m%d').timetuple())
train['release_date'] = train['release_date'].apply(conv_date)
test['release_date'] = test['release_date'].apply(conv_date)

In [22]:
train.drop(train.index[train['ts_listen'] > time.time()], axis = 0, inplace = True)
train.drop(train.index[train['ts_listen'] < 1230764491], axis = 0, inplace = True)
train.drop(train.index[train['release_date'] > time.time()], axis = 0, inplace = True)
# train.drop(train.index[train['ts_listen'] < train['release_date']], axis = 0, inplace = True) 28827 valeurs dans le train, 32 dans le test, presques toutes avec genre_id == 0

In [23]:
# not so bad anyways... - and none in the test
# len(train[train['ts_listen'] < 1230764491]) (= 631)

### Combined contains the common columns to the test and train set, in order to apply the same transformations on all the data

In [24]:
target = train['is_listened']
train.drop('is_listened', axis = 1, inplace = True)
combined = train.append(test)
combined.reset_index(inplace=True)
combined.drop('index', axis = 1, inplace = True)
train['is_listened'] = target

In [25]:
combined.head()

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age
0,25471,1480597215,222606,41774,12,1088892000.0,1,0,223,0,0,9241,55164,29
1,25571,1480544735,250467,43941,0,1141168000.0,2,1,171,0,0,16547,55830,30
2,16,1479563953,305197,48078,1,1405289000.0,2,1,149,1,1,7665,2704,29
3,7,1480152098,900502,71521,0,972860400.0,0,0,240,0,1,1580,938,30
4,7,1478368974,542335,71718,0,1203030000.0,0,0,150,0,1,1812,2939,24


In [26]:
# Took the data from "input.csv"

# How to process it?

# Genre_id, media_id, album_id, user_id, artist_id -> aggregate (e.g. count)
# Ts_listen, release_date: date under 2 different formats -> put to same format
# Context_type -> one-hot-encode
# Platform_name, platform_family -> one-hot encode? Aggregate?
# Media_duration -> this one seems simple, keep as is
# Listen_type -> probably keep as is, but not sure
# User_gender -> keep as is (sexism!)
# User_age -> keep as is

# Other ideas: compute mean length for an album, an artist, a genre, mean of is_listened for each user, each artist, etc using the date 
# Using the date, we can compute the number of songs he listened in a row

# I think the key here is correctly using the information about artist, etc...


<h1> II. Feature Preparation </h1>

In [27]:
# def medianNan(s):
#     global combined
#     combined[s].fillna(np.median(combined[~np.isnan(combined[s])][s]), inplace=True)

### Filling NaN values with the median of the column

In [28]:
# for s in combined.columns[1:]:
# #     print(s,type(combined[s][0]))
#     medianNan(s)

### Dealing with categoricals

In [29]:
categorical = ['platform_name', 'platform_family'] ## 'context_type' TOO BIG! (MemoryError)

In [30]:
def dummify(variable):
    global combined
    local_dummies = pd.get_dummies(combined[variable],prefix=variable)
    combined = pd.concat([combined,local_dummies],axis=1)
    combined.drop(variable,axis=1,inplace=True)

In [31]:
for s in categorical:
    dummify(s)

## DATA VISUALISATION

### Same user ID's in test and train

In [32]:
sorted(train['user_id'].unique()) == sorted(test['user_id'].unique())

True

### listen_type == 1 in test set (apart from one value)

In [33]:
sorted(test['listen_type'])[1]

1

<h1> III. Feature Engineering </h1>

## Genre_id, media_id, album_id, user_id, artist_id -> aggregate (e.g. count)

In [34]:
def aggregation_functions(name):
    
    global combined
    global train
    
    avrg = train.groupby([name + '_id'])['is_listened'].mean()
    avrg_flow = train[train['listen_type'] == 1].groupby([name + '_id'])['is_listened'].mean()
    avrg_noflow = train[train['listen_type'] == 0].groupby([name + '_id'])['is_listened'].mean()

    avrg.name = 'avrg_listened_' + name
    avrg_flow.name = 'avrg_listened_' + name + '_flow'
    avrg_noflow.name = 'avrg_listened_' + name + '_noflow'

    combined = combined.join(avrg, name + '_id', 'left')
    combined = combined.join(avrg_flow, name + '_id', 'left')
    combined = combined.join(avrg_noflow, name + '_id', 'left')

    combined['avrg_listened_' + name].fillna(np.median(combined[~np.isnan(combined['avrg_listened_' + name])]['avrg_listened_' + name]), inplace=True)
    
    combined['avrg_listened_' + name + '_flow'].fillna(combined['avrg_listened_' + name], inplace=True)
    combined['avrg_listened_' + name + '_noflow'].fillna(combined['avrg_listened_' + name], inplace=True)

    combined['avrg_listened_' + name + '_current'] = combined['avrg_listened_' + name + '_flow']*combined['listen_type'] + combined['avrg_listened_' + name + '_noflow']*(1-combined['listen_type'])

    count_ = combined.groupby([name + '_id'])[name + '_id'].count()
    count_.name = 'count_' + name
    combined = combined.join(count_, name + '_id', 'left')

### Mean of is_listened by ___ on flow or not

In [35]:
aggregation_functions('user')
aggregation_functions('artist')
aggregation_functions('media')
aggregation_functions('album')
aggregation_functions('genre')

### Compute age of song at the moment of listening (still some weird things on very few points...)

In [36]:
combined['age_song'] = combined['ts_listen'] - combined['release_date']

In [37]:
combined.head()

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,media_duration,listen_type,user_gender,user_id,artist_id,user_age,platform_name_0,platform_name_1,platform_name_2,platform_family_0,platform_family_1,platform_family_2,avrg_listened_user,avrg_listened_user_flow,avrg_listened_user_noflow,avrg_listened_user_current,count_user,avrg_listened_artist,avrg_listened_artist_flow,avrg_listened_artist_noflow,avrg_listened_artist_current,count_artist,avrg_listened_media,avrg_listened_media_flow,avrg_listened_media_noflow,avrg_listened_media_current,count_media,avrg_listened_album,avrg_listened_album_flow,avrg_listened_album_noflow,avrg_listened_album_current,count_album,avrg_listened_genre,avrg_listened_genre_flow,avrg_listened_genre_noflow,avrg_listened_genre_current,count_genre,age_song
0,25471,1480597215,222606,41774,12,1088892000.0,223,0,0,9241,55164,29,0.0,1.0,0.0,1.0,0.0,0.0,0.676991,0.654762,0.690141,0.690141,227,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,1,0.68812,0.596593,0.737953,0.737953,23025,391705215.0
1,25571,1480544735,250467,43941,0,1141168000.0,171,0,0,16547,55830,30,0.0,0.0,1.0,0.0,1.0,0.0,0.575,0.0,0.589744,0.589744,41,0.37931,0.714286,0.272727,0.272727,30,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,1,0.567614,0.548518,0.576582,0.576582,2325,339377135.0
2,16,1479563953,305197,48078,1,1405289000.0,149,1,1,7665,2704,29,0.0,0.0,1.0,0.0,1.0,0.0,0.986755,0.977273,0.990654,0.977273,303,0.78125,0.833333,0.775862,0.833333,128,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,1,0.743868,0.663128,0.779587,0.663128,4615,74275153.0
3,7,1480152098,900502,71521,0,972860400.0,240,0,1,1580,938,30,1.0,0.0,0.0,1.0,0.0,0.0,0.586977,0.544218,0.603073,0.603073,1076,0.714563,0.666667,0.722919,0.722919,1031,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,1,0.663092,0.601317,0.705924,0.705924,931953,507291698.0
4,7,1478368974,542335,71718,0,1203030000.0,150,0,1,1812,2939,24,1.0,0.0,0.0,1.0,0.0,0.0,0.933868,0.9375,0.933002,0.933002,999,0.806763,0.75,0.823899,0.823899,207,0.866667,0.785714,0.9375,0.9375,30,0.882353,0.8,0.947368,0.947368,34,0.663092,0.601317,0.705924,0.705924,931953,275338974.0


## Other ideas: compute mean length for an album, an artist, a genre, mean of is_listened for each user, each artist, etc using the date 

### Time since previous song (what to do with the first value?)

In [106]:
combined_sorted = combined[['ts_listen', 'user_id']].sort_values(['user_id', 'ts_listen'])

In [107]:
ttnxt = -(combined_sorted - combined_sorted.shift(-1))
ttnxt['ts_listen'].ix[ttnxt['user_id'] != 0] = np.nan
ttnxt['ts_listen'].fillna(int(np.max(ttnxt[~np.isnan(ttnxt['ts_listen'])]['ts_listen'])), inplace=True)

In [108]:
ttprv = combined_sorted - combined_sorted.shift(1)
ttprv['ts_listen'].ix[ttprv['user_id'] != 0] = np.nan
ttprv['ts_listen'].fillna(int(np.max(ttprv[~np.isnan(ttprv['ts_listen'])]['ts_listen'])), inplace=True)

In [109]:
combined_sorted['time_to_prev'] = ttprv['ts_listen']
combined_sorted['time_to_next'] = ttnxt['ts_listen']

In [111]:
combined_sorted.head()

Unnamed: 0,ts_listen,user_id,time_to_prev,time_to_next
682395,1477939775,0,248904905.0,42756.0
4454309,1477982531,0,42756.0,30.0
5353733,1477982561,0,30.0,21.0
5898585,1477982582,0,21.0,210.0
4715887,1477982792,0,210.0,186.0


In [112]:
combined = combined.join(combined_sorted[['time_to_prev','time_to_next']], None, 'left', 'l')

In [113]:
combined.head()

Unnamed: 0.1,Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,media_duration,listen_type,user_gender,user_id,artist_id,user_age,platform_name_0,platform_name_1,platform_name_2,platform_family_0,platform_family_1,platform_family_2,avrg_listened_user,avrg_listened_user_flow,avrg_listened_user_noflow,avrg_listened_user_current,count_user,avrg_listened_artist,avrg_listened_artist_flow,avrg_listened_artist_noflow,avrg_listened_artist_current,count_artist,avrg_listened_media,avrg_listened_media_flow,avrg_listened_media_noflow,avrg_listened_media_current,count_media,avrg_listened_album,avrg_listened_album_flow,avrg_listened_album_noflow,avrg_listened_album_current,count_album,avrg_listened_genre,avrg_listened_genre_flow,avrg_listened_genre_noflow,avrg_listened_genre_current,count_genre,age_song,time_to_prev,time_to_next
0,0,25471,1480597215,222606,41774,12,1088892000.0,223,0,0,9241,55164,29,0.0,1.0,0.0,1.0,0.0,0.0,0.676991,0.654762,0.690141,0.690141,227,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,1,0.68812,0.596593,0.737953,0.737953,23025,391705215.0,12.0,7.0
1,1,25571,1480544735,250467,43941,0,1141168000.0,171,0,0,16547,55830,30,0.0,0.0,1.0,0.0,1.0,0.0,0.575,0.0,0.589744,0.589744,41,0.37931,0.714286,0.272727,0.272727,30,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,1,0.567614,0.548518,0.576582,0.576582,2325,339377135.0,194.0,35.0
2,2,16,1479563953,305197,48078,1,1405289000.0,149,1,1,7665,2704,29,0.0,0.0,1.0,0.0,1.0,0.0,0.986755,0.977273,0.990654,0.977273,303,0.78125,0.833333,0.775862,0.833333,128,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,1,0.743868,0.663128,0.779587,0.663128,4615,74275153.0,33.0,151.0
3,3,7,1480152098,900502,71521,0,972860400.0,240,0,1,1580,938,30,1.0,0.0,0.0,1.0,0.0,0.0,0.586977,0.544218,0.603073,0.603073,1076,0.714563,0.666667,0.722919,0.722919,1031,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,1,0.663092,0.601317,0.705924,0.705924,931953,507291698.0,6.0,8.0
4,4,7,1478368974,542335,71718,0,1203030000.0,150,0,1,1812,2939,24,1.0,0.0,0.0,1.0,0.0,0.0,0.933868,0.9375,0.933002,0.933002,999,0.806763,0.75,0.823899,0.823899,207,0.866667,0.785714,0.9375,0.9375,30,0.882353,0.8,0.947368,0.947368,34,0.663092,0.601317,0.705924,0.705924,931953,275338974.0,26052.0,568.0


## CENTER DATA

In [114]:
combined.to_csv('temp_combined.csv')
target.to_csv('temp_target.csv')

In [3]:
combined = pd.read_csv('temp_combined.csv')
target = pd.read_csv('temp_target.csv', header = None)[1]

In [None]:
for _ in combined.columns:
    if np.std(combined[_][:len(target)]) != 0:
        combined[_] = (combined[_] - np.mean(combined[_][:len(target)]))/np.std(combined[_][:len(target)])

<h1> IV. Modeling </h1>

## Separate the modified train and test sets

In [None]:
newtrain = combined[:len(target)]
newtest = combined[len(target):]

## A function to visualize the importance of the features

In [None]:
def importanceVisualisation(feature_importance, predictors, firstN = 40):
    
    plt.rcParams["figure.figsize"] = [40,10]
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.subplot(1, 2, 2)
    plt.bar(pos[-firstN:], feature_importance[sorted_idx][-firstN:], align='center')
    plt.xticks(pos[-firstN:], predictors[sorted_idx][-firstN:], rotation='vertical')
    plt.ylabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()

<h1> IV - 1. Gradient Boosting </h1>

In [None]:
from sklearn import ensemble
params = {'n_estimators': 1000, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'huber', 'verbose':1}
gbr = ensemble.GradientBoostingRegressor(**params)

In [None]:
gbr.fit(newtrain, target)

In [None]:
importanceVisualisation(gbr.feature_importances_, newtrain.columns)

<h1> IV - 2. XGBoost </h1>

In [22]:
from sklearn.model_selection import cross_val_predict

def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50, plot = False):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=target.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], target, eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(target.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(target, dtrain_predprob))
    predicted_cv = cross_val_predict(alg, dtrain, target, cv = 5, n_jobs = -1)
    print('AUC Score (CV):',metrics.roc_auc_score(target, predicted_cv))
    
    if plot:
        importanceVisualisation(pd.Series(alg.booster().get_fscore()), predictors)

In [27]:
predictors = newtrain.columns

## FIRST MODEL

In [None]:
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=-1,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb1, newtrain, predictors)

## GRID SEARCH ON XGBOOST TO FIND THE BEST PARAMETERS (VERY LONG)

## Grid search on 'max_depth' and 'min_child_weight'

In [None]:
# param_test1 = {
#  'max_depth':np.array(range(3,10,2)),
#  'min_child_weight':np.array(range(1,6,2))
# }
# gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
#  min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
#  objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
#  param_grid = param_test1, scoring='roc_auc',n_jobs=-1,iid=False, cv=5)
# gsearch1.fit(newtrain[predictors],target)
# gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

## Finer grid search on 'max_depth' and 'min_child_weight'

In [None]:
# param_test2 = {
#  'max_depth':[2,3,4],
#  'min_child_weight':[2,3,4]
# }
# gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=5,
#  min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
#  objective= 'binary:logistic', nthread=-1, scale_pos_weight=1,seed=27), 
#  param_grid = param_test2, scoring='roc_auc',n_jobs=-1,iid=False, cv=5)
# gsearch2.fit(newtrain[predictors],target)
# gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

## Grid search on 'gamma'

In [None]:
# param_test3 = {
#  'gamma':[i/10.0 for i in range(0,5)]
# }
# gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=4,
#  min_child_weight=4, gamma=0, subsample=0.8, colsample_bytree=0.8,
#  objective= 'binary:logistic', nthread=-1, scale_pos_weight=1,seed=27), 
#  param_grid = param_test3, scoring='roc_auc',n_jobs=-1,iid=False, cv=5)
# gsearch3.fit(newtrain[predictors],target)
# gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

## Grid search on 'subsample' and 'colsample_bytree'

In [None]:
# param_test4 = { 
#  'subsample':[i/10.0 for i in range(6,11)],
#  'colsample_bytree':[i/10.0 for i in range(6,11)]
# }
# gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=4,
#  min_child_weight=4, gamma=0, subsample=0.8, colsample_bytree=0.8,
#  objective= 'binary:logistic', nthread=-1, scale_pos_weight=1,seed=27), 
#  param_grid = param_test4, scoring='roc_auc',n_jobs=-1,iid=False, cv=5)
# gsearch4.fit(newtrain[predictors],target)
# gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

## Grid search on 'reg_alpha'

In [None]:
# param_test6 = {
#  'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
# }
# gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=4,
#  min_child_weight=4, gamma=0, subsample=1.0, colsample_bytree=0.6,
#  objective= 'binary:logistic', nthread=-1, scale_pos_weight=1,seed=27), 
#  param_grid = param_test6, scoring='roc_auc',n_jobs=-1,iid=False, cv=5)
# gsearch6.fit(newtrain,target)
# gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

In [None]:
# param_test7 = {
#  'reg_alpha':[0.5,1,2,5,10]
# }
# gsearch7 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=4,
#  min_child_weight=4, gamma=0, subsample=1.0, colsample_bytree=0.6,
#  objective= 'binary:logistic', nthread=-1, scale_pos_weight=1,seed=27), 
#  param_grid = param_test7, scoring='roc_auc',n_jobs=-1,iid=False, cv=5)
# gsearch7.fit(newtrain,target)
# gsearch7.grid_scores_, gsearch7.best_params_, gsearch7.best_score_

## Grid search on 'learning_rate' and 'n_estimators'

In [209]:
param_test8 = {
 'learning_rate': [0.1,0.01,0.001],
 'n_estimators': [1000,5000,10000]
}
gsearch8 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=0, gamma=0.3, subsample=0.9, reg_alpha = 0.2, colsample_bytree=0.9,
 objective= 'binary:logistic', nthread=-1, scale_pos_weight=1,seed=40), 
 param_grid = param_test8, scoring='roc_auc',n_jobs=-1,iid=False, cv=5)
gsearch8.fit(newtrain,target)
gsearch8.grid_scores_, gsearch8.best_params_, gsearch8.best_score_

([mean: 0.80282, std: 0.00984, params: {'n_estimators': 1000, 'learning_rate': 0.1},
  mean: 0.80199, std: 0.01003, params: {'n_estimators': 5000, 'learning_rate': 0.1},
  mean: 0.80175, std: 0.01035, params: {'n_estimators': 10000, 'learning_rate': 0.1},
  mean: 0.81651, std: 0.00992, params: {'n_estimators': 1000, 'learning_rate': 0.01},
  mean: 0.80989, std: 0.00944, params: {'n_estimators': 5000, 'learning_rate': 0.01},
  mean: 0.80797, std: 0.00966, params: {'n_estimators': 10000, 'learning_rate': 0.01},
  mean: 0.81178, std: 0.00623, params: {'n_estimators': 1000, 'learning_rate': 0.001},
  mean: 0.81843, std: 0.00840, params: {'n_estimators': 5000, 'learning_rate': 0.001},
  mean: 0.81649, std: 0.00936, params: {'n_estimators': 10000, 'learning_rate': 0.001}],
 {'learning_rate': 0.001, 'n_estimators': 5000},
 0.8184288230374144)

## FINAL MODEL

In [28]:
# XgbParams = {'learning_rate': 0.001, 'n_estimators':10000, 'max_depth':6, 'min_child_weight':3, 'gamma':0,
#             'subsample':0.9, 'colsample_bytree':0.6, 'reg_alpha':1e-5, 'objective': 'binary:logistic',
#              'nthread':-1, 'scale_pos_weight':1}

XgbParams = {'learning_rate': 0.001, 'n_estimators':1000, 'max_depth':5, 'min_child_weight':0, 'gamma':0.3,
            'subsample':0.9, 'colsample_bytree':0.9, 'reg_alpha':0.2, 'objective': 'binary:logistic',
             'nthread':-1, 'scale_pos_weight':1}

# XgbParams = {'learning_rate': 0.01, 'n_estimators':1000, 'max_depth':4, 'min_child_weight':4, 'gamma':0,
#             'subsample':1, 'colsample_bytree':0.6, 'reg_alpha':1, 'objective': 'binary:logistic',
#              'nthread':-1, 'scale_pos_weight':1}

In [None]:
xgb3 = XGBClassifier(**XgbParams, seed=27)
modelfit(xgb3, newtrain, predictors, plot = False)

In [216]:
xgb3 = XGBClassifier(**XgbParams, seed=40)
modelfit(xgb3, newtrain, predictors, plot = False)


Model Report
Accuracy : 0.7944
AUC Score (Train): 0.885769
AUC Score (CV): 0.721147862905


<h1> VI. Final Prediction </h1> (We take the best xgboost model here)

In [41]:
gbrpred = gbr.predict(newtest)
df_output = pd.DataFrame()
df_output['sample_id'] = IDtest
df_output['is_listened'] = gbrpred
df_output[['sample_id','is_listened']].to_csv('./predictions/GBRoutput.csv', sep = ",", index=False)