# <center> PostProcess </center>
<center> Josh Wilkins <br> 10/13/2017 </center>

In [1]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
%matplotlib inline

import lightgbm as lgb

In [2]:
# Collect data, keep best columns

# Explicit dtypes for the formatted CSVs. The mapping lists more columns than
# are loaded below.
# np.float64 is used instead of the `np.float` alias, which was removed in
# NumPy 1.24 (it was just the builtin float, i.e. a 64-bit float).
data_types =\
    {'artist_name' : 'category',
     'cc' : 'category',
     'city' : 'category',
     'composer' : 'category',
     'days' : np.float64,
     'genre_ids': 'category',
     'language' : 'category',
     'msno' : 'category',
     'song_id' : 'category', 
     'song_length' : np.float64,
     'source_screen_name' : 'category',
     'source_system_tab' : 'category',
     'source_type' : 'category',
     'xxx' : 'category',
     'yyyy' : np.float64,
     'target' : np.uint8}

# Feature columns fed to the model.
cols = ['cc', 'xxx', 'lyricist_count', 'composer_count', 'genre_ids_count']

# The training cell uses train['target'] as the label and drops 'id' from the
# test frame before predicting, so both must be loaded alongside the features;
# with usecols=cols alone those lookups raise KeyError on a fresh kernel.
train = pd.read_csv('Data/TrainFormatted.csv', usecols=cols + ['target'], dtype=data_types)
test = pd.read_csv('Data/TestFormatted.csv', usecols=cols + ['id'], dtype=data_types)

In [8]:
# LGB
from sklearn.model_selection import KFold
import lightgbm as lgb

# Create a Cross Validation with 3 splits
nsplit = 3
kf = KFold(n_splits=nsplit, shuffle=False)

# Boosting rounds per fold. Stated once and passed explicitly to lgb.train:
# having 'num_rounds' in params while also passing a different count to
# lgb.train leaves two conflicting values (the logged iterations ran past 100,
# so 200 is the value that was actually in effect).
num_rounds = 200

# Parameters for LGBM. They are identical for every fold, so build them once.
params = {
    'objective': 'binary',
    'learning_rate': 0.1,
    'verbose': 0,
    'num_leaves': 150,
    'metric': 'auc',
}

# Separate features from the label once instead of re-dropping 'target' twice
# in every fold.
features = train.drop(columns=['target'])
labels = train['target']

# Score the test set on exactly the training feature columns, in the same
# order. This leaves out 'id' (and any other non-feature column) and guards
# against the two CSVs storing their columns in a different order.
test_features = test[features.columns]

# Running sum of the per-fold test predictions; averaged after the loop.
predictions = np.zeros(len(test_features))

# For each KFold
for fold, (train_indices, validate_indices) in enumerate(kf.split(features), start=1):
    # KFold yields positional indices, so select rows with .iloc; .loc would
    # only be correct while the frame happens to have a default RangeIndex.
    train_data = lgb.Dataset(features.iloc[train_indices], label=labels.iloc[train_indices])
    val_data = lgb.Dataset(features.iloc[validate_indices], label=labels.iloc[validate_indices])

    # Train the model; the held-out fold is only used to log the AUC.
    bst = lgb.train(params, train_data, num_boost_round=num_rounds, valid_sets=[val_data])

    # Accumulate this fold's predictions for the test set
    predictions += bst.predict(test_features)

    # Release the model from memory before the next fold; the model from the
    # last fold stays available as `bst`.
    if fold != nsplit:
        del bst

print('Training process finished. Generating Output...')

# Average the summed predictions over the number of folds.
predictions = predictions/nsplit

[1]	valid_0's auc: 0.762017
[2]	valid_0's auc: 0.767461
[3]	valid_0's auc: 0.770671
[4]	valid_0's auc: 0.772597
[5]	valid_0's auc: 0.774414
[6]	valid_0's auc: 0.775607
[7]	valid_0's auc: 0.776486
[8]	valid_0's auc: 0.778244
[9]	valid_0's auc: 0.778761
[10]	valid_0's auc: 0.779539
[11]	valid_0's auc: 0.78143
[12]	valid_0's auc: 0.782747
[13]	valid_0's auc: 0.783627
[14]	valid_0's auc: 0.784548
[15]	valid_0's auc: 0.785753
[16]	valid_0's auc: 0.787185
[17]	valid_0's auc: 0.787965
[18]	valid_0's auc: 0.788752
[19]	valid_0's auc: 0.789334
[20]	valid_0's auc: 0.789857
[21]	valid_0's auc: 0.790234
[22]	valid_0's auc: 0.790518
[23]	valid_0's auc: 0.790776
[24]	valid_0's auc: 0.790985
[25]	valid_0's auc: 0.791121
[26]	valid_0's auc: 0.791212
[27]	valid_0's auc: 0.791265
[28]	valid_0's auc: 0.791306
[29]	valid_0's auc: 0.791375
[30]	valid_0's auc: 0.79148
[31]	valid_0's auc: 0.791456
[32]	valid_0's auc: 0.791548
[33]	valid_0's auc: 0.791604
[34]	valid_0's auc: 0.791641
[35]	valid_0's auc: 0.791

[82]	valid_0's auc: 0.762999
[83]	valid_0's auc: 0.763012
[84]	valid_0's auc: 0.763022
[85]	valid_0's auc: 0.763028
[86]	valid_0's auc: 0.763043
[87]	valid_0's auc: 0.763062
[88]	valid_0's auc: 0.76309
[89]	valid_0's auc: 0.763114
[90]	valid_0's auc: 0.763128
[91]	valid_0's auc: 0.763126
[92]	valid_0's auc: 0.76313
[93]	valid_0's auc: 0.763142
[94]	valid_0's auc: 0.76314
[95]	valid_0's auc: 0.763151
[96]	valid_0's auc: 0.763187
[97]	valid_0's auc: 0.763193
[98]	valid_0's auc: 0.763231
[99]	valid_0's auc: 0.763233
[100]	valid_0's auc: 0.763258
[101]	valid_0's auc: 0.763272
[102]	valid_0's auc: 0.763276
[103]	valid_0's auc: 0.763277
[104]	valid_0's auc: 0.763301
[105]	valid_0's auc: 0.763324
[106]	valid_0's auc: 0.763333
[107]	valid_0's auc: 0.763324
[108]	valid_0's auc: 0.763329
[109]	valid_0's auc: 0.763329
[110]	valid_0's auc: 0.76334
[111]	valid_0's auc: 0.763352
[112]	valid_0's auc: 0.76335
[113]	valid_0's auc: 0.763352
[114]	valid_0's auc: 0.763358
[115]	valid_0's auc: 0.763368
[11

[161]	valid_0's auc: 0.732148
[162]	valid_0's auc: 0.732148
[163]	valid_0's auc: 0.732159
[164]	valid_0's auc: 0.732159
[165]	valid_0's auc: 0.732159
[166]	valid_0's auc: 0.73217
[167]	valid_0's auc: 0.732183
[168]	valid_0's auc: 0.7322
[169]	valid_0's auc: 0.732211
[170]	valid_0's auc: 0.732212
[171]	valid_0's auc: 0.732241
[172]	valid_0's auc: 0.73225
[173]	valid_0's auc: 0.732248
[174]	valid_0's auc: 0.73225
[175]	valid_0's auc: 0.732264
[176]	valid_0's auc: 0.732283
[177]	valid_0's auc: 0.732305
[178]	valid_0's auc: 0.732307
[179]	valid_0's auc: 0.732316
[180]	valid_0's auc: 0.732326
[181]	valid_0's auc: 0.732334
[182]	valid_0's auc: 0.73234
[183]	valid_0's auc: 0.732343
[184]	valid_0's auc: 0.732347
[185]	valid_0's auc: 0.732359
[186]	valid_0's auc: 0.732371
[187]	valid_0's auc: 0.732388
[188]	valid_0's auc: 0.7324
[189]	valid_0's auc: 0.732403
[190]	valid_0's auc: 0.732424
[191]	valid_0's auc: 0.732424
[192]	valid_0's auc: 0.732446
[193]	valid_0's auc: 0.73244
[194]	valid_0's auc

[41]	valid_0's auc: 0.68125
[42]	valid_0's auc: 0.681481
[43]	valid_0's auc: 0.681604
[44]	valid_0's auc: 0.681771
[45]	valid_0's auc: 0.681953
[46]	valid_0's auc: 0.682096
[47]	valid_0's auc: 0.682229
[48]	valid_0's auc: 0.682351
[49]	valid_0's auc: 0.682473
[50]	valid_0's auc: 0.682576
[51]	valid_0's auc: 0.682675
[52]	valid_0's auc: 0.682752
[53]	valid_0's auc: 0.682816
[54]	valid_0's auc: 0.682926
[55]	valid_0's auc: 0.683016
[56]	valid_0's auc: 0.683102
[57]	valid_0's auc: 0.683179
[58]	valid_0's auc: 0.683261
[59]	valid_0's auc: 0.683316
[60]	valid_0's auc: 0.683461
[61]	valid_0's auc: 0.683512
[62]	valid_0's auc: 0.683542
[63]	valid_0's auc: 0.6836
[64]	valid_0's auc: 0.683754
[65]	valid_0's auc: 0.683797
[66]	valid_0's auc: 0.683851
[67]	valid_0's auc: 0.683881
[68]	valid_0's auc: 0.684005
[69]	valid_0's auc: 0.684087
[70]	valid_0's auc: 0.684121
[71]	valid_0's auc: 0.684186
[72]	valid_0's auc: 0.684233
[73]	valid_0's auc: 0.684255
[74]	valid_0's auc: 0.684296
[75]	valid_0's au

In [9]:
# Create Submission File

# Read the sample_submission CSV
submission = pd.read_csv('Data/sample_submission.csv')

# Fail early, with a clear message, if the predictions do not line up
# row-for-row with the submission template.
if len(predictions) != len(submission):
    raise ValueError(
        'predictions has %d rows but the sample submission has %d'
        % (len(predictions), len(submission))
    )

# Set the target to our predictions. Bracket assignment always writes the
# column; attribute assignment (submission.target = ...) only does so when the
# column already exists and otherwise just sets a Python attribute that never
# reaches the CSV.
submission['target'] = predictions

# Save the submission file
submission.to_csv('submission.csv', index=False)

print('Output created.')

Output created.
