In [1]:
# (Re)load the autoreload extension so edited modules are picked up without a kernel restart.
%reload_ext autoreload
# Mode 2: reload all imported modules before each cell executes.
%autoreload 2
# Render matplotlib figures inside the notebook.
%matplotlib inline

In [42]:
from matplotlib import pyplot as plt
from matplotlib import rcParams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import lightgbm as lgb
import nltk
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GlobalMaxPooling1D, Conv1D, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
import warnings

In [3]:
# Notebook-wide plotting and display settings.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
# Use the fully-qualified option names: the bare 'max_columns' pattern is
# ambiguous on pandas versions that also define 'styler.render.max_columns'
# and raises OptionError there.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
# Suppress every warning for the rest of the session.
warnings.simplefilter('ignore')

In [6]:
# Every artefact (raw data, features, predictions, submissions) lives in ./data.
data_dir = feature_dir = val_dir = tst_dir = sub_dir = Path('data')

# Input files.
trn_file = data_dir.joinpath('train.csv')
tst_file = data_dir.joinpath('test_x.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

# Experiment constants.
target_col = 'author'  # label column of the training set
n_fold = 5             # number of cross-validation folds
n_class = 5            # width of the predicted probability matrices
seed = 42              # random seed shared by the CV splitter and the model

In [7]:
# Output files are named after the experiment: <algo>_<feature>.
# NOTE(review): algo_name is 'lr', but the model trained further down in this
# notebook is a LightGBM classifier — confirm the prefix so these files are not
# confused with (or overwritten by) real logistic-regression outputs.
algo_name = 'lr'
feature_name = 'tfidf'
model_name = '_'.join([algo_name, feature_name])

feature_file = feature_dir / (feature_name + '.csv')
p_val_file = val_dir / (model_name + '.val.csv')   # out-of-fold predictions
p_tst_file = tst_dir / (model_name + '.tst.csv')   # test-set predictions
sub_file = sub_dir / (model_name + '.csv')         # submission file


In [8]:
# Load the training set, using the first CSV column as the row index,
# then show its shape and first rows as a sanity check.
trn = pd.read_csv(trn_file, index_col=0)
print(trn.shape)
trn.head()

(54879, 2)


Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [10]:
# Load the test set (text only, no label column), using the first CSV column
# as the row index, then show its shape and first rows.
tst = pd.read_csv(tst_file, index_col=0)
print(tst.shape)
tst.head()

(19617, 1)


Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...


In [11]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem.snowball import SnowballStemmer

In [12]:
# Pick the training row labelled 4 as a running example for the NLP demos below.
s = trn.loc[4, 'text']
print(s)

“Have mercy, gentlemen!” odin flung up his hands. “Don’t write that, anyway; have some shame. Here I’ve torn my heart asunder before you, and you seize the opportunity and are fingering the wounds in both halves.... Oh, my God!”


In [17]:
# Fetch every NLTK resource this notebook relies on, so a fresh environment can
# run it top to bottom:
#   punkt / punkt_tab - models used by word_tokenize (newer NLTK releases look
#                       for punkt_tab instead of punkt)
#   stopwords         - stopwords.words('english') passed to the vectorizers
#   wordnet           - dictionary behind WordNetLemmatizer
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\anaconda3\envs\open source sw
[nltk_data]     class\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
# Split the example sentence into word and punctuation tokens with NLTK.
tokens = word_tokenize(s)
print(tokens)

['“', 'Have', 'mercy', ',', 'gentlemen', '!', '”', 'odin', 'flung', 'up', 'his', 'hands', '.', '“', 'Don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'Here', 'I', '’', 've', 'torn', 'my', 'heart', 'asunder', 'before', 'you', ',', 'and', 'you', 'seize', 'the', 'opportunity', 'and', 'are', 'fingering', 'the', 'wounds', 'in', 'both', 'halves', '....', 'Oh', ',', 'my', 'God', '!', '”']


In [19]:
# Lemmatize each token of the example sentence with the WordNet lemmatizer.
lemmatizer = WordNetLemmatizer()
list(map(lemmatizer.lemmatize, tokens))

['“',
 'Have',
 'mercy',
 ',',
 'gentleman',
 '!',
 '”',
 'odin',
 'flung',
 'up',
 'his',
 'hand',
 '.',
 '“',
 'Don',
 '’',
 't',
 'write',
 'that',
 ',',
 'anyway',
 ';',
 'have',
 'some',
 'shame',
 '.',
 'Here',
 'I',
 '’',
 've',
 'torn',
 'my',
 'heart',
 'asunder',
 'before',
 'you',
 ',',
 'and',
 'you',
 'seize',
 'the',
 'opportunity',
 'and',
 'are',
 'fingering',
 'the',
 'wound',
 'in',
 'both',
 'half',
 '....',
 'Oh',
 ',',
 'my',
 'God',
 '!',
 '”']

In [20]:
# Stem each token of the example sentence with the English Snowball stemmer.
stemmer = SnowballStemmer("english")
list(map(stemmer.stem, tokens))

['“',
 'have',
 'merci',
 ',',
 'gentlemen',
 '!',
 '”',
 'odin',
 'flung',
 'up',
 'his',
 'hand',
 '.',
 '“',
 'don',
 '’',
 't',
 'write',
 'that',
 ',',
 'anyway',
 ';',
 'have',
 'some',
 'shame',
 '.',
 'here',
 'i',
 '’',
 've',
 'torn',
 'my',
 'heart',
 'asund',
 'befor',
 'you',
 ',',
 'and',
 'you',
 'seiz',
 'the',
 'opportun',
 'and',
 'are',
 'finger',
 'the',
 'wound',
 'in',
 'both',
 'halv',
 '....',
 'oh',
 ',',
 'my',
 'god',
 '!',
 '”']

In [21]:
# Bag-of-words baseline: NLTK tokenization, English stop words removed,
# unigrams and bigrams, keeping only terms that occur in at least 100 documents.
vec = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 2),
    min_df=100,
)
X_cnt = vec.fit_transform(trn['text'])
print(X_cnt.shape)

(54879, 2685)


In [22]:
# Inspect the first 50 count features of the first document as a dense matrix.
X_cnt[0, :50].todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [36]:
# TF-IDF features: NLTK tokenization, English stop words removed, 1- to 3-grams,
# keeping only terms that occur in at least 50 documents. The vocabulary is
# fitted on the training text only and then reused to transform the test text.
vec = TfidfVectorizer(tokenizer=word_tokenize, stop_words=stopwords.words('english'), ngram_range=(1, 3), min_df=50)
X_trn = vec.fit_transform(trn['text'])
X_tst = vec.transform(tst['text'])
# Report the matrices built in this cell (train first, then test).
print(X_trn.shape, X_tst.shape)

(54879, 5897) (19617, 5897)


In [24]:
# Inspect the first 50 TF-IDF features of the first training document as a dense matrix.
X_trn[0, :50].todense()

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]])

In [25]:
# Stratified, shuffled K-fold splitter; seeded so the fold assignment is reproducible.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [26]:
# Target vector as a NumPy array; the column is taken from the `target_col`
# setting rather than being hard-coded, so the configuration stays authoritative.
y = trn[target_col].values
y.shape

(54879,)

In [52]:
# Cross-validated LightGBM training.
#   p_val - out-of-fold class probabilities for the training rows
#   p_tst - test-set class probabilities, averaged over the fold models
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(X_trn, y), 1):
    print(f'training model for CV #{i}')
    clf = lgb.LGBMClassifier(objective='multiclass',
                             n_estimators=1000,
                             num_leaves=64,
                             learning_rate=0.1,
                             min_child_samples=10,
                             subsample=0.7,
                             subsample_freq=1,
                             colsample_bytree=0.7,
                             random_state=seed,
                             n_jobs=-1)
    # Early stopping is requested through a callback: the `early_stopping_rounds`
    # keyword of fit() was deprecated and later removed from LightGBM, whereas
    # the callback form is accepted by both older and newer releases.
    clf.fit(X_trn[i_trn], y[i_trn],
            eval_set=[(X_trn[i_val], y[i_val])],
            eval_metric='multiclass',
            callbacks=[lgb.early_stopping(stopping_rounds=10)])

    # Store this fold's predictions for its held-out rows.
    p_val[i_val, :] = clf.predict_proba(X_trn[i_val])
    # Each fold model contributes 1/n_fold of the final test prediction.
    p_tst += clf.predict_proba(X_tst) / n_fold

training model for CV #1
[1]	valid_0's multi_logloss: 1.56224
Training until validation scores don't improve for 10 rounds
[2]	valid_0's multi_logloss: 1.5544
[3]	valid_0's multi_logloss: 1.54705
[4]	valid_0's multi_logloss: 1.53952
[5]	valid_0's multi_logloss: 1.53223
[6]	valid_0's multi_logloss: 1.52512
[7]	valid_0's multi_logloss: 1.51821
[8]	valid_0's multi_logloss: 1.51139
[9]	valid_0's multi_logloss: 1.50501
[10]	valid_0's multi_logloss: 1.49828
[11]	valid_0's multi_logloss: 1.49162
[12]	valid_0's multi_logloss: 1.48538
[13]	valid_0's multi_logloss: 1.47902
[14]	valid_0's multi_logloss: 1.47263
[15]	valid_0's multi_logloss: 1.46654
[16]	valid_0's multi_logloss: 1.46031
[17]	valid_0's multi_logloss: 1.45457
[18]	valid_0's multi_logloss: 1.44871
[19]	valid_0's multi_logloss: 1.44303
[20]	valid_0's multi_logloss: 1.43717
[21]	valid_0's multi_logloss: 1.43149
[22]	valid_0's multi_logloss: 1.42576
[23]	valid_0's multi_logloss: 1.42029
[24]	valid_0's multi_logloss: 1.41479
[25]	valid_0

[212]	valid_0's multi_logloss: 0.950297
[213]	valid_0's multi_logloss: 0.949106
[214]	valid_0's multi_logloss: 0.947962
[215]	valid_0's multi_logloss: 0.946826
[216]	valid_0's multi_logloss: 0.945719
[217]	valid_0's multi_logloss: 0.944561
[218]	valid_0's multi_logloss: 0.943426
[219]	valid_0's multi_logloss: 0.942306
[220]	valid_0's multi_logloss: 0.941194
[221]	valid_0's multi_logloss: 0.940037
[222]	valid_0's multi_logloss: 0.93902
[223]	valid_0's multi_logloss: 0.93797
[224]	valid_0's multi_logloss: 0.93696
[225]	valid_0's multi_logloss: 0.935959
[226]	valid_0's multi_logloss: 0.934848
[227]	valid_0's multi_logloss: 0.933791
[228]	valid_0's multi_logloss: 0.932729
[229]	valid_0's multi_logloss: 0.931694
[230]	valid_0's multi_logloss: 0.930623
[231]	valid_0's multi_logloss: 0.929604
[232]	valid_0's multi_logloss: 0.928633
[233]	valid_0's multi_logloss: 0.927626
[234]	valid_0's multi_logloss: 0.926594
[235]	valid_0's multi_logloss: 0.925615
[236]	valid_0's multi_logloss: 0.924561
[23

[418]	valid_0's multi_logloss: 0.792339
[419]	valid_0's multi_logloss: 0.791832
[420]	valid_0's multi_logloss: 0.791362
[421]	valid_0's multi_logloss: 0.790825
[422]	valid_0's multi_logloss: 0.790348
[423]	valid_0's multi_logloss: 0.789805
[424]	valid_0's multi_logloss: 0.789303
[425]	valid_0's multi_logloss: 0.788824
[426]	valid_0's multi_logloss: 0.788314
[427]	valid_0's multi_logloss: 0.787814
[428]	valid_0's multi_logloss: 0.787302
[429]	valid_0's multi_logloss: 0.786732
[430]	valid_0's multi_logloss: 0.786204
[431]	valid_0's multi_logloss: 0.785647
[432]	valid_0's multi_logloss: 0.785173
[433]	valid_0's multi_logloss: 0.784698
[434]	valid_0's multi_logloss: 0.784237
[435]	valid_0's multi_logloss: 0.783776
[436]	valid_0's multi_logloss: 0.783223
[437]	valid_0's multi_logloss: 0.78273
[438]	valid_0's multi_logloss: 0.782264
[439]	valid_0's multi_logloss: 0.781798
[440]	valid_0's multi_logloss: 0.781343
[441]	valid_0's multi_logloss: 0.780894
[442]	valid_0's multi_logloss: 0.780451
[

[624]	valid_0's multi_logloss: 0.713655
[625]	valid_0's multi_logloss: 0.713386
[626]	valid_0's multi_logloss: 0.713109
[627]	valid_0's multi_logloss: 0.712832
[628]	valid_0's multi_logloss: 0.712601
[629]	valid_0's multi_logloss: 0.712332
[630]	valid_0's multi_logloss: 0.712059
[631]	valid_0's multi_logloss: 0.711794
[632]	valid_0's multi_logloss: 0.711564
[633]	valid_0's multi_logloss: 0.711313
[634]	valid_0's multi_logloss: 0.711054
[635]	valid_0's multi_logloss: 0.710787
[636]	valid_0's multi_logloss: 0.710525
[637]	valid_0's multi_logloss: 0.710261
[638]	valid_0's multi_logloss: 0.709998
[639]	valid_0's multi_logloss: 0.709718
[640]	valid_0's multi_logloss: 0.709442
[641]	valid_0's multi_logloss: 0.709175
[642]	valid_0's multi_logloss: 0.708923
[643]	valid_0's multi_logloss: 0.708661
[644]	valid_0's multi_logloss: 0.70841
[645]	valid_0's multi_logloss: 0.708164
[646]	valid_0's multi_logloss: 0.707924
[647]	valid_0's multi_logloss: 0.707676
[648]	valid_0's multi_logloss: 0.707415
[

[830]	valid_0's multi_logloss: 0.671046
[831]	valid_0's multi_logloss: 0.67091
[832]	valid_0's multi_logloss: 0.670764
[833]	valid_0's multi_logloss: 0.670593
[834]	valid_0's multi_logloss: 0.670425
[835]	valid_0's multi_logloss: 0.670304
[836]	valid_0's multi_logloss: 0.670162
[837]	valid_0's multi_logloss: 0.670023
[838]	valid_0's multi_logloss: 0.66987
[839]	valid_0's multi_logloss: 0.669729
[840]	valid_0's multi_logloss: 0.669609
[841]	valid_0's multi_logloss: 0.669458
[842]	valid_0's multi_logloss: 0.669311
[843]	valid_0's multi_logloss: 0.66914
[844]	valid_0's multi_logloss: 0.668997
[845]	valid_0's multi_logloss: 0.668871
[846]	valid_0's multi_logloss: 0.668747
[847]	valid_0's multi_logloss: 0.668612
[848]	valid_0's multi_logloss: 0.668484
[849]	valid_0's multi_logloss: 0.668322
[850]	valid_0's multi_logloss: 0.668182
[851]	valid_0's multi_logloss: 0.668009
[852]	valid_0's multi_logloss: 0.667878
[853]	valid_0's multi_logloss: 0.667754
[854]	valid_0's multi_logloss: 0.66759
[855

[33]	valid_0's multi_logloss: 1.3716
[34]	valid_0's multi_logloss: 1.36687
[35]	valid_0's multi_logloss: 1.36235
[36]	valid_0's multi_logloss: 1.35796
[37]	valid_0's multi_logloss: 1.35337
[38]	valid_0's multi_logloss: 1.34891
[39]	valid_0's multi_logloss: 1.3444
[40]	valid_0's multi_logloss: 1.34007
[41]	valid_0's multi_logloss: 1.33585
[42]	valid_0's multi_logloss: 1.33176
[43]	valid_0's multi_logloss: 1.32732
[44]	valid_0's multi_logloss: 1.3233
[45]	valid_0's multi_logloss: 1.31921
[46]	valid_0's multi_logloss: 1.31503
[47]	valid_0's multi_logloss: 1.3111
[48]	valid_0's multi_logloss: 1.3072
[49]	valid_0's multi_logloss: 1.30325
[50]	valid_0's multi_logloss: 1.29935
[51]	valid_0's multi_logloss: 1.29539
[52]	valid_0's multi_logloss: 1.29155
[53]	valid_0's multi_logloss: 1.2878
[54]	valid_0's multi_logloss: 1.28406
[55]	valid_0's multi_logloss: 1.28027
[56]	valid_0's multi_logloss: 1.2766
[57]	valid_0's multi_logloss: 1.27285
[58]	valid_0's multi_logloss: 1.26905
[59]	valid_0's mult

[244]	valid_0's multi_logloss: 0.927103
[245]	valid_0's multi_logloss: 0.926146
[246]	valid_0's multi_logloss: 0.925204
[247]	valid_0's multi_logloss: 0.924259
[248]	valid_0's multi_logloss: 0.923386
[249]	valid_0's multi_logloss: 0.922347
[250]	valid_0's multi_logloss: 0.921462
[251]	valid_0's multi_logloss: 0.920566
[252]	valid_0's multi_logloss: 0.919604
[253]	valid_0's multi_logloss: 0.918697
[254]	valid_0's multi_logloss: 0.917758
[255]	valid_0's multi_logloss: 0.916877
[256]	valid_0's multi_logloss: 0.915927
[257]	valid_0's multi_logloss: 0.915019
[258]	valid_0's multi_logloss: 0.914111
[259]	valid_0's multi_logloss: 0.913187
[260]	valid_0's multi_logloss: 0.912321
[261]	valid_0's multi_logloss: 0.911424
[262]	valid_0's multi_logloss: 0.910467
[263]	valid_0's multi_logloss: 0.909567
[264]	valid_0's multi_logloss: 0.908659
[265]	valid_0's multi_logloss: 0.907865
[266]	valid_0's multi_logloss: 0.906933
[267]	valid_0's multi_logloss: 0.90604
[268]	valid_0's multi_logloss: 0.905137
[

[450]	valid_0's multi_logloss: 0.789092
[451]	valid_0's multi_logloss: 0.788674
[452]	valid_0's multi_logloss: 0.788238
[453]	valid_0's multi_logloss: 0.787797
[454]	valid_0's multi_logloss: 0.787364
[455]	valid_0's multi_logloss: 0.7869
[456]	valid_0's multi_logloss: 0.786413
[457]	valid_0's multi_logloss: 0.786005
[458]	valid_0's multi_logloss: 0.785568
[459]	valid_0's multi_logloss: 0.785111
[460]	valid_0's multi_logloss: 0.784631
[461]	valid_0's multi_logloss: 0.784243
[462]	valid_0's multi_logloss: 0.78377
[463]	valid_0's multi_logloss: 0.783258
[464]	valid_0's multi_logloss: 0.782843
[465]	valid_0's multi_logloss: 0.782416
[466]	valid_0's multi_logloss: 0.781953
[467]	valid_0's multi_logloss: 0.78149
[468]	valid_0's multi_logloss: 0.781057
[469]	valid_0's multi_logloss: 0.78058
[470]	valid_0's multi_logloss: 0.780101
[471]	valid_0's multi_logloss: 0.779686
[472]	valid_0's multi_logloss: 0.779278
[473]	valid_0's multi_logloss: 0.778823
[474]	valid_0's multi_logloss: 0.77841
[475]	

[656]	valid_0's multi_logloss: 0.718202
[657]	valid_0's multi_logloss: 0.717892
[658]	valid_0's multi_logloss: 0.717643
[659]	valid_0's multi_logloss: 0.717396
[660]	valid_0's multi_logloss: 0.717162
[661]	valid_0's multi_logloss: 0.716909
[662]	valid_0's multi_logloss: 0.716658
[663]	valid_0's multi_logloss: 0.716412
[664]	valid_0's multi_logloss: 0.716172
[665]	valid_0's multi_logloss: 0.71596
[666]	valid_0's multi_logloss: 0.715706
[667]	valid_0's multi_logloss: 0.715479
[668]	valid_0's multi_logloss: 0.715196
[669]	valid_0's multi_logloss: 0.714962
[670]	valid_0's multi_logloss: 0.714717
[671]	valid_0's multi_logloss: 0.714477
[672]	valid_0's multi_logloss: 0.714196
[673]	valid_0's multi_logloss: 0.713945
[674]	valid_0's multi_logloss: 0.713727
[675]	valid_0's multi_logloss: 0.713484
[676]	valid_0's multi_logloss: 0.713262
[677]	valid_0's multi_logloss: 0.713036
[678]	valid_0's multi_logloss: 0.712809
[679]	valid_0's multi_logloss: 0.712578
[680]	valid_0's multi_logloss: 0.712337
[

[862]	valid_0's multi_logloss: 0.67863
[863]	valid_0's multi_logloss: 0.678497
[864]	valid_0's multi_logloss: 0.678378
[865]	valid_0's multi_logloss: 0.678219
[866]	valid_0's multi_logloss: 0.678061
[867]	valid_0's multi_logloss: 0.677908
[868]	valid_0's multi_logloss: 0.677766
[869]	valid_0's multi_logloss: 0.677608
[870]	valid_0's multi_logloss: 0.677456
[871]	valid_0's multi_logloss: 0.677288
[872]	valid_0's multi_logloss: 0.677165
[873]	valid_0's multi_logloss: 0.677009
[874]	valid_0's multi_logloss: 0.676858
[875]	valid_0's multi_logloss: 0.676735
[876]	valid_0's multi_logloss: 0.676584
[877]	valid_0's multi_logloss: 0.676458
[878]	valid_0's multi_logloss: 0.676311
[879]	valid_0's multi_logloss: 0.676176
[880]	valid_0's multi_logloss: 0.676055
[881]	valid_0's multi_logloss: 0.675932
[882]	valid_0's multi_logloss: 0.675783
[883]	valid_0's multi_logloss: 0.675629
[884]	valid_0's multi_logloss: 0.67549
[885]	valid_0's multi_logloss: 0.675405
[886]	valid_0's multi_logloss: 0.675278
[8

[67]	valid_0's multi_logloss: 1.23459
[68]	valid_0's multi_logloss: 1.2312
[69]	valid_0's multi_logloss: 1.22783
[70]	valid_0's multi_logloss: 1.22461
[71]	valid_0's multi_logloss: 1.22132
[72]	valid_0's multi_logloss: 1.21824
[73]	valid_0's multi_logloss: 1.21535
[74]	valid_0's multi_logloss: 1.21219
[75]	valid_0's multi_logloss: 1.20912
[76]	valid_0's multi_logloss: 1.20599
[77]	valid_0's multi_logloss: 1.20306
[78]	valid_0's multi_logloss: 1.20015
[79]	valid_0's multi_logloss: 1.19727
[80]	valid_0's multi_logloss: 1.19437
[81]	valid_0's multi_logloss: 1.19136
[82]	valid_0's multi_logloss: 1.18853
[83]	valid_0's multi_logloss: 1.18559
[84]	valid_0's multi_logloss: 1.18251
[85]	valid_0's multi_logloss: 1.1798
[86]	valid_0's multi_logloss: 1.17702
[87]	valid_0's multi_logloss: 1.17431
[88]	valid_0's multi_logloss: 1.17146
[89]	valid_0's multi_logloss: 1.16883
[90]	valid_0's multi_logloss: 1.16611
[91]	valid_0's multi_logloss: 1.16351
[92]	valid_0's multi_logloss: 1.1608
[93]	valid_0's 

[276]	valid_0's multi_logloss: 0.891661
[277]	valid_0's multi_logloss: 0.890822
[278]	valid_0's multi_logloss: 0.890013
[279]	valid_0's multi_logloss: 0.889206
[280]	valid_0's multi_logloss: 0.888319
[281]	valid_0's multi_logloss: 0.887496
[282]	valid_0's multi_logloss: 0.886678
[283]	valid_0's multi_logloss: 0.885928
[284]	valid_0's multi_logloss: 0.885091
[285]	valid_0's multi_logloss: 0.884336
[286]	valid_0's multi_logloss: 0.883591
[287]	valid_0's multi_logloss: 0.88283
[288]	valid_0's multi_logloss: 0.881894
[289]	valid_0's multi_logloss: 0.88105
[290]	valid_0's multi_logloss: 0.880237
[291]	valid_0's multi_logloss: 0.879413
[292]	valid_0's multi_logloss: 0.878696
[293]	valid_0's multi_logloss: 0.877909
[294]	valid_0's multi_logloss: 0.87711
[295]	valid_0's multi_logloss: 0.876347
[296]	valid_0's multi_logloss: 0.875486
[297]	valid_0's multi_logloss: 0.874737
[298]	valid_0's multi_logloss: 0.873935
[299]	valid_0's multi_logloss: 0.873121
[300]	valid_0's multi_logloss: 0.872296
[30

[482]	valid_0's multi_logloss: 0.766493
[483]	valid_0's multi_logloss: 0.766045
[484]	valid_0's multi_logloss: 0.765614
[485]	valid_0's multi_logloss: 0.765226
[486]	valid_0's multi_logloss: 0.764825
[487]	valid_0's multi_logloss: 0.764386
[488]	valid_0's multi_logloss: 0.764016
[489]	valid_0's multi_logloss: 0.763638
[490]	valid_0's multi_logloss: 0.763233
[491]	valid_0's multi_logloss: 0.762795
[492]	valid_0's multi_logloss: 0.762337
[493]	valid_0's multi_logloss: 0.761894
[494]	valid_0's multi_logloss: 0.761437
[495]	valid_0's multi_logloss: 0.760988
[496]	valid_0's multi_logloss: 0.760613
[497]	valid_0's multi_logloss: 0.76022
[498]	valid_0's multi_logloss: 0.759757
[499]	valid_0's multi_logloss: 0.759331
[500]	valid_0's multi_logloss: 0.75895
[501]	valid_0's multi_logloss: 0.758485
[502]	valid_0's multi_logloss: 0.758071
[503]	valid_0's multi_logloss: 0.757671
[504]	valid_0's multi_logloss: 0.757252
[505]	valid_0's multi_logloss: 0.756855
[506]	valid_0's multi_logloss: 0.756501
[5

[688]	valid_0's multi_logloss: 0.701456
[689]	valid_0's multi_logloss: 0.701193
[690]	valid_0's multi_logloss: 0.700945
[691]	valid_0's multi_logloss: 0.700718
[692]	valid_0's multi_logloss: 0.70047
[693]	valid_0's multi_logloss: 0.700193
[694]	valid_0's multi_logloss: 0.699929
[695]	valid_0's multi_logloss: 0.699686
[696]	valid_0's multi_logloss: 0.699454
[697]	valid_0's multi_logloss: 0.699248
[698]	valid_0's multi_logloss: 0.699013
[699]	valid_0's multi_logloss: 0.698784
[700]	valid_0's multi_logloss: 0.698555
[701]	valid_0's multi_logloss: 0.698322
[702]	valid_0's multi_logloss: 0.698092
[703]	valid_0's multi_logloss: 0.69791
[704]	valid_0's multi_logloss: 0.697699
[705]	valid_0's multi_logloss: 0.697502
[706]	valid_0's multi_logloss: 0.697282
[707]	valid_0's multi_logloss: 0.697047
[708]	valid_0's multi_logloss: 0.696837
[709]	valid_0's multi_logloss: 0.696585
[710]	valid_0's multi_logloss: 0.696389
[711]	valid_0's multi_logloss: 0.696203
[712]	valid_0's multi_logloss: 0.696003
[7

[894]	valid_0's multi_logloss: 0.665267
[895]	valid_0's multi_logloss: 0.665139
[896]	valid_0's multi_logloss: 0.665007
[897]	valid_0's multi_logloss: 0.66489
[898]	valid_0's multi_logloss: 0.66479
[899]	valid_0's multi_logloss: 0.664671
[900]	valid_0's multi_logloss: 0.66457
[901]	valid_0's multi_logloss: 0.664432
[902]	valid_0's multi_logloss: 0.664316
[903]	valid_0's multi_logloss: 0.664167
[904]	valid_0's multi_logloss: 0.664054
[905]	valid_0's multi_logloss: 0.663925
[906]	valid_0's multi_logloss: 0.66379
[907]	valid_0's multi_logloss: 0.663686
[908]	valid_0's multi_logloss: 0.663542
[909]	valid_0's multi_logloss: 0.663434
[910]	valid_0's multi_logloss: 0.663316
[911]	valid_0's multi_logloss: 0.663217
[912]	valid_0's multi_logloss: 0.663078
[913]	valid_0's multi_logloss: 0.662969
[914]	valid_0's multi_logloss: 0.662845
[915]	valid_0's multi_logloss: 0.662734
[916]	valid_0's multi_logloss: 0.662626
[917]	valid_0's multi_logloss: 0.662508
[918]	valid_0's multi_logloss: 0.662404
[919

[101]	valid_0's multi_logloss: 1.13846
[102]	valid_0's multi_logloss: 1.13613
[103]	valid_0's multi_logloss: 1.13379
[104]	valid_0's multi_logloss: 1.13139
[105]	valid_0's multi_logloss: 1.12895
[106]	valid_0's multi_logloss: 1.12659
[107]	valid_0's multi_logloss: 1.12419
[108]	valid_0's multi_logloss: 1.12187
[109]	valid_0's multi_logloss: 1.11967
[110]	valid_0's multi_logloss: 1.11748
[111]	valid_0's multi_logloss: 1.11529
[112]	valid_0's multi_logloss: 1.11302
[113]	valid_0's multi_logloss: 1.11086
[114]	valid_0's multi_logloss: 1.10863
[115]	valid_0's multi_logloss: 1.10651
[116]	valid_0's multi_logloss: 1.10448
[117]	valid_0's multi_logloss: 1.10235
[118]	valid_0's multi_logloss: 1.10023
[119]	valid_0's multi_logloss: 1.09808
[120]	valid_0's multi_logloss: 1.09602
[121]	valid_0's multi_logloss: 1.09399
[122]	valid_0's multi_logloss: 1.09193
[123]	valid_0's multi_logloss: 1.08987
[124]	valid_0's multi_logloss: 1.08785
[125]	valid_0's multi_logloss: 1.08579
[126]	valid_0's multi_log

[309]	valid_0's multi_logloss: 0.86575
[310]	valid_0's multi_logloss: 0.864978
[311]	valid_0's multi_logloss: 0.864162
[312]	valid_0's multi_logloss: 0.863371
[313]	valid_0's multi_logloss: 0.862639
[314]	valid_0's multi_logloss: 0.861921
[315]	valid_0's multi_logloss: 0.861148
[316]	valid_0's multi_logloss: 0.860347
[317]	valid_0's multi_logloss: 0.859608
[318]	valid_0's multi_logloss: 0.858859
[319]	valid_0's multi_logloss: 0.85814
[320]	valid_0's multi_logloss: 0.857373
[321]	valid_0's multi_logloss: 0.856688
[322]	valid_0's multi_logloss: 0.855911
[323]	valid_0's multi_logloss: 0.855169
[324]	valid_0's multi_logloss: 0.854429
[325]	valid_0's multi_logloss: 0.853815
[326]	valid_0's multi_logloss: 0.853067
[327]	valid_0's multi_logloss: 0.852303
[328]	valid_0's multi_logloss: 0.851589
[329]	valid_0's multi_logloss: 0.850828
[330]	valid_0's multi_logloss: 0.850113
[331]	valid_0's multi_logloss: 0.849378
[332]	valid_0's multi_logloss: 0.848733
[333]	valid_0's multi_logloss: 0.848075
[3

[515]	valid_0's multi_logloss: 0.754419
[516]	valid_0's multi_logloss: 0.75404
[517]	valid_0's multi_logloss: 0.753669
[518]	valid_0's multi_logloss: 0.753269
[519]	valid_0's multi_logloss: 0.75292
[520]	valid_0's multi_logloss: 0.752526
[521]	valid_0's multi_logloss: 0.752167
[522]	valid_0's multi_logloss: 0.751765
[523]	valid_0's multi_logloss: 0.751377
[524]	valid_0's multi_logloss: 0.751045
[525]	valid_0's multi_logloss: 0.750684
[526]	valid_0's multi_logloss: 0.750313
[527]	valid_0's multi_logloss: 0.749926
[528]	valid_0's multi_logloss: 0.749543
[529]	valid_0's multi_logloss: 0.749178
[530]	valid_0's multi_logloss: 0.748842
[531]	valid_0's multi_logloss: 0.748455
[532]	valid_0's multi_logloss: 0.748053
[533]	valid_0's multi_logloss: 0.747675
[534]	valid_0's multi_logloss: 0.747326
[535]	valid_0's multi_logloss: 0.746974
[536]	valid_0's multi_logloss: 0.746622
[537]	valid_0's multi_logloss: 0.746228
[538]	valid_0's multi_logloss: 0.745851
[539]	valid_0's multi_logloss: 0.745502
[5

[721]	valid_0's multi_logloss: 0.69687
[722]	valid_0's multi_logloss: 0.696652
[723]	valid_0's multi_logloss: 0.69642
[724]	valid_0's multi_logloss: 0.696243
[725]	valid_0's multi_logloss: 0.696059
[726]	valid_0's multi_logloss: 0.695817
[727]	valid_0's multi_logloss: 0.695589
[728]	valid_0's multi_logloss: 0.695398
[729]	valid_0's multi_logloss: 0.695207
[730]	valid_0's multi_logloss: 0.695011
[731]	valid_0's multi_logloss: 0.69482
[732]	valid_0's multi_logloss: 0.694636
[733]	valid_0's multi_logloss: 0.694429
[734]	valid_0's multi_logloss: 0.694237
[735]	valid_0's multi_logloss: 0.694023
[736]	valid_0's multi_logloss: 0.693837
[737]	valid_0's multi_logloss: 0.693622
[738]	valid_0's multi_logloss: 0.693471
[739]	valid_0's multi_logloss: 0.693242
[740]	valid_0's multi_logloss: 0.693034
[741]	valid_0's multi_logloss: 0.692818
[742]	valid_0's multi_logloss: 0.692657
[743]	valid_0's multi_logloss: 0.692484
[744]	valid_0's multi_logloss: 0.692301
[745]	valid_0's multi_logloss: 0.692098
[74

[927]	valid_0's multi_logloss: 0.663919
[928]	valid_0's multi_logloss: 0.663795
[929]	valid_0's multi_logloss: 0.663663
[930]	valid_0's multi_logloss: 0.663533
[931]	valid_0's multi_logloss: 0.663407
[932]	valid_0's multi_logloss: 0.663293
[933]	valid_0's multi_logloss: 0.663169
[934]	valid_0's multi_logloss: 0.663052
[935]	valid_0's multi_logloss: 0.662908
[936]	valid_0's multi_logloss: 0.662788
[937]	valid_0's multi_logloss: 0.662663
[938]	valid_0's multi_logloss: 0.662565
[939]	valid_0's multi_logloss: 0.662459
[940]	valid_0's multi_logloss: 0.662347
[941]	valid_0's multi_logloss: 0.662233
[942]	valid_0's multi_logloss: 0.662104
[943]	valid_0's multi_logloss: 0.661994
[944]	valid_0's multi_logloss: 0.661878
[945]	valid_0's multi_logloss: 0.661751
[946]	valid_0's multi_logloss: 0.661667
[947]	valid_0's multi_logloss: 0.661577
[948]	valid_0's multi_logloss: 0.661456
[949]	valid_0's multi_logloss: 0.661346
[950]	valid_0's multi_logloss: 0.661215
[951]	valid_0's multi_logloss: 0.661118


[134]	valid_0's multi_logloss: 1.05994
[135]	valid_0's multi_logloss: 1.05798
[136]	valid_0's multi_logloss: 1.05606
[137]	valid_0's multi_logloss: 1.05427
[138]	valid_0's multi_logloss: 1.05241
[139]	valid_0's multi_logloss: 1.05062
[140]	valid_0's multi_logloss: 1.04867
[141]	valid_0's multi_logloss: 1.04682
[142]	valid_0's multi_logloss: 1.04503
[143]	valid_0's multi_logloss: 1.04329
[144]	valid_0's multi_logloss: 1.04151
[145]	valid_0's multi_logloss: 1.03959
[146]	valid_0's multi_logloss: 1.03785
[147]	valid_0's multi_logloss: 1.03605
[148]	valid_0's multi_logloss: 1.03425
[149]	valid_0's multi_logloss: 1.03256
[150]	valid_0's multi_logloss: 1.03087
[151]	valid_0's multi_logloss: 1.02916
[152]	valid_0's multi_logloss: 1.02751
[153]	valid_0's multi_logloss: 1.02595
[154]	valid_0's multi_logloss: 1.02416
[155]	valid_0's multi_logloss: 1.0225
[156]	valid_0's multi_logloss: 1.02078
[157]	valid_0's multi_logloss: 1.01912
[158]	valid_0's multi_logloss: 1.01757
[159]	valid_0's multi_logl

[341]	valid_0's multi_logloss: 0.831324
[342]	valid_0's multi_logloss: 0.830655
[343]	valid_0's multi_logloss: 0.829998
[344]	valid_0's multi_logloss: 0.829301
[345]	valid_0's multi_logloss: 0.828624
[346]	valid_0's multi_logloss: 0.827973
[347]	valid_0's multi_logloss: 0.827295
[348]	valid_0's multi_logloss: 0.826662
[349]	valid_0's multi_logloss: 0.826026
[350]	valid_0's multi_logloss: 0.825396
[351]	valid_0's multi_logloss: 0.824728
[352]	valid_0's multi_logloss: 0.824047
[353]	valid_0's multi_logloss: 0.823381
[354]	valid_0's multi_logloss: 0.822696
[355]	valid_0's multi_logloss: 0.822087
[356]	valid_0's multi_logloss: 0.821456
[357]	valid_0's multi_logloss: 0.820832
[358]	valid_0's multi_logloss: 0.820235
[359]	valid_0's multi_logloss: 0.819591
[360]	valid_0's multi_logloss: 0.818991
[361]	valid_0's multi_logloss: 0.81836
[362]	valid_0's multi_logloss: 0.817699
[363]	valid_0's multi_logloss: 0.817079
[364]	valid_0's multi_logloss: 0.816434
[365]	valid_0's multi_logloss: 0.815834
[

[547]	valid_0's multi_logloss: 0.730933
[548]	valid_0's multi_logloss: 0.730598
[549]	valid_0's multi_logloss: 0.730229
[550]	valid_0's multi_logloss: 0.729866
[551]	valid_0's multi_logloss: 0.729572
[552]	valid_0's multi_logloss: 0.729238
[553]	valid_0's multi_logloss: 0.728919
[554]	valid_0's multi_logloss: 0.728586
[555]	valid_0's multi_logloss: 0.728319
[556]	valid_0's multi_logloss: 0.727947
[557]	valid_0's multi_logloss: 0.72761
[558]	valid_0's multi_logloss: 0.727284
[559]	valid_0's multi_logloss: 0.726934
[560]	valid_0's multi_logloss: 0.726566
[561]	valid_0's multi_logloss: 0.726264
[562]	valid_0's multi_logloss: 0.725958
[563]	valid_0's multi_logloss: 0.725643
[564]	valid_0's multi_logloss: 0.725334
[565]	valid_0's multi_logloss: 0.724965
[566]	valid_0's multi_logloss: 0.724648
[567]	valid_0's multi_logloss: 0.724343
[568]	valid_0's multi_logloss: 0.724026
[569]	valid_0's multi_logloss: 0.723719
[570]	valid_0's multi_logloss: 0.723363
[571]	valid_0's multi_logloss: 0.723086
[

[753]	valid_0's multi_logloss: 0.678355
[754]	valid_0's multi_logloss: 0.678169
[755]	valid_0's multi_logloss: 0.677982
[756]	valid_0's multi_logloss: 0.677805
[757]	valid_0's multi_logloss: 0.67759
[758]	valid_0's multi_logloss: 0.677414
[759]	valid_0's multi_logloss: 0.677206
[760]	valid_0's multi_logloss: 0.677023
[761]	valid_0's multi_logloss: 0.676838
[762]	valid_0's multi_logloss: 0.676665
[763]	valid_0's multi_logloss: 0.676489
[764]	valid_0's multi_logloss: 0.676277
[765]	valid_0's multi_logloss: 0.676107
[766]	valid_0's multi_logloss: 0.675931
[767]	valid_0's multi_logloss: 0.675744
[768]	valid_0's multi_logloss: 0.675552
[769]	valid_0's multi_logloss: 0.675383
[770]	valid_0's multi_logloss: 0.675193
[771]	valid_0's multi_logloss: 0.674998
[772]	valid_0's multi_logloss: 0.674816
[773]	valid_0's multi_logloss: 0.674601
[774]	valid_0's multi_logloss: 0.674434
[775]	valid_0's multi_logloss: 0.674264
[776]	valid_0's multi_logloss: 0.674079
[777]	valid_0's multi_logloss: 0.673891
[

[959]	valid_0's multi_logloss: 0.648156
[960]	valid_0's multi_logloss: 0.648042
[961]	valid_0's multi_logloss: 0.647918
[962]	valid_0's multi_logloss: 0.647809
[963]	valid_0's multi_logloss: 0.647666
[964]	valid_0's multi_logloss: 0.64756
[965]	valid_0's multi_logloss: 0.647431
[966]	valid_0's multi_logloss: 0.647323
[967]	valid_0's multi_logloss: 0.64723
[968]	valid_0's multi_logloss: 0.647131
[969]	valid_0's multi_logloss: 0.64703
[970]	valid_0's multi_logloss: 0.646926
[971]	valid_0's multi_logloss: 0.646812
[972]	valid_0's multi_logloss: 0.646706
[973]	valid_0's multi_logloss: 0.646605
[974]	valid_0's multi_logloss: 0.6465
[975]	valid_0's multi_logloss: 0.646392
[976]	valid_0's multi_logloss: 0.646293
[977]	valid_0's multi_logloss: 0.646191
[978]	valid_0's multi_logloss: 0.646088
[979]	valid_0's multi_logloss: 0.645969
[980]	valid_0's multi_logloss: 0.64581
[981]	valid_0's multi_logloss: 0.645704
[982]	valid_0's multi_logloss: 0.645578
[983]	valid_0's multi_logloss: 0.645492
[984]	

In [53]:
# Score the out-of-fold predictions: accuracy of the most probable class and
# multiclass log loss against the one-hot encoded labels.
cv_accuracy = accuracy_score(y, p_val.argmax(axis=1))
cv_logloss = log_loss(pd.get_dummies(y), p_val)
print(f'Accuracy (CV): {cv_accuracy * 100:8.4f}%')
print(f'Log Loss (CV): {cv_logloss:8.4f}')

Accuracy (CV):  76.1439%
Log Loss (CV):   0.6528


In [54]:
# Persist the out-of-fold and test probabilities as header-less CSV files
# with six decimals per value.
np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

In [55]:
# Load the sample submission as a template (first CSV column becomes the index),
# then show its shape and first rows.
sub = pd.read_csv(sample_file, index_col=0)
print(sub.shape)
sub.head()

(19617, 5)


Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [56]:
# Overwrite every column of the template with the fold-averaged test probabilities.
# NOTE(review): the assignment is positional — it assumes sub's rows and columns
# are in the same order as p_tst's rows and class columns; confirm.
sub[sub.columns] = p_tst
sub.head()

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.046,0.7333,0.1628,0.0472,0.0108
1,0.0268,0.8369,0.0208,0.0371,0.0784
2,0.9395,0.0151,0.0183,0.0069,0.0201
3,0.0188,0.0285,0.9049,0.0327,0.0151
4,0.4065,0.1308,0.2014,0.1216,0.1398


In [57]:
# Write the submission (index included) to the experiment's submission file.
sub.to_csv(sub_file)