Otto Grop product classification project from [kaggle competition](https://www.kaggle.com/c/otto-group-product-classification-challenge/)

## Preprocessing

In [1]:
import lightgbm

In [2]:
import pandas as pd
import numpy as np


In [3]:
train = pd.read_csv('data/train.csv', index_col = 'id')
X_test = pd.read_csv('data/test.csv', index_col = 'id')

print(train.shape, X_test.shape)

(61878, 94) (144368, 93)


In [4]:
train.head()

Unnamed: 0_level_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Class_1
4,1,0,0,1,6,1,5,0,0,1,...,0,1,2,0,0,0,0,0,0,Class_1
5,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [5]:
X = train.iloc[:, :-1]
y = train.iloc[:, -1]

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 1)

## Fitting and Hyperparameter tuning

In [7]:
from lightgbm import LGBMClassifier
model = LGBMClassifier()
model.fit(X_train, y_train)

LGBMClassifier()

In [10]:
from lightgbm import LGBMClassifier

random_state = 42

model = LGBMClassifier(random_state=random_state)
model.fit(X_train, y_train)

LGBMClassifier(random_state=42)

In [11]:
y_pred = model.predict(X_val)
(y_val == y_pred).mean()

0.801147382029735

### Coarse Search

In [25]:
from datetime import datetime
random_state = 42

today = str(datetime.now())
filename = f'coarse-search {today}.csv'

num_loop = 100
n_estimators = 1000

early_stopping_rounds = 20
coarse_hyperparameters_list = []

for loop in range(num_loop):
    num_leaves = np.random.randint(10, 200)
    min_child_samples = np.random.randint(2, 200)
    subsample = np.random.uniform(0.1, 1.0)
    colsample_bytree = np.random.uniform(0.1, 1.0)
    learning_rate = 10 ** -np.random.uniform(low = 1, high = 10)
    reg_alpha = 10 ** -np.random.uniform(low = 1, high = 10)
    reg_lambda = 10 **  -np.random.uniform(low = 1, high = 15)
    
    parameters = {'loop': loop,
                 'num_leaves': num_leaves,
                 'min_child_samples': min_child_samples,
                 'subsample': subsample,
                 'colsample_bytree': colsample_bytree,
                 'reg_alpha' : reg_alpha,
                 'reg_lambda': reg_lambda,
                 'n_estimators': n_estimators,
                 'learning_rate': learning_rate,
                 'random_state': random_state}
    model = LGBMClassifier(**parameters)
    
    model.fit(X_train, y_train, 
             eval_set = [(X_val, y_val)],
             verbose = 0,
             early_stopping_rounds = early_stopping_rounds)
    parameters['n_estimators'] = model.best_iteration_
    parameters['score'] = model.best_score_['valid_0']['multi_logloss']
    
    print(f"{loop:2} best iteration = {parameters['n_estimators']}, Score = {parameters['score']:.6f}")
    coarse_hyperparameters_list.append(parameters)
    coarse_data = pd.DataFrame(coarse_hyperparameters_list).sort_values(by = 'score')
    coarse_data.to_csv(filename)
coarse_data.head(10)
    

 0 best iteration = 1000, Score = 1.959302
 1 best iteration = 1000, Score = 0.477120
 2 best iteration = 1000, Score = 1.447886
 3 best iteration = 1000, Score = 0.563794
 4 best iteration = 1000, Score = 0.476520
 5 best iteration = 1000, Score = 1.958634
 6 best iteration = 1000, Score = 1.959206
 7 best iteration = 1000, Score = 1.959175
 8 best iteration = 1000, Score = 1.959362
 9 best iteration = 1000, Score = 1.955063
10 best iteration = 1000, Score = 1.959364
11 best iteration = 1000, Score = 1.959269
12 best iteration = 1000, Score = 1.954499
13 best iteration = 1000, Score = 1.957079
14 best iteration = 1000, Score = 1.163607
15 best iteration = 1000, Score = 1.959349
16 best iteration = 1000, Score = 1.940409
17 best iteration = 1000, Score = 1.959367
18 best iteration = 1000, Score = 1.959016
19 best iteration = 1000, Score = 1.959366
20 best iteration = 999, Score = 0.466578
21 best iteration = 1000, Score = 1.939771
22 best iteration = 1000, Score = 1.959296
23 best iter

Unnamed: 0,colsample_bytree,learning_rate,loop,min_child_samples,n_estimators,num_leaves,random_state,reg_alpha,reg_lambda,score,subsample
97,0.839381,0.035627,97,155,471,115,42,9.167532e-05,0.08905643,0.464225,0.751067
93,0.685732,0.029846,93,105,686,77,42,1.064016e-09,0.00558039,0.464596,0.875661
20,0.323569,0.022379,20,198,999,133,42,2.022127e-06,5.76219e-06,0.466578,0.759019
96,0.945986,0.042999,96,140,331,167,42,0.07695343,7.120265e-13,0.467238,0.369918
91,0.481572,0.010885,91,36,1000,154,42,2.530934e-05,1.836402e-12,0.467917,0.995576
51,0.766205,0.025733,51,11,753,72,42,0.00160353,3.822898e-07,0.471233,0.701068
4,0.976127,0.013967,4,28,1000,90,42,8.323264e-10,2.418164e-15,0.47652,0.102678
26,0.204935,0.083572,26,61,290,150,42,1.202683e-06,5.551291e-08,0.47712,0.633842
1,0.896863,0.01261,1,48,1000,77,42,1.016945e-07,0.004000593,0.47712,0.70441
23,0.791783,0.014387,23,30,1000,53,42,7.246879e-09,8.115956e-10,0.482146,0.382464


## Finer Search

In [26]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
today = str(datetime.now())
filename = f'finer-search {today}.csv'
num_loop = 100
n_estimators = 1000

kf = StratifiedKFold(n_splits=5, random_state = random_state, shuffle=False)
finer_hyperparameters_list = []

for loop in range(num_loop):
    num_leaves = np.random.randint(100, 200) # 10 -> 100
    min_child_samples = np.random.randint(50, 100) # 2 -> 50
    subsample = np.random.uniform(0.4, 0.9) # (0.1, 1.0) -> (0.4, 0.9)
    colsample_bytree = np.random.uniform(0.4, 0.9) # (0.1, 1.0) -> (0.4, 0.9)
    learning_rate = 10 ** -np.random.uniform(low = 0.9, high = 3) # (1, 10) -> (0.9, 3)
    reg_alpha = 10 ** -np.random.uniform(low = 3, high = 8) # (1, 10) -> (3, 8)
    reg_lambda = 10 **  -np.random.uniform(low = 5, high = 15) # (1, 15) -> (5, 15)
    
    parameters = {'loop': loop,
                 'num_leaves': num_leaves,
                 'min_child_samples': min_child_samples,
                 'subsample': subsample,
                 'colsample_bytree': colsample_bytree,
                 'reg_alpha' : reg_alpha,
                 'reg_lambda': reg_lambda,
                 'n_estimators': n_estimators,
                 'learning_rate': learning_rate,
                 'random_state': random_state}
    
    fit_params = {'verbose': 0}
    model = LGBMClassifier(**parameters)
    
    score = cross_val_score(model, X, y, cv = kf, fit_params = fit_params, scoring = 'neg_log_loss').mean()
    score = -1.0 * score
    parameters['score'] = score
    
    
    print(f"{loop:2} best iteration = {parameters['n_estimators']}, Score = {parameters['score']:.6f}")
    finer_hyperparameters_list.append(parameters)
    finer_data = pd.DataFrame(finer_hyperparameters_list).sort_values(by = 'score')
    finer_data.to_csv(filename)
finer_data.head(10)
    



 0 best iteration = 1000, Score = 0.484933
 1 best iteration = 1000, Score = 0.750323
 2 best iteration = 1000, Score = 0.476805
 3 best iteration = 1000, Score = 0.604865
 4 best iteration = 1000, Score = 0.529964
 5 best iteration = 1000, Score = 0.451976
 6 best iteration = 1000, Score = 0.503133
 7 best iteration = 1000, Score = 0.465870
 8 best iteration = 1000, Score = 0.486997
 9 best iteration = 1000, Score = 0.515116
10 best iteration = 1000, Score = 0.490864
11 best iteration = 1000, Score = 0.452408
12 best iteration = 1000, Score = 0.662607
13 best iteration = 1000, Score = 0.562853
14 best iteration = 1000, Score = 0.730427
15 best iteration = 1000, Score = 0.475624
16 best iteration = 1000, Score = 0.517233
17 best iteration = 1000, Score = 0.479102
18 best iteration = 1000, Score = 0.505607
19 best iteration = 1000, Score = 0.510558
20 best iteration = 1000, Score = 0.813798
21 best iteration = 1000, Score = 0.459713
22 best iteration = 1000, Score = 0.499300
23 best ite

Unnamed: 0,colsample_bytree,learning_rate,loop,min_child_samples,n_estimators,num_leaves,random_state,reg_alpha,reg_lambda,score,subsample
97,0.666874,0.012523,97,70,1000,177,42,3.231114e-05,9.792592e-11,0.451595,0.662702
5,0.532131,0.011932,5,51,1000,163,42,0.0004062783,2.142414e-14,0.451976,0.492984
11,0.643945,0.011835,11,54,1000,168,42,6.314948e-07,1.413943e-07,0.452408,0.423878
60,0.472478,0.0207,60,54,1000,106,42,0.0004273285,1.771863e-12,0.452452,0.734406
71,0.597342,0.016636,71,71,1000,155,42,6.960306e-08,1.688497e-13,0.45318,0.778168
86,0.705918,0.014331,86,66,1000,122,42,2.053456e-07,3.094597e-07,0.453783,0.740094
91,0.496324,0.014816,91,89,1000,102,42,9.686146e-07,1.956898e-14,0.453834,0.607934
68,0.733351,0.011267,68,91,1000,142,42,2.311402e-05,1.606608e-06,0.455387,0.668473
73,0.84957,0.010322,73,87,1000,192,42,0.0003531752,1.615061e-15,0.45637,0.869767
46,0.804699,0.018601,46,62,1000,101,42,2.353404e-08,2.943259e-08,0.458568,0.488461


In [28]:
finer_data.iloc[1]

colsample_bytree     5.321307e-01
learning_rate        1.193170e-02
loop                 5.000000e+00
min_child_samples    5.100000e+01
n_estimators         1.000000e+03
num_leaves           1.630000e+02
random_state         4.200000e+01
reg_alpha            4.062783e-04
reg_lambda           2.142414e-14
score                4.519759e-01
subsample            4.929844e-01
Name: 5, dtype: float64

In [35]:
from lightgbm import LGBMClassifier

model = LGBMClassifier(colsample_bytree=     5.321307e-01,
                        learning_rate=        1.193170e-02,
                        loop=                 5,
                        min_child_samples=    51,
                        n_estimators=         1000,
                        num_leaves=           163,
                        reg_alpha=            4.062783e-04,
                        reg_lambda=           2.142414e-14,
                        subsample=            4.929844e-01,
                       random_state = random_state)


model

LGBMClassifier(colsample_bytree=0.5321307, learning_rate=0.0119317, loop=5,
               min_child_samples=51, n_estimators=1000, num_leaves=163,
               random_state=42, reg_alpha=0.0004062783, reg_lambda=2.142414e-14,
               subsample=0.4929844)

## Model Evaluation

In [45]:
kf = StratifiedKFold(n_splits = 5,
                        random_state = random_state,
                        shuffle = False)

fit_params = {'verbose' : 0}
score = cross_val_score(model, X_train, y_train, cv = kf, 
                       fit_params = fit_params, scoring = 'neg_log_loss').mean()

score = -1.0 * score
print(f"Score = {score:.5f}")

Score = 0.46208


## Predict and Submission

In [46]:
model.fit(X, y)

LGBMClassifier(colsample_bytree=0.5321307, learning_rate=0.0119317, loop=5,
               min_child_samples=51, n_estimators=1000, num_leaves=163,
               random_state=42, reg_alpha=0.0004062783, reg_lambda=2.142414e-14,
               subsample=0.4929844)

In [47]:
prediction = model.predict_proba(X_test)

In [48]:
submit = pd.read_csv('data/sampleSubmission.csv', index_col = 'id')
submit = pd.DataFrame(prediction, index = submit.index, columns = model.classes_)
print(submit.shape)
submit.head()

(144368, 9)


Unnamed: 0_level_0,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.000214,0.13332,0.130002,0.730854,6.5e-05,0.000905,0.003594,0.000775,0.000271
2,0.001494,0.016353,0.003402,0.000892,0.000349,0.35474,0.00115,0.620089,0.001531
3,1.5e-05,9.2e-05,7.8e-05,3.2e-05,5e-06,0.99899,2.4e-05,0.000739,2.7e-05
4,0.000272,0.625624,0.366435,0.003672,9.8e-05,0.000944,0.000402,0.000732,0.001822
5,0.13501,0.001768,0.002398,0.000611,0.000273,0.006154,0.00374,0.062923,0.787123


In [49]:
submit.to_csv('otto.csv')