# Model tuning
- Interactive tuning
- Parameters selection
- Clear solution pipeline

In [1]:
# Modules import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn import linear_model, metrics, preprocessing

from time import time
from tqdm import tqdm

# Settings
# Widen the pandas display limits (rows, columns) used when rendering frames.
for option_name, limit in (('display.max_rows', 200), ('display.max_columns', 100)):
    pd.set_option(option_name, limit)
# Use seaborn's "white" style for any plots drawn later.
sns.set(style="white")

In [2]:
# Feature table used for validation; its 'index' column is dropped on load.
data_validation = (
    pd.read_csv('./results/dataset_for_validation.csv')
    .drop(columns='index')
)
# Feature table used to train and predict for the final submission.
data_full = pd.read_csv('./results/dataset_for_test.csv')
# Hold-out answers; the file is read without a header row, so columns are numbered 0, 1, ...
val_answers = pd.read_csv('./results/validation_answers.csv', header=None)

In [3]:
# Submission template; its 'item_cnt_month' column is overwritten with model
# predictions at the end of the notebook before being written back to disk.
submision_sample = pd.read_csv('./readonly/sample_submission.csv')

In [4]:
# Replace +inf in 'revenue_std' with the sentinel value -1 in both tables.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `frame['revenue_std']`: that form is chained
# assignment, and under pandas Copy-on-Write it only modifies a temporary
# Series, leaving the frame (and its inf values) untouched.
for frame in (data_validation, data_full):
    frame['revenue_std'] = frame['revenue_std'].replace({np.inf: -1})

In [5]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, SGDRegressor, SGDClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

from sklearn.metrics import mean_squared_error as mse

A few words about how the data is unpacked. The "date_block_num" value is used to separate the train and test rows; after the split it is turned into a feature of its own by converting it to a month-of-year number (`date_block_num % 12`).

Validation, methods comparison.

In [9]:
# Hold out month 33 for validation and train on every earlier month.
is_train_month = data_validation['date_block_num'] < 33
is_val_month = data_validation['date_block_num'] == 33

X_train = data_validation.loc[is_train_month].drop(columns='target')
X_test = data_validation.loc[is_val_month].drop(columns='target')
y_train = data_validation.loc[is_train_month, 'target']
# Column 1 of the header-less answers file.
# NOTE(review): assumes its rows are in the same order as X_test -- confirm.
y_test = val_answers[1]

# Turn the running month counter into a month-of-year feature (0-11).
X_train['date_block_num'] = X_train['date_block_num'] % 12
X_test['date_block_num'] = X_test['date_block_num'] % 12

**Basic lgbm**. It was very helpful for checking that the dataset was built correctly, without leakages and with the needed features. It can also be used to find out how many records to train on (training on the whole dataset led to overfitting).

In [58]:
# Number of training rows to sample.
n = 40000
# Draw `n` distinct row labels from the training frame. A dedicated, seeded
# generator makes the subsample -- and every score computed from it --
# reproducible after a kernel restart (the unseeded global generator did not).
# NOTE: despite its name, `mask` holds index labels for `.loc`, not booleans.
mask = np.random.RandomState(42).choice(X_train.index, n, replace=False)

In [69]:
# Fit a single LightGBM regressor on the sampled rows; the timer covers fit + scoring.
time_start = time()
lgbm = LGBMRegressor(n_estimators=100, n_jobs=8)
lgbm.fit(X_train.loc[mask], y_train[mask])

# Score on the hold-out month with predictions clipped to [0, 20].
# NOTE(review): NaNs are zero-filled only here at prediction time; the training
# rows are passed to `fit` as-is -- confirm this asymmetry is intended.
val_pred = lgbm.predict(X_test.fillna(0)).clip(0, 20)
print("mse is", mse(y_test, val_pred))
print("Time spent:", time() - time_start)

mse is 1.6504949148937924
Time spent: 6.488531589508057


In [70]:
# Same metric on the first 100000 training rows, as a train-side reference
# for the hold-out score above.
train_head_pred = lgbm.predict(X_train[:100000].fillna(0)).clip(0, 20)
print("mse is", mse(y_train[:100000], train_head_pred))

mse is 1.1890721863779208


In [71]:
# Pair every importance score with its column name and sort ascending
# (ties are broken by feature name), so the strongest features come last.
importance_pairs = sorted(zip(lgbm.feature_importances_, X_train.columns))
feature_imp = pd.DataFrame(importance_pairs, columns=['Value','Feature'])
# Show the ten most important features.
feature_imp.tail(10)

Unnamed: 0,Value,Feature
179,75,shop_id_freq_lag_1
180,78,item_category_id_freq_lag_1
181,78,revenue_lag_2
182,92,revenue_lag_1
183,98,revenue_std
184,98,target_std
185,103,item_id_freq_lag_1
186,123,target_lag_1
187,130,item_id_target_mean_lag_1
188,173,date_block_num


**Bagging model**.

In [11]:
class BaggingModel():
    """Average the predictions of several independently fitted regressors.

    Parameters
    ----------
    arguments : list
        Sequence of ``(kind, params)`` pairs. ``kind`` is ``'lgbm'``
        (``LGBMRegressor``) or ``'knn'`` (``KNeighborsRegressor``); ``params``
        is a dict of keyword arguments passed to that estimator's constructor.
    name : str
        Label printed in progress messages.
    verbose : bool, default False
        When true, print start/finish messages and wrap the model loop in a
        ``tqdm`` progress bar.

    Raises
    ------
    ValueError
        If a ``kind`` is not a supported name. Such entries used to be skipped
        silently, which shrank the bag and shifted the average unnoticed.
    """

    def __init__(self, arguments: list, name, verbose=False):
        self.bag_of_models = []
        self.name = name
        self.verbose = verbose

        for pair in arguments:
            self.bag_of_models.append(self._build(pair[0], pair[1]))

    @staticmethod
    def _build(kind, params):
        """Instantiate one base estimator from its short name and kwargs."""
        if kind == 'lgbm':
            return LGBMRegressor(**params)
        if kind == 'knn':
            return KNeighborsRegressor(**params)
        raise ValueError(
            "Unknown model kind {!r}; expected 'lgbm' or 'knn'".format(kind)
        )

    def _models(self):
        """Return the models to loop over, behind a progress bar when verbose."""
        return tqdm(self.bag_of_models) if self.verbose else self.bag_of_models

    def _log(self, message):
        """Print ``<name> <message>`` when verbose; otherwise stay silent."""
        if self.verbose:
            print(self.name, message)

    def predict(self, X_test):
        """Return the element-wise mean of all models' predictions for ``X_test``.

        Raises
        ------
        ValueError
            If the bag holds no models (the mean would be a 0/0 division).
        """
        if not self.bag_of_models:
            raise ValueError("BaggingModel has no models to predict with")

        self._log("is predicting...")
        answer = np.zeros(X_test.shape[0])
        for model in self._models():
            answer += model.predict(X_test)
        self._log("finished predicting")

        answer /= len(self.bag_of_models)
        return answer

    def fit(self, X_train, y_train):
        """Fit every model in the bag on the same ``(X_train, y_train)``.

        Returns ``self`` so calls can be chained, following the scikit-learn
        convention (the return value was previously ``None``).
        """
        self._log("is fitting...")
        for model in self._models():
            model.fit(X_train, y_train)
        self._log("finished fitting")
        return self

In [73]:
# Bag of three LightGBM regressors that differ in (n_estimators, num_leaves).
time_start = time()
lgbm_specs = [
    ('lgbm', {'n_estimators': n_trees, 'n_jobs': 8, 'random_state': 42, 'num_leaves': n_leaves})
    for n_trees, n_leaves in [(64, 64), (128, 32), (128, 16)]
]
model = BaggingModel(lgbm_specs, 'LGBM_Bagging', True)
model.fit(X_train.loc[mask], y_train[mask])

# NOTE(review): predictions are scored unclipped in this cell.
bagging_pred = model.predict(X_test.fillna(0))
print("mse is", mse(y_test, bagging_pred))
print("Time spent:", time() - time_start)

LGBM_Bagging is fitting...


100%|████████████████████████████████████████████| 3/3 [00:11<00:00,  3.67s/it]


LGBM_Bagging finished fitting
LGBM_Bagging is predicting...


100%|████████████████████████████████████████████| 3/3 [00:03<00:00,  1.30s/it]


LGBM_Bagging finished predicting
mse is 1.6380050015025853
Time spent: 15.728785514831543


**Stacking**.

In [12]:
class StackingModel():
    """Sequential stack of model levels.

    The first level is fed the raw features. Every later level is fed a matrix
    with one column per model of the previous level, holding that model's
    predictions. The last level must contain exactly one model; its
    predictions are the output of the stack.

    Note: in ``fit`` each level is trained on the previous level's predictions
    for the very rows those models were fitted on (no out-of-fold scheme), so
    upper levels see optimistic inputs.

    Parameters
    ----------
    name : str
        Label printed in progress messages.
    verbose : bool, default True
        When true (the previous, always-on behaviour), print start/finish
        messages and show a ``tqdm`` progress bar over the levels.
    """

    def __init__(self, name, verbose=True):
        self.name = name
        self.verbose = verbose
        self.levels = []

    def append(self, models: list):
        """Add a new top level made of ``models`` (a non-empty list)."""
        assert len(models) > 0, "Input is empty"
        self.levels.append(models)

    def _check_ready(self):
        """Validate the stack before fitting or predicting."""
        assert self.levels != [], "Model is empty"
        # The previous check (`assert self.levels[-1]`) only tested for a
        # non-empty last level, which `append` already guarantees. With several
        # final models, `reshape(-1, 1)` silently interleaved their outputs
        # into n_rows * n_models predictions.
        assert len(self.levels[-1]) == 1, "Model has wrong output"

    def _iter_levels(self):
        """Return the levels to loop over, behind a progress bar when verbose."""
        return tqdm(self.levels) if self.verbose else self.levels

    def _log(self, message):
        """Print ``<name> <message>`` when verbose; otherwise stay silent."""
        if self.verbose:
            print(self.name, message)

    @staticmethod
    def _level_predictions(level, features):
        """Stack each model's predictions for ``features`` as one column per model."""
        return np.concatenate(
            [model.predict(features).reshape(-1, 1) for model in level],
            axis=1,
        )

    def predict(self, X_test):
        """Run ``X_test`` through every level; return an ``(n_rows, 1)`` array."""
        self._check_ready()
        self._log("is predicting...")

        level_output = X_test
        for level in self._iter_levels():
            level_output = self._level_predictions(level, level_output)

        self._log("finished predicting")
        return level_output.reshape(-1, 1)

    def fit(self, X_train, y_train):
        """Fit the levels bottom-up against the same target ``y_train``.

        Returns ``self`` (previously ``None``) so calls can be chained.
        """
        self._check_ready()
        self._log("is fitting...")

        level_output = X_train
        last_depth = len(self.levels) - 1
        for depth, level in enumerate(self._iter_levels()):
            for model in level:
                model.fit(level_output, y_train)

            # Nothing consumes the final level's training predictions, so the
            # (potentially expensive) prediction pass is skipped for it.
            if depth < last_depth:
                level_output = self._level_predictions(level, level_output)

        self._log("finished fitting")
        return self

In [75]:
# Three-level stack: 3 LightGBM models -> 2 LightGBM models -> 1 kNN regressor.
time_start = time()
model = StackingModel("Stacking Machine")
for level in (
    [LGBMRegressor(n_jobs=8) for _ in range(3)],
    [LGBMRegressor(n_jobs=8) for _ in range(2)],
    [KNeighborsRegressor()],
):
    model.append(level)
model.fit(X_train.loc[mask], y_train[mask])

# NOTE(review): predictions are scored unclipped in this cell.
stacking_pred = model.predict(X_test.fillna(0))
print("mse is", mse(y_test, stacking_pred))
print("Time spent:", time() - time_start)

Stacking Machine is fitting...


100%|████████████████████████████████████████████| 3/3 [00:14<00:00,  4.74s/it]


Stacking Machine finished fitting
Stacking Machine is predicting...


100%|████████████████████████████████████████████| 3/3 [00:06<00:00,  2.04s/it]


Stacking Machine finished predicting
mse is 2.9252942461062728
Time spent: 21.404297590255737


**Solvator-2000**

In [21]:
# Number of training rows to sample for the stacked model.
n = 20000
# Draw `n` distinct row labels from the training frame. A dedicated, seeded
# generator makes the subsample -- and the score computed from it --
# reproducible after a kernel restart (the unseeded global generator did not).
# NOTE: despite its name, `mask` holds index labels for `.loc`, not booleans.
mask = np.random.RandomState(42).choice(X_train.index, n, replace=False)

In [22]:
time_start = time()

# Model building
# Level 1: a LightGBM bag and a kNN bag, each averaging three differently
# configured regressors.
bag1 = BaggingModel([('lgbm', {'n_estimators': 128, 'n_jobs': 8, 'random_state': 42, 'num_leaves': 64}), 
                       ('lgbm', {'n_estimators': 64, 'n_jobs': 8, 'random_state': 42, 'num_leaves': 32}),
                       ('lgbm', {'n_estimators': 64, 'n_jobs': 8, 'random_state': 42, 'num_leaves': 16})
                     ], 'LGBM_Bagging', False)
bag2 = BaggingModel([('knn', {'leaf_size': 32, 'n_neighbors': 5, 'n_jobs': 8}), 
                       ('knn', {'leaf_size': 32, 'n_neighbors': 10, 'n_jobs': 8}), 
                       ('knn', {'leaf_size': 64, 'n_neighbors': 5, 'n_jobs': 8})
                     ], 'kNN_Bagging', False)
model = StackingModel("Solvator-2000")
model.append([bag1, bag2])
# Level 2 works on the two bag outputs. SGDRegressor shuffles its training data,
# so it is seeded like every other stochastic estimator in this cell; without a
# seed the reported score changes from run to run.
model.append([SGDRegressor(random_state=42), RandomForestRegressor(n_estimators=64, n_jobs= 8, random_state=42)])
# Level 3: a single linear blender produces the final prediction.
model.append([LinearRegression()])

# Model fit
model.fit(X_train.loc[mask], y_train[mask])

# Score on the hold-out month with predictions clipped to [0, 20].
print("mse is", mse(y_test, model.predict(X_test.fillna(0)).clip(0, 20)))
print("Time spent:", time() - time_start)

Solvator-2000 is fitting...



  0%|                                                    | 0/3 [00:00<?, ?it/s]
 33%|██████████████▋                             | 1/3 [00:12<00:25, 12.66s/it]
100%|████████████████████████████████████████████| 3/3 [00:13<00:00,  4.34s/it]


Solvator-2000 finished fitting
Solvator-2000 is predicting...



  0%|                                                    | 0/3 [00:00<?, ?it/s]
 33%|██████████████▎                            | 1/3 [01:53<03:46, 113.23s/it]
100%|████████████████████████████████████████████| 3/3 [01:53<00:00, 37.82s/it]


Solvator-2000 finished predicting
mse is 2.0289469756902623
Time spent: 126.90055966377258


For the submission, plain lgbm looks better because of its balance between fit quality and overfitting.

In [8]:
# Train on every month before 34 and predict month 34 for the submission.
is_history = data_full['date_block_num'] < 34
is_target_month = data_full['date_block_num'] == 34

X_train_subm = data_full.loc[is_history].drop(columns='target')
X_test_subm = data_full.loc[is_target_month].drop(columns='target')
y_train_subm = data_full.loc[is_history, 'target']

# Turn the running month counter into a month-of-year feature (0-11).
X_train_subm['date_block_num'] = X_train_subm['date_block_num'] % 12
X_test_subm['date_block_num'] = X_test_subm['date_block_num'] % 12

In [9]:
# Number of training rows to sample for the submission model.
n = 40000
# Draw `n` distinct row labels from the submission training frame. A dedicated,
# seeded generator makes the subsample -- and the submission file built from
# it -- reproducible after a kernel restart.
# NOTE: despite its name, `mask` holds index labels for `.loc`, not booleans.
mask = np.random.RandomState(42).choice(X_train_subm.index, n, replace=False)

In [10]:
# Fit the submission model with the same LightGBM settings used in validation.
time_start = time()
lgbm = LGBMRegressor(n_estimators=100, n_jobs=8)
lgbm.fit(X_train_subm.loc[mask], y_train_subm[mask])

elapsed = time() - time_start
print("Time spent:", elapsed)

Time spent: 9.335937738418579


In [31]:
# Predict the submission month and clip to [0, 20], the same post-processing
# that was scored during validation. The previous `.apply(int)` step truncated
# every prediction toward zero before clipping (e.g. 0.9 -> 0), a downward bias
# that the validated pipeline never had.
# NOTE(review): assumes X_test_subm rows are in the same order as the rows of
# the sample submission -- confirm.
raw_pred = lgbm.predict(X_test_subm)
submision_sample['item_cnt_month'] = np.clip(raw_pred, 0, 20).astype('float32')

In [32]:
# Write the filled-in submission template to disk, without the DataFrame index column.
submision_sample.to_csv('./results/submission_lgbm.csv', index=False)