# Model tuning
- Interactive tuning
- Parameters selection
- Clear solution pipeline

In [74]:
# Modules import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn import linear_model, metrics, preprocessing

from time import time
from tqdm import tqdm

# Settings
# Raise pandas' display limits to 200 rows and 100 columns.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)
# Use seaborn's "white" style for plots.
sns.set(style="white")

In [75]:
# Load the prepared feature tables and the hold-out answers.
# One helper column is dropped from each feature table right after reading.
data_validation = (
    pd.read_csv('./results/dataset_for_validation.csv')
    .drop(columns='index')
)
data_full = (
    pd.read_csv('./results/dataset_for_test.csv')
    .drop(columns='category_0')
)
# The answers file has no header row, so its columns are numbered 0, 1, ...
val_answers = pd.read_csv('./results/validation_answers.csv', header=None)

In [76]:
# Sample submission table; its 'item_cnt_month' column is overwritten with
# model predictions in the last cell of the notebook.
submision_sample = pd.read_csv('./readonly/sample_submission.csv')

In [77]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error as mse

A few words about how the data is split. We use the `date_block_num` value to separate the train rows from the test rows; after the split it is turned into a month-of-year feature by taking it modulo 12.

Validation, methods comparison.

In [78]:
# Blocks before 33 are used for fitting; block 33 is the hold-out month.
train_rows = data_validation['date_block_num'] < 33
holdout_rows = data_validation['date_block_num'] == 33

X_train = data_validation.loc[train_rows].drop(columns='target')
X_test = data_validation.loc[holdout_rows].drop(columns='target')
y_train = data_validation.loc[train_rows, 'target']
# Hold-out targets come from column 1 of the separate answers file.
y_test = val_answers[1]

# After the split, reduce the block number to a month-of-year feature.
X_train['date_block_num'] = X_train['date_block_num'] % 12
X_test['date_block_num'] = X_test['date_block_num'] % 12

**Basic lgbm**. It was very helpful for checking that the dataset was built correctly, without leakage and with the needed features.

In [79]:
time_start = time()

# Quick baseline: fit on the first 100000 training rows only.
# Alternative tried in this slot: KNeighborsRegressor(n_neighbors=20)
lgbm = LGBMRegressor(n_estimators=100, n_jobs=2)
lgbm.fit(X_train.iloc[:100000], y_train.iloc[:100000])

# Missing hold-out features are replaced with 0 before predicting.
holdout_pred = lgbm.predict(X_test.fillna(0))
print("mse is", mse(y_test, holdout_pred))
print("Time spent:", time() - time_start)

mse is 2.219623225828727
Time spent: 3.5814707279205322


In [80]:
# Error on the same 100000 rows the baseline was fitted on, for comparison
# with the hold-out error above.
train_sample_pred = lgbm.predict(X_train.iloc[:100000].fillna(0))
print("mse is", mse(y_train.iloc[:100000], train_sample_pred))

mse is 0.9915789390879018


In [81]:
# Pair every importance with its column name and sort ascending, so the most
# important features end up at the bottom of the table.
importance_pairs = sorted(zip(lgbm.feature_importances_, X_train.columns))
feature_imp = pd.DataFrame(importance_pairs, columns=['Value', 'Feature'])
feature_imp.tail(20)

**Bagging model**.

In [7]:
class BaggingModel():
    """Average the predictions of several independently fitted regressors.

    Parameters
    ----------
    arguments : list
        Sequence of ``(kind, params)`` pairs. ``kind`` is ``'lgbm'`` (builds an
        ``LGBMRegressor``) or ``'knn'`` (builds a ``KNeighborsRegressor``);
        ``params`` is a dict of keyword arguments passed to that constructor.
    name : str
        Label printed in the progress messages.
    verbose : bool, default False
        When true, print start/finish messages and wrap the model loop in tqdm.

    Raises
    ------
    ValueError
        If a pair names a model kind other than ``'lgbm'`` or ``'knn'``.
        Previously such entries were silently dropped from the bag.
    """

    def __init__(self, arguments: list, name, verbose=False):
        self.bag_of_models = []
        self.name = name
        self.verbose = verbose

        for pair in arguments:
            self.bag_of_models.append(self._build_model(pair[0], pair[1]))

    @staticmethod
    def _build_model(kind, params):
        """Instantiate one bag member from its kind label and keyword arguments."""
        if kind == 'lgbm':
            return LGBMRegressor(**params)
        if kind == 'knn':
            return KNeighborsRegressor(**params)
        raise ValueError(
            "Unknown model kind {!r}; expected 'lgbm' or 'knn'".format(kind)
        )

    def _models(self):
        """Return the bag, wrapped in a progress bar when verbose."""
        return tqdm(self.bag_of_models) if self.verbose else self.bag_of_models

    def _announce(self, message):
        """Print a progress message prefixed with the model name when verbose."""
        if self.verbose:
            print(self.name, message)

    def predict(self, X_test):
        """Return the element-wise mean of every member's prediction.

        Raises
        ------
        ValueError
            If the bag is empty (the mean would be 0/0, i.e. all NaN).
        """
        if not self.bag_of_models:
            raise ValueError("BaggingModel has no models to predict with")

        answer = np.zeros(X_test.shape[0])

        self._announce("is predicting...")
        for model in self._models():
            answer += model.predict(X_test)
        self._announce("finished predicting")

        answer /= len(self.bag_of_models)

        return answer

    def fit(self, X_train, y_train):
        """Fit every member on the same data and return ``self``."""
        self._announce("is fitting...")
        for model in self._models():
            model.fit(X_train, y_train)
        self._announce("finished fitting")

        return self

In [84]:
# Last ten rows of the importance table.
feature_imp.tail(10)

Unnamed: 0,Value,Feature
155,89,item_id_target_mean_lag_2
156,89,unique_items_sold_by_shop_lag_1
157,100,target_lag_2
158,117,shop_id_target_mean_lag_1
159,134,target_std
160,140,item_id_freq_lag_1
161,147,revenue_std
162,150,target_lag_1
163,178,revenue_lag_1
164,298,item_id_target_mean_lag_1


In [8]:
time_start = time()

# Three LightGBM variants that differ only in tree count and leaf budget.
lgbm_variants = [
    ('lgbm', dict(n_estimators=64, n_jobs=8, random_state=42, num_leaves=64)),
    ('lgbm', dict(n_estimators=128, n_jobs=8, random_state=42, num_leaves=32)),
    ('lgbm', dict(n_estimators=128, n_jobs=8, random_state=42, num_leaves=16)),
]
model = BaggingModel(lgbm_variants, 'LGBM_Bagging', True)
model.fit(X_train, y_train)

# Missing hold-out features are replaced with 0 before predicting.
holdout_pred = model.predict(X_test.fillna(0))
print("mse is", mse(y_test, holdout_pred))
print("Time spent:", time() - time_start)

LGBM_Bagging is fitting...


100%|████████████████████████████████████████████| 3/3 [01:00<00:00, 20.32s/it]


LGBM_Bagging finished fitting
LGBM_Bagging is predicting...


100%|████████████████████████████████████████████| 3/3 [00:00<00:00,  4.13it/s]


LGBM_Bagging finished predicting
mse is 4.284218361964238
Time spent: 61.938628911972046


**Stacking**.

In [49]:
# (rows, columns) of the full table that the submission split is taken from.
data_full.shape

(1265992, 211)

In [9]:
class StackingModel():
    """Chain of model levels where each level consumes the previous level's output.

    Level 0 receives the raw features. Every later level receives a 2-D array
    with one column per model of the level before it. The final level must
    contain exactly one model, whose prediction is the output of the stack.

    NOTE: during ``fit`` each level is trained on predictions that the previous
    level makes for the very rows it was fitted on (no out-of-fold scheme).

    Parameters
    ----------
    name : str
        Label printed in the progress messages.
    """

    def __init__(self, name):
        self.name = name
        self.levels = []

    def append(self, models: list):
        """Add a new last level made of ``models`` (objects with fit/predict)."""
        assert models != [], "Input is empty"
        self.levels.append(models)

    def _check_structure(self):
        """Assert the stack is non-empty and ends in a single-model level."""
        assert self.levels != [], "Model is empty"
        # The previous check, `assert self.levels[-1]`, could never fail because
        # append() rejects empty levels. What must hold is a single final model;
        # otherwise reshape(-1, 1) would interleave several models' outputs.
        assert len(self.levels[-1]) == 1, "Model has wrong output"

    @staticmethod
    def _level_predictions(level, features):
        """Stack each model's prediction for ``features`` as one column."""
        return np.concatenate([
            np.asarray(model.predict(features)).reshape(-1, 1) for model in level
        ], axis=1)

    def predict(self, X_test):
        """Run ``X_test`` through every level; returns an array of shape (n_rows, 1).

        Raises
        ------
        AssertionError
            If no level was appended or the final level is not a single model.
        """
        self._check_structure()

        print(self.name, "is predicting...")

        level_output = X_test
        for level in tqdm(self.levels):
            level_output = self._level_predictions(level, level_output)

        print(self.name, "finished predicting")

        return level_output.reshape(-1, 1)

    def fit(self, X_train, y_train):
        """Fit the levels in order against ``y_train`` and return ``self``.

        Raises
        ------
        AssertionError
            If no level was appended or the final level is not a single model.
        """
        self._check_structure()

        print(self.name, "is fitting...")

        level_output = X_train
        last_depth = len(self.levels) - 1
        for depth, level in enumerate(tqdm(self.levels)):
            for model in level:
                model.fit(level_output, y_train)

            # Nothing consumes the final level's training predictions, so skip
            # computing them (this can be expensive, e.g. for kNN).
            if depth < last_depth:
                level_output = self._level_predictions(level, level_output)

        print(self.name, "finished fitting")

        return self

In [10]:
time_start = time()

# Three levels of width 3 -> 2 -> 1; the single kNN model produces the output.
model = StackingModel("Stacking Machine")
model.append([LGBMRegressor(n_jobs=8) for _ in range(3)])
model.append([LGBMRegressor(n_jobs=8) for _ in range(2)])
model.append([KNeighborsRegressor()])
model.fit(X_train, y_train)

# Missing hold-out features are replaced with 0 before predicting.
holdout_pred = model.predict(X_test.fillna(0))
print("mse is", mse(y_test, holdout_pred))
print("Time spent:", time() - time_start)

Stacking Machine is fitting...


  0%|                                                    | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 

**Solvator-2000**

In [39]:
# Column 0: baseline LGBM predictions for the hold-out rows; column 1: the answers.
baseline_holdout_pred = lgbm.predict(X_test.fillna(0))
a = pd.DataFrame(baseline_holdout_pred.reshape(-1, 1))
a[1] = y_test

In [None]:
time_start = time()

# Model building
# Level 0: one bag of LightGBM variants and one bag of kNN variants.
bag1 = BaggingModel([('lgbm', {'n_estimators': 64, 'n_jobs': 8, 'random_state': 42, 'num_leaves': 64}),
                     ('lgbm', {'n_estimators': 128, 'n_jobs': 8, 'random_state': 42, 'num_leaves': 32}),
                     ('lgbm', {'n_estimators': 128, 'n_jobs': 8, 'random_state': 42, 'num_leaves': 16})
                    ], 'LGBM_Bagging', False)
bag2 = BaggingModel([('knn', {'leaf_size': 64, 'n_neighbors': 50, 'n_jobs': 8}),
                     ('knn', {'leaf_size': 32, 'n_neighbors': 100, 'n_jobs': 8}),
                     ('knn', {'leaf_size': 64, 'n_neighbors': 10, 'n_jobs': 8})
                    ], 'kNN_Bagging', False)
model = StackingModel("Solvator-2000")
model.append([bag1, bag2])
# SGDRegressor shuffles the training data, so it is seeded like the other
# estimators to make the score reproducible between runs.
model.append([SGDRegressor(random_state=42),
              RandomForestRegressor(n_estimators=128, n_jobs=8, random_state=42)])
model.append([LinearRegression()])

# Model fit
model.fit(X_train, y_train)

# Missing hold-out features are replaced with 0; predictions are clipped to [0, 20].
holdout_pred = model.predict(X_test.fillna(0)).clip(0, 20)
print("mse is", mse(y_test, holdout_pred))
print("Time spent:", time() - time_start)

In [43]:
# Largest value in column 0 of `a` (the baseline LGBM hold-out predictions).
a[0].max()

19.972041861393535

For submission we chose Solvator-2000 because of the cool name.

In [None]:
# Blocks before 34 are used for fitting; block 34 holds the rows to predict.
fit_rows = data_full['date_block_num'] < 34
predict_rows = data_full['date_block_num'] == 34

X_train_subm = data_full.loc[fit_rows].drop(columns='target')
X_test_subm = data_full.loc[predict_rows].drop(columns='target')
y_train_subm = data_full.loc[fit_rows, 'target']

# After the split, reduce the block number to a month-of-year feature.
X_train_subm['date_block_num'] = X_train_subm['date_block_num'] % 12
X_test_subm['date_block_num'] = X_test_subm['date_block_num'] % 12

In [None]:
time_start = time()

# Model building
# Level 0: one bag of LightGBM variants and one bag of kNN variants.
bag1 = BaggingModel([('lgbm', {'n_estimators': 64, 'n_jobs': 8, 'random_state': 42, 'num_leaves': 64}),
                     ('lgbm', {'n_estimators': 128, 'n_jobs': 8, 'random_state': 42, 'num_leaves': 32}),
                     ('lgbm', {'n_estimators': 128, 'n_jobs': 8, 'random_state': 42, 'num_leaves': 16})
                    ], 'LGBM_Bagging', False)
bag2 = BaggingModel([('knn', {'leaf_size': 64, 'n_neighbors': 50, 'n_jobs': 8}),
                     ('knn', {'leaf_size': 32, 'n_neighbors': 100, 'n_jobs': 8}),
                     ('knn', {'leaf_size': 64, 'n_neighbors': 10, 'n_jobs': 8})
                    ], 'kNN_Bagging', False)
model = StackingModel("Solvator-2000")
model.append([bag1, bag2])
# SGDRegressor shuffles the training data, so it is seeded like the other
# estimators to make the submission reproducible between runs.
model.append([SGDRegressor(random_state=42),
              RandomForestRegressor(n_estimators=128, n_jobs=8, random_state=42)])
model.append([LinearRegression()])

# Model fit
model.fit(X_train_subm, y_train_subm)

print("Time spent:", time() - time_start)

In [None]:
# Replace missing features with 0, as every validation prediction in this
# notebook does, so the submission uses the same preprocessing.
# Predictions are clipped to [0, 20].
submission_pred = model.predict(X_test_subm.fillna(0)).clip(0, 20)
# The prediction is flattened to 1-D before being stored as a single column.
submision_sample['item_cnt_month'] = np.ravel(submission_pred)
submision_sample.to_csv('./results/submission.csv', index=False)