# Model tuning
- Interactive tuning
- Parameters selection
- Clear solution pipeline

In [1]:
# Modules import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn import linear_model, metrics, preprocessing

from time import time
from tqdm import tqdm

# Settings
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)
sns.set(style="white")

In [2]:
data_validation = pd.read_csv('./results/dataset_for_validation.csv').drop('index', axis=1)
data_full = pd.read_csv('./results/dataset_for_test.csv')
val_answers = pd.read_csv('./results/validation_answers.csv', header=None)

In [3]:
submision_sample = pd.read_csv('./readonly/sample_submission.csv')

In [4]:
data_validation.replace({np.inf: -1}, inplace=True)
data_full.replace({np.inf: -1}, inplace=True)

In [5]:
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error as mse

Few words should be said about unpacking our data. To separate our test and train data we use "date_block_num" value, but then we should make a feature out of it by making a number of month out of it.

Validation, methods comparison.

In [11]:
X_train = data_validation[data_validation['date_block_num'] < 33].drop('target', axis=1)
X_test = data_validation[data_validation['date_block_num'] == 33].drop('target', axis=1)
y_train = data_validation[data_validation['date_block_num'] < 33]['target']
y_test = val_answers[1]

X_train['date_block_num'] %= 12
X_test['date_block_num'] %= 12

**Basic lgbm**. It was vey helpful to use it to find out if dataset was built correctly without leakages and with needed features. Also it can be used to understand number of records from data to train on (training on the whole thing lead to overfitting).

In [28]:
time_start = time()
lgbm = LGBMRegressor(n_estimators=1000, n_jobs=8)
lgbm.fit(X_train, y_train)

print("mse is", mse(y_test, lgbm.predict(X_test).clip(0, 20)))
print("Time spent:", time() - time_start)

mse is 2.4175045385523966
Time spent: 99.65704131126404


In [29]:
print("mse is", mse(y_train[:100000], lgbm.predict(X_train[:100000]).clip(0, 20)))

mse is 1.169130886067649


In [30]:
feature_imp = pd.DataFrame(sorted(zip(lgbm.feature_importances_, X_train.columns)), columns=['Value','Feature'])
feature_imp[-10:]

Unnamed: 0,Value,Feature
122,873,item_category_id_revenue_mean_lag_2
123,1033,item_id_revenue_mean_lag_2
124,1208,revenue_std
125,1280,revenue_lag_1
126,1360,target_std
127,1466,item_category_id_revenue_mean_lag_1
128,1526,target_lag_1
129,1576,date_block_num
130,1707,item_id_revenue_mean_lag_1
131,1746,item_id_target_mean_lag_1


**Bagging model**.

In [7]:
class BaggingModel():
    def __init__(self, arguments: list, name, verbose=False):
        self.bag_of_models = []
        self.name = name
        self.verbose=verbose
        
        for pair in arguments:
            if 'lgbm' == pair[0]:
                self.bag_of_models.append(LGBMRegressor(**pair[1]))

            if 'knn' == pair[0]:
                self.bag_of_models.append(KNeighborsRegressor(**pair[1]))
                
        
    def predict(self, X_test):
        answer = np.zeros(X_test.shape[0])
        
        if self.verbose:
            print(self.name, "is predicting...")
            for model in tqdm(self.bag_of_models):
                answer +=  model.predict(X_test)
            print(self.name, "finished predicting")
            
        else:
            for model in self.bag_of_models:
                answer +=  model.predict(X_test)
            
        answer /= len(self.bag_of_models)
        
        return answer
        
        
    def fit(self, X_train, y_train):    
        if self.verbose:
            print(self.name, "is fitting...")
            for model in tqdm(self.bag_of_models):
                model.fit(X_train, y_train)
            print(self.name, "finished fitting")
                
        else:
            for model in self.bag_of_models:
                model.fit(X_train, y_train) 

In [73]:
time_start = time()
model = BaggingModel([('lgbm', {'n_estimators': 64, 'n_jobs': 8, 'random_state': 42, 'num_leaves': 64}), 
                       ('lgbm', {'n_estimators': 128, 'n_jobs': 8, 'random_state': 42, 'num_leaves': 32}),
                       ('lgbm', {'n_estimators': 128, 'n_jobs': 8, 'random_state': 42, 'num_leaves': 16})
                     ], 'LGBM_Bagging', True)
model.fit(X_train.loc[mask], y_train[mask])

print("mse is", mse(y_test, model.predict(X_test.fillna(0))))
print("Time spent:", time() - time_start)

LGBM_Bagging is fitting...


100%|████████████████████████████████████████████| 3/3 [00:11<00:00,  3.67s/it]


LGBM_Bagging finished fitting
LGBM_Bagging is predicting...


100%|████████████████████████████████████████████| 3/3 [00:03<00:00,  1.30s/it]


LGBM_Bagging finished predicting
mse is 1.6380050015025853
Time spent: 15.728785514831543


**Stacking**.

In [6]:
class StackingModel():
    def __init__(self, name):
        self.name = name
        self.levels = []
        
    def append(self, models: list):
        assert models != [], "Input is empty"
        self.levels.append(models)
    
    def predict(self, X_test):
        assert self.levels != [], "Model is empty"
        assert self.levels[-1], "Model has wrong output"
        
        print(self.name, "is predicting...")
        
        level_output = X_test
        for level in tqdm(self.levels):
            level_output = np.concatenate([
                model.predict(level_output).reshape(-1, 1) for model in level
            ], axis=1)
        
        print(self.name, "finished predicting")
        
        return level_output.reshape(-1, 1)
        
    def fit(self, X_train, y_train):
       
        assert self.levels != [], "Model is empty"
        assert self.levels[-1], "Model has wrong output"
        
        print(self.name, "is fitting...")
        
        level_output = X_train
        for level in tqdm(self.levels):
            for model in level:
                model.fit(level_output, y_train)
            
            level_output = np.concatenate([
                model.predict(level_output).reshape(-1, 1) for model in level
            ], axis=1)

        print(self.name, "finished fitting")

In [75]:
time_start = time()
model = StackingModel("Stacking Machine")
model.append([LGBMRegressor(n_jobs=8), LGBMRegressor(n_jobs=8), LGBMRegressor(n_jobs=8)])
model.append([LGBMRegressor(n_jobs=8), LGBMRegressor(n_jobs=8)])
model.append([KNeighborsRegressor()])
model.fit(X_train.loc[mask], y_train[mask])

print("mse is", mse(y_test, model.predict(X_test.fillna(0))))
print("Time spent:", time() - time_start)

Stacking Machine is fitting...


100%|████████████████████████████████████████████| 3/3 [00:14<00:00,  4.74s/it]


Stacking Machine finished fitting
Stacking Machine is predicting...


100%|████████████████████████████████████████████| 3/3 [00:06<00:00,  2.04s/it]


Stacking Machine finished predicting
mse is 2.9252942461062728
Time spent: 21.404297590255737


**Solvator-2000**

In [12]:
time_start = time()

# Model building
lgbm_vr = VotingRegressor([
    ('lg1', LGBMRegressor(n_jobs=8, n_estimators=1024, random_state=42, num_leaves=16)), 
    ('lg2', LGBMRegressor(n_jobs=8, n_estimators=512, random_state=16, num_leaves=16)), 
    ('lg3', LGBMRegressor(n_jobs=8, n_estimators=256, random_state=2, num_leaves=16))])

model = StackingModel("Solvator-2000")

model.append([lgbm_vr, XGBRegressor(max_depth=8, n_estimators=1024,
    min_child_weight=300, colsample_bytree=0.8, subsample=0.8, eta=0.3)])
model.append([LGBMRegressor(n_jobs=8, n_estimators=512, random_state=42)])

# Model fit
model.fit(X_train, y_train)

print("mse is", mse(y_test, model.predict(X_test).clip(0, 20)))
print("Time spent:", time() - time_start)

Solvator-2000 is fitting...


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




KeyboardInterrupt: 

In [None]:
print("mse is", mse(y_train[:100000], model.predict(X_train[:100000].fillna(0)).clip(0, 20)))

In [None]:
print("mse is", mse(y_train[mask], model.predict(X_train.loc[mask].fillna(0)).clip(0, 20)))

For submission lgbm looks better because of it's balance with overfitting.

In [None]:
X_train_subm = data_full[data_full['date_block_num'] < 34].drop('target', axis=1)
X_test_subm = data_full[data_full['date_block_num'] == 34].drop('target', axis=1)
y_train_subm = data_full[data_full['date_block_num'] < 34]['target']

X_train_subm['date_block_num'] %= 12
X_test_subm['date_block_num'] %= 12

In [250]:
X_test_subm = data_full[data_full['date_block_num'] == 34].drop('target', axis=1)

X_test_subm['date_block_num'] %= 12

In [9]:
n = 40000
mask = np.random.choice(X_train_subm.index, n, replace=False)

In [10]:
time_start = time()
lgbm = LGBMRegressor(n_estimators=100, n_jobs=8)
lgbm.fit(X_train_subm.loc[mask], y_train_subm[mask])

print("Time spent:", time() - time_start)

Time spent: 9.335937738418579


In [251]:
submision_sample['item_cnt_month'] = model.predict(X_test_subm)
submision_sample['item_cnt_month'] = submision_sample['item_cnt_month'].clip(0, 20).astype('float32')

Solvator-2000 is predicting...







  0%|                                                    | 0/2 [00:00<?, ?it/s]




 50%|██████████████████████                      | 1/2 [00:05<00:05,  5.42s/it]




100%|████████████████████████████████████████████| 2/2 [00:05<00:00,  2.84s/it]


Solvator-2000 finished predicting


In [252]:
submision_sample.to_csv('./results/submission_lgbm.csv', index=False)