# Model tuning
- Interactive tuning
- Parameters selection
- Clear solution pipeline

In [1]:
# Modules import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn import linear_model, metrics, preprocessing

from time import time

# Settings
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)
sns.set(style="white")

In [2]:
data_validation = pd.read_csv('./results/dataset_for_validation.csv').drop('index', axis=1)
data_full = pd.read_csv('./results/dataset_for_test.csv').drop('category_0', axis=1)
val_answers = pd.read_csv('./results/validation_answers.csv', header=None)

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error as mse

Few words should be said about unpacking our data. To separate our test and train data we use "date_block_num" value, but then we should make a feature out of it by making a number of month out of it.

Validation, methods comparison.

In [4]:
X_train = data_validation[data_validation['date_block_num'] < 33].drop('target', axis=1)
X_test = data_validation[data_validation['date_block_num'] == 33].drop('target', axis=1)
y_train = data_validation[data_validation['date_block_num'] < 33]['target']
y_test = val_answers[1]

X_train['date_block_num'] %= 12
X_test['date_block_num'] %= 12

**Basic lgbm**. It was vey helpful to use it to find out if dataset was built correctly without leakages and with needed features.

In [5]:
time_start = time()
random_forest = LGBMRegressor(n_estimators=100, n_jobs=2)
random_forest.fit(X_train, y_train)

print("mse is", mse(y_test, random_forest.predict(X_test.fillna(0))))
print("Time spent:", time() - time_start)

mse is 1.5237335160776573
Time spent: 0.8008608818054199


Submission

In [6]:
X_train_subm = data_full[data_full['date_block_num'] < 34].drop('target', axis=1)
X_test_subm = data_full[data_full['date_block_num'] == 34].drop('target', axis=1)
y_train_subm = data_full[data_full['date_block_num'] < 34]['target']
y_test_subm = data_full[data_full['date_block_num'] == 34]['target']

X_train_subm['date_block_num'] %= 12
X_test_subm['date_block_num'] %= 12

In [7]:
time_start = time()
random_forest = LGBMRegressor(n_estimators=50, n_jobs=2)
random_forest.fit(X_train_subm, y_train_subm)

print("Time spent:", time() - time_start)

Time spent: 0.42386698722839355


In [8]:
submm = pd.read_csv('./readonly/sample_submission.csv')
submm['item_cnt_month'] = random_forest.predict(X_test_subm)
submm.to_csv('./results/sample_submission.csv', index=False)