In [2]:
import gc
import sys

sys.path.append('..')

import numpy as np
import pandas as pd
import scipy.stats as sps

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline 

from src.utils.cross_validation import TimeSeriesGroupSplit

sns.set(font_scale=1.2)
%matplotlib inline

In [3]:
random_state = 42

# Random forest

In this notebook we will produce predictions by random forest.

## Loading datasets

In this section we will load all datasets and prepare them for training.

In [4]:
train = pd.read_hdf('../data/processed/train.h5', 'table')
test = pd.read_hdf('../data/processed/test.h5', 'table')

items = pd.read_csv('../data/processed/items.csv')
tfidf_truncated_svd = pd.read_hdf('../data/processed/text/tfidf_truncated-svd.h5', 'table')

Add text features to train.

In [5]:
tfidf_truncated_svd['item_id'] = items.item_id

In [6]:
train = pd.merge(
    train,
    tfidf_truncated_svd,
    how='left', on='item_id'
)
gc.collect();

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6332358 entries, 0 to 6332357
Columns: 111 entries, shop_id to item_name_tfidf_truncated-svd_49
dtypes: bool(2), float32(94), int32(9), object(6)
memory usage: 2.8+ GB


Remain only rows, that contains item_id, shop_id that present in test dataset. Without this cut our data will be too big for training

In [8]:
train = train[train.item_in_test & train.shop_in_test]
train.drop(columns=['item_in_test', 'shop_in_test'], inplace=True)

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2259006 entries, 7551 to 6332357
Columns: 109 entries, shop_id to item_name_tfidf_truncated-svd_49
dtypes: float32(94), int32(9), object(6)
memory usage: 1008.2+ MB


Remove target from train.

In [10]:
y = train.target
train.drop(columns=['target'], inplace=True)

Remove text columns.

In [24]:
X_train.dtypes[X_train.dtypes == 'object']

item_name                  object
item_full_category_name    object
item_category_name         object
item_subcategory_name      object
shop_name                  object
city                       object
dtype: object

Fill NaNs.

In [11]:
train.columns[train.isna().sum() > 0]

Index(['num_residents'], dtype='object')

As we expected there are some problems only with `num_residents`. We can fill it with zero, because it will be border value for this feature, trees can handle it properly.

In [12]:
train.fillna(0, inplace=True)

Create validation split.

In [13]:
X_valid = train[train.date_block_num == 33]
X_train = train[train.date_block_num < 33]
y_valid = y[train.date_block_num == 33]
y_train = y[train.date_block_num < 33]
del train
gc.collect()

116

## Grid search

In this section we will find optimum parameters for a model.

## Validation

In this section we will validate best parameters using haldout.

In [14]:
rf = RandomForestRegressor(n_estimators=10, random_state=random_state)

In [17]:
rf.fit(X_train, y_train)

ValueError: could not convert string to float: 'ГАДКИЙ Я 1-2 (BD)'

## Submit

In this section we will train result model and submit prediction.

## OOF predictions

In this section we will create out-of-fold predictions for stacking.