In [1]:
import numpy as np
import pandas as pd

import plotly.express as px

from itertools import product
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import time

In [2]:
# Load all data
items = pd.read_csv('items.csv')
shops = pd.read_csv('shops.csv')
cats = pd.read_csv('item_categories.csv')
train = pd.read_csv('sales_train.csv')
test  = pd.read_csv('test.csv').set_index('ID')

In [3]:
# Number of items
group = items.groupby(['item_category_id'])['item_id'].count()
group = group.sort_values(ascending=False)
group = group.iloc[0:10].reset_index()

group.columns = ['item_category_id', 'count']
group['item_category_id'] = group['item_category_id'].astype(str)

fig = px.bar(group, x='item_category_id', y='count', title='Number of items per category')
fig.show()

In [4]:
# Visualization of time trend
# Here we see that most of the sales are done at Christmas
train_targets = train.groupby(["date_block_num"])["item_cnt_day"].sum()
fig = px.line(train_targets, title='Trend of target variable')
fig.show()

In [5]:
# Distribution of target variable
# On this plot we can see that majority of target variables has value 110.000
fig = px.histogram(train_targets, nbins=20, title="Distribution of target variable")
fig.show()

In [6]:
# Text features
# Extracting type of item from item_category_name
cats['type'] = cats['item_category_name'].str.split('-').map(lambda x: x[0].strip())
cats['type_id'] = LabelEncoder().fit_transform(cats['type'])

# Extracting subtype of item from item_category_name
cats['subtype'] = cats['item_category_name'].str.split('-').map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
cats['subtype_id'] = LabelEncoder().fit_transform(cats['subtype'])
cats = cats[['item_category_id','type_id', 'subtype_id']]

# Extracting feature city from shop_name
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
shops['city_id'] = LabelEncoder().fit_transform(shops['city'])
shops = shops[['shop_id','city_id']]

# Dropping item_name column because we already have item_id
items.drop(['item_name'], axis=1, inplace=True)

In [7]:
# Creating table which contains all combinations of data_block_num, shop_id and item_id
full_df = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = train[train.date_block_num == i]
    full_df.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique()))))

full_df = pd.DataFrame(np.vstack(full_df), columns=cols)
full_df.sort_values(cols, inplace=True)

In [8]:
# Creating feature revenue
train['revenue'] = train['item_price'] * train['item_cnt_day']

In [9]:
# Creating target variable which is sum of variable item_cnt_day in each month
group = train.groupby(['date_block_num','shop_id','item_id']).agg({'item_cnt_day': ['sum']})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)
full_df = pd.merge(full_df, group, on=cols, how='left')
full_df['item_cnt_month'] = (full_df['item_cnt_month'].fillna(0).clip(0, 20).astype(np.float16))

In [10]:
# Test block contains only data for the last month which is 34`s one
test['date_block_num'] = 34

In [11]:
# Concatenation train and test data for generating features for each
full_df = pd.concat([full_df, test], ignore_index=True, sort=False, keys=cols)
full_df.fillna(0, inplace=True)

In [12]:
# Merging all tables with full_df for getting features that was generated before
full_df = pd.merge(full_df, shops, on=['shop_id'], how='left')
full_df = pd.merge(full_df, items, on=['item_id'], how='left')
full_df = pd.merge(full_df, cats, on=['item_category_id'], how='left')

In [13]:
# Visualization number of sales in each category
# Here we can see distribution of sales
group = full_df.copy().astype(int).groupby('item_category_id')['item_id'].count().sort_values().reset_index()
group.columns = ['category_id', 'count']
group['category_id'] = group['category_id'].astype(str)
fig = px.bar(group, x='category_id', y='count', title='Popularity of different categories', width=1100)
fig.show()

In [14]:
# Visualization number of seles in each shop
group = full_df.copy().astype(int).groupby('shop_id')['item_cnt_month'].sum().sort_values().reset_index()
group.columns = ['shop_id', 'sales']
group['shop_id'] = group['shop_id'].astype(str)
fig = px.bar(group, x='shop_id', y='sales', title='Number of sales in shops', width=1100)
fig.show()

In [15]:
# Function that adds lag to atribute col
def add_lag_of_feature(df, col):
    shifted = df[['date_block_num','shop_id','item_id',col]].copy()
    shifted.columns = ['date_block_num','shop_id','item_id', col + '_lag']
    shifted['date_block_num'] += 1
    df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

In [16]:
# Adding lag for target variable
full_df = add_lag_of_feature(full_df, 'item_cnt_month')

# Adding lag for mean target variable
group = full_df.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_item_avg_cnt']
group.reset_index(inplace=True)
full_df = pd.merge(full_df, group, on=['date_block_num','item_id'], how='left')
full_df = add_lag_of_feature(full_df, 'date_item_avg_cnt')

# Adding lag for mean target variable in each subtype of items
group = full_df.groupby(['date_block_num', 'shop_id', 'subtype_id']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_shop_subtype_avg_cnt']
group.reset_index(inplace=True)
full_df = pd.merge(full_df, group, on=['date_block_num', 'shop_id', 'subtype_id'], how='left')
full_df = add_lag_of_feature(full_df, 'date_shop_subtype_avg_cnt')

# Adding lag for mean target variable in each category
group = full_df.groupby(['date_block_num', 'shop_id', 'item_category_id']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_shop_cat_avg_cnt']
group.reset_index(inplace=True)
full_df = pd.merge(full_df, group, on=['date_block_num', 'shop_id', 'item_category_id'], how='left')
full_df = add_lag_of_feature(full_df, 'date_shop_cat_avg_cnt')

full_df.drop(['date_shop_cat_avg_cnt', 'date_item_avg_cnt', 'date_shop_subtype_avg_cnt'], axis=1, inplace=True)

# Adding features month, number of days from the first sale and first sale in particular shop
full_df['month'] = full_df['date_block_num'] % 12
full_df['item_shop_first_sale'] = full_df['date_block_num'] - full_df.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
full_df['item_first_sale'] = full_df['date_block_num'] - full_df.groupby('item_id')['date_block_num'].transform('min')

In [17]:
# First month contains many missing values, so it is better to get rid of it 
full_df = full_df[full_df.date_block_num > 1]
full_df.fillna(0, inplace=True)

In [18]:
# Saving all data in pickle format
full_df.to_pickle('data.pkl')

In [19]:
# Loading data from pickle format
data = pd.read_pickle('data.pkl')

In [20]:
# Spliting data into train, validation and test set
X_train = data[data.date_block_num < 33].drop(['item_cnt_month'], axis=1)
Y_train = data[data.date_block_num < 33]['item_cnt_month']
X_valid = data[data.date_block_num == 33].drop(['item_cnt_month'], axis=1)
Y_valid = data[data.date_block_num == 33]['item_cnt_month']
X_test = data[data.date_block_num == 34].drop(['item_cnt_month'], axis=1)

In [21]:
# Loading model (instead of training it from the beginning)
model = pickle.load(open('model.sav', 'rb'))

In [22]:
# Defining model
# I tried RandomForestRegressor with different parameters of max_depth (from 3 to 9), and in my opinion 7 is the most optimal
# Another parameters of this model didn't improve the result

model = RandomForestRegressor(max_depth=7, random_state=42, n_jobs=-1)
start = time.time()
model.fit(X_train, Y_train)
time.time() - start

1554.7694432735443

In [23]:
# Saving model
pickle.dump(model, open('model.sav', 'wb'))

In [24]:
# Evaluating model on validation set
# RMSE is 1.05 on validation set and 0.93 on privat test data
mean_squared_error(Y_valid, np.sqrt(model.predict(X_valid)))

1.0575363382863245

In [25]:
# visualizing feature importance
importances = model.feature_importances_
indices = np.argsort(importances)
fig = px.bar(x=importances[indices], y=X_test.columns[indices], width=1000, height=500)
fig.show()

In [26]:
# Creating submission.csv
Y_test = model.predict(X_test)

submission = pd.DataFrame({
    "ID": test.index, 
    "item_cnt_month": Y_test
})
submission.to_csv('submission.csv', index=False)