In [1]:
# Modules import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn import metrics

# Setting
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

In [2]:
# Import data
sales_train        = pd.read_csv('./readonly/sales_train.csv')
test               = pd.read_csv('./readonly/test.csv')
items              = pd.read_csv('./readonly/items.csv')
item_categories    = pd.read_csv('./readonly/item_categories.csv')
shops              = pd.read_csv('./readonly/shops.csv')

In [3]:
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [4]:
# Hyperparametres
items_sample_size = 50

In [5]:
# Include items and shops only from test
test_shops = test['shop_id'].unique()
test_items = test['item_id'].unique()

data = sales_train[
    sales_train['shop_id'].isin(test_shops) & 
    sales_train['item_id'].isin(test_items)].copy()

# Collect item prices (taking into account that prices should not differ much from shop to shop)
prices = sales_train.groupby(['item_id'])[['item_price']].mean()
data.drop('item_price', axis = 1, inplace = True)

# Drop column "date", which is not going to be used
data.drop('date', axis = 1, inplace = True)

# Get rid of mistakes in selection
data = data[data['item_cnt_day'] >= 0]

# If an item is not sold, our model should predict this fact
sp = []
for item in np.random.choice(test_items, items_sample_size, replace=False):
    for date_block in range(34):
        for shop in test_shops:
            sp.append([date_block, shop, item, 0])
zero_sales = pd.DataFrame(sp, columns = data.columns)
data = pd.concat([data, zero_sales])

# Making "target" values
data = data.groupby(["date_block_num", "shop_id", "item_id"])['item_cnt_day'].sum().reset_index()
data.rename(columns = {'item_cnt_day' : 'target'}, inplace=True)
data.loc[:, ('target')] = data['target'].astype('int32')

# Add categories' info
data = data.join(items.set_index('item_id')['item_category_id'], on='item_id')
data = data.join(prices, on=['item_id'])

# Change test to a format of "data" variable
build_test = test.join(items.set_index('item_id')['item_category_id'], on='item_id')
build_test = build_test.join(prices, on='item_id')
build_test.drop('ID', axis=1, inplace=True)

In [6]:
data.drop('date_block_num', axis=1, inplace=True)

Let's conider the following format of data (conditions for the work of the code below): \
**Columns** : shop_id - item_id - item_category_id - item_price \
shop_id, item_id, item_category_id can't have NaNs \
item_price can have NaNs

In [42]:
# This function makes features which depend on a part of known dataset, such as mean values
# Also it doesn't matter if "test_input" has "target" column (for validation) or not (for the real test)
def preparation(data_input, test_input, mean_encoding, indexes_union_for_dummies):  
    data_train = data_input.copy()
    data_train.loc[:, 'item_price'].fillna(data_train['item_price'].mean(), inplace=True)

    data_test = test_input.copy()
    data_test.loc[:, 'item_price'].fillna(data_train['item_price'].mean(), inplace=True)
    
    # Mean encoding
    for column in mean_encoding:
        collect_means = data_train.groupby(column)[['target']].mean()
        collect_means.rename(columns = {'target' : 'mean_' + column}, inplace=True)

        data_train = data_train.join(collect_means, on=column)
        data_test = data_test.join(collect_means, on=column)
        data_test = data_test.fillna(method='ffill', axis=1)
       
    # One Hot encoding
    for column in list(indexes_union_for_dummies.keys()):
        ids_union = indexes_union_for_dummies[column]
        data_train.loc[:, column] = data_train[column].astype(pd.CategoricalDtype(categories=ids_union))
        data_test.loc[:, column] = data_test[column].astype(pd.CategoricalDtype(categories=ids_union))

        data_train = data_train.join(pd.get_dummies(data_train[column], prefix=column))
        data_test = data_test.join(pd.get_dummies(data_test[column], prefix=column))

        data_train.drop(column, axis=1, inplace=True)
        data_test.drop(column, axis=1, inplace=True)
        
    return data_train, data_test

#### Approach 0: Linear Regression

In [56]:
kf5 = KFold(n_splits = 5, shuffle=True)
indexes_union_for_dummies = dict()
mean_encoding =  []

for train_index, test_index in kf5.split(data):
    data_train, data_test = preparation(data.loc[train_index], data.loc[test_index], mean_encoding, indexes_union_for_dummies)
    
    X_train, X_test = data_train.drop('target', axis = 1), data_test.drop('target', axis = 1)
    y_train, y_test = data_train['target'].astype('float32'), data_test['target'].astype('float32')
    
    lr = linear_model.LinearRegression()
    lr.fit(X_train, y_train)
    print(metrics.mean_squared_error(y_test, lr.predict(X_test)))

104.9424183467713
109.95931521177557
142.80673025987474
113.93789702529644
104.41515470788808


#### Approach 1 : Mean encoding, Linear Regression

In [57]:
kf5 = KFold(n_splits = 5, shuffle=True)
indexes_union_for_dummies = dict()
mean_encoding =  ['shop_id', 'item_id', 'item_category_id']

for train_index, test_index in kf5.split(data):
    data_train, data_test = preparation(data.loc[train_index], data.loc[test_index], mean_encoding, indexes_union_for_dummies)
    
    X_train, X_test = data_train.drop('target', axis = 1), data_test.drop('target', axis = 1)
    y_train, y_test = data_train['target'].astype('float32'), data_test['target'].astype('float32')
    
    lr = linear_model.LinearRegression()
    lr.fit(X_train, y_train)
    print(metrics.mean_squared_error(y_test, lr.predict(X_test)))

75.9559944797769
79.74611194506095
60.093020590337574
103.83677194208866
59.1070037994563


#### Approach 2 : Mean encoding, One Hot encoding, Linear Regression

In [48]:
kf5 = KFold(n_splits = 5, shuffle=True)
indexes_union_for_dummies = dict()
mean_encoding =  ['shop_id', 'item_id', 'item_category_id']
for column in ['shop_id', 'item_category_id']:
    indexes_union_for_dummies[column] = list(set(build_test[column].unique().tolist()).union(set(data[column].unique().tolist())))

for train_index, test_index in kf5.split(data):
    data_train, data_test = preparation(data.loc[train_index], data.loc[test_index], mean_encoding, indexes_union_for_dummies)
    
    X_train, X_test = data_train.drop('target', axis = 1), data_test.drop('target', axis = 1)
    y_train, y_test = data_train['target'].astype('float32'), data_test['target'].astype('float32')
    
    lr = linear_model.LinearRegression()
    lr.fit(X_train, y_train)
    print(metrics.mean_squared_error(y_test, lr.predict(X_test)))

59.31937031709726
70.72295966829721
58.32318332747794
98.3485151675847
91.13921511935841


So I have chosen approach N

In [54]:
data_train, data_test = preparation(data, build_test, mean_encoding, indexes_union_for_dummies)

lr = linear_model.LinearRegression()
lr.fit(data_train.drop('target', axis = 1), data_train['target'].astype('float32'))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [55]:
submission = pd.DataFrame(lr.predict(data_test))

submission.index.name = 'ID'
submission.columns = ['item_cnt_month']

submission.to_csv('./results/submbission.csv')