In [1]:
#import required libraries
import pandas as pd
import numpy as np  

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error

In [2]:
#Data Acquisition

In [3]:
categories = pd.read_csv("categories.csv")
categories.head()

Unnamed: 0,category_name,category_id
0,PC - Headsets / Headphones,0
1,Accessories - PS2,1
2,Accessories - PS3,2
3,Accessories - PS4,3
4,Accessories - PSP,4


In [4]:
sales_train = pd.read_csv("sales_train.csv")
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [5]:
test = pd.read_csv("test.csv")
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [6]:
shops= pd.read_csv("shops.csv")
shops.head()

Unnamed: 0,shop_name,shop_id
0,"! Yakutsk Ordzhonikidze, 56 francs",0
1,"! Yakutsk TC ""Central"" fran",1
2,"Adygea TC ""Mega""",2
3,"Balashikha TC ""Oktyabr-Kinomir""",3
4,"Volga TC ""Volga Mall""",4


In [7]:
items= pd.read_csv("items.csv")
items.head()

Unnamed: 0,item_id,category_id,item_name
0,0,40,!! IN THE POWER OF HAPPINESS (PLAST) D
1,1,76,! ABBYY FineReader 12 Professional Edition Ful...
2,2,40,*** IN THE GLORY OF THE GLORY (UNV) D
3,3,40,*** BLUE WAVE (Univ) D
4,4,40,*** BOX (GLASS) D


In [8]:
#Data Preprocessing

In [9]:
sales_train.isnull().sum()

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

In [10]:
sales_train.drop(['date_block_num','item_price'], axis=1, inplace=True)

In [11]:
sales_train['date'] = pd.to_datetime(sales_train['date'], dayfirst=True)
sales_train.head()

Unnamed: 0,date,shop_id,item_id,item_cnt_day
0,2013-01-02,59,22154,1.0
1,2013-01-03,25,2552,1.0
2,2013-01-05,25,2552,-1.0
3,2013-01-06,25,2554,1.0
4,2013-01-15,25,2555,1.0


In [12]:
sales_train['date'] = sales_train['date'].apply(lambda x: x.strftime('%Y-%m'))
sales_train.head()

Unnamed: 0,date,shop_id,item_id,item_cnt_day
0,2013-01,59,22154,1.0
1,2013-01,25,2552,1.0
2,2013-01,25,2552,-1.0
3,2013-01,25,2554,1.0
4,2013-01,25,2555,1.0


In [13]:
df = sales_train.groupby(['date','shop_id','item_id']).sum()
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_cnt_day
date,shop_id,item_id,Unnamed: 3_level_1
2013-01,0,32,6.0
2013-01,0,33,3.0
2013-01,0,35,1.0
2013-01,0,43,1.0
2013-01,0,51,2.0


In [14]:
df = df.pivot_table(index=['shop_id','item_id'], columns='date', values='item_cnt_day', fill_value=0)
df.reset_index(inplace=True)
df.head()

date,shop_id,item_id,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,0,30,0,31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,31,0,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,32,6,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,33,3,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,35,1,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
test = pd.merge(test, df, on=['shop_id','item_id'], how='left')
test.drop(['ID', '2013-01'], axis=1, inplace=True)
test = test.fillna(0)
test.head()

Unnamed: 0,shop_id,item_id,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,2013-09,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,5,5037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,5,5320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,5233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,5,5232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,5,5268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
print(test.shape,df.shape)

(214200, 35) (424124, 36)


In [17]:
#Model Development

In [18]:
Y_train = df['2015-10'].values
X_train = df.drop(['2015-10'], axis = 1)
X_test = test

print(X_train.shape, Y_train.shape)
print(X_test.shape)

(424124, 35) (424124,)
(214200, 35)


In [19]:
x_train, x_test, y_train, y_test = train_test_split( X_train, Y_train, test_size=0.2, random_state=4)
print ('Train set:', x_train.shape,  y_train.shape)
print ('Test set:', x_test.shape,  y_test.shape)

Train set: (339299, 35) (339299,)
Test set: (84825, 35) (84825,)


In [20]:
LR = LinearRegression()
LR.fit(x_train,y_train)

print('Train set mse:', mean_squared_error(y_train, LR.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, LR.predict(x_test)))
print('Test set score:', LR.score(x_train,y_train))

Train set mse: 10.858441769616823
Test set mse: 5.9239469009304155
Test set score: 0.44407831810469345


In [21]:
RFR = RandomForestRegressor(n_estimators = 100)
RFR.fit(x_train,y_train)

print('Train set mse:', mean_squared_error(y_train, RFR.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, RFR.predict(x_test)))
print('Test set score:', RFR.score(x_train,y_train))

Train set mse: 1.6066838508218417
Test set mse: 1.0861894076038903
Test set score: 0.9177423052428981


In [22]:
prediction = RFR.predict(X_test)

In [23]:
prediction = list(map(round, prediction))

In [24]:
#Submission

In [25]:
submission = pd.read_csv('sample_submission.csv')
print(submission.shape)
submission.head()

(214200, 2)


Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


In [26]:
submission['item_cnt_month'] = prediction
submission.to_csv('prediction.csv', index=False)
submission.head()

Unnamed: 0,ID,item_cnt_month
0,0,0
1,1,1
2,2,1
3,3,0
4,4,1
