In [60]:
# Environment Setup
import numpy    as np
import pandas   as pd
import seaborn  as sns
import matplotlib.pyplot as plt
import sklearn  as skl
import datetime

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import linear_model    # LogisticRegression
from sklearn import set_config


## Loading the data

In [61]:
categories = pd.read_csv("categories.csv")
items = pd.read_csv("items.csv")
shops = pd.read_csv("shops.csv")

In [62]:
# Loading datasets
df_train = pd.read_csv('sales_train.csv')
df_test = pd.read_csv('test.csv')
items_cat = pd.read_csv('item_categories.csv')

In [63]:
df_train

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.00,1.0
1,03.01.2013,0,25,2552,899.00,1.0
2,05.01.2013,0,25,2552,899.00,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.00,1.0
...,...,...,...,...,...,...
2935844,10.10.2015,33,25,7409,299.00,1.0
2935845,09.10.2015,33,25,7460,299.00,1.0
2935846,14.10.2015,33,25,7459,349.00,1.0
2935847,22.10.2015,33,25,7440,299.00,1.0


In [66]:
df_test

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268
...,...,...,...
214195,214195,45,18454
214196,214196,45,16188
214197,214197,45,15757
214198,214198,45,19648


In [64]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date            object 
 1   date_block_num  int64  
 2   shop_id         int64  
 3   item_id         int64  
 4   item_price      float64
 5   item_cnt_day    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB


In [65]:
df_train['date'] = pd.to_datetime(df_train['date'])
df_train.dtypes

date              datetime64[ns]
date_block_num             int64
shop_id                    int64
item_id                    int64
item_price               float64
item_cnt_day             float64
dtype: object

In [71]:
df_sample = df_train[df_train['date_block_num'] == 33 ]
df_sample

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
2882335,2015-10-23,33,45,13315,649.0,1.0
2882336,2015-05-10,33,45,13880,229.0,1.0
2882337,2015-02-10,33,45,13881,659.0,1.0
2882338,2015-12-10,33,45,13881,659.0,1.0
2882339,2015-04-10,33,45,13923,169.0,1.0
...,...,...,...,...,...,...
2935844,2015-10-10,33,25,7409,299.0,1.0
2935845,2015-09-10,33,25,7460,299.0,1.0
2935846,2015-10-14,33,25,7459,349.0,1.0
2935847,2015-10-22,33,25,7440,299.0,1.0


In [83]:
agg_func_count = {'item_cnt_day': ["min", "max", "mean", "count", "sum"]}
df_sample.groupby(['shop_id','item_id']).agg(agg_func_count)
#x.keys()

Unnamed: 0_level_0,Unnamed: 1_level_0,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,count,sum
shop_id,item_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2,31,1.0,1.0,1.0,1,1.0
2,486,1.0,1.0,1.0,3,3.0
2,787,1.0,1.0,1.0,1,1.0
2,794,1.0,1.0,1.0,1,1.0
2,968,1.0,1.0,1.0,1,1.0
...,...,...,...,...,...,...
59,22087,1.0,3.0,2.0,3,6.0
59,22088,1.0,1.0,1.0,2,2.0
59,22091,1.0,1.0,1.0,1,1.0
59,22100,1.0,1.0,1.0,1,1.0


In [88]:
df = df_sample.groupby(['shop_id','item_id'])['item_cnt_day'].sum().reset_index()
#df = df.pivot_table(index=['shop_id','item_id'],values='item_cnt_day',fill_value=0)
#df.reset_index(inplace=True)
df.head()

Unnamed: 0,shop_id,item_id,item_cnt_day
0,2,31,1.0
1,2,486,3.0
2,2,787,1.0
3,2,794,1.0
4,2,968,1.0


In [89]:
results = pd.merge(left = df_test,         # Left table for the join
                   right = df, # Right table for the join
                   on=["shop_id", "item_id"], # Common keys
                   how='left')          # Type of join

results.head()

Unnamed: 0,ID,shop_id,item_id,item_cnt_day
0,0,5,5037,
1,1,5,5320,
2,2,5,5233,1.0
3,3,5,5232,
4,4,5,5268,


In [90]:
print("% of missings:", results.item_cnt_day.isna().sum() / len(results) * 100)

% of missings: 86.61064425770309


In [91]:

results.item_cnt_day = results.item_cnt_day.fillna(0)
results.head()

Unnamed: 0,ID,shop_id,item_id,item_cnt_day
0,0,5,5037,0.0
1,1,5,5320,0.0
2,2,5,5233,1.0
3,3,5,5232,0.0
4,4,5,5268,0.0


In [None]:
results.item_cnt_day = results.item_cnt_day.clip(lower=0, upper=20)
results.head()

In [None]:
sub["item_cnt_month"] = results["item_cnt_day"]
sub.head()

In [None]:
sub.to_csv("oct2015.csv")