In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys

from sklearn.preprocessing import LabelEncoder
import scipy.stats as ss

if os.name=='nt':
    try:
        mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
        os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
    except:
        pass
    
import xgboost as xgb

%matplotlib inline

In [2]:
# Load data
train = pd.read_csv('all/sales_train.csv.gz')
test = pd.read_csv('all/test.csv.gz')
shop = pd.read_csv('all/shops-translated.csv')
item = pd.read_csv('all/item_category.csv')

In [3]:
test['date_block_num'] = train['date_block_num'].max()+1

In [4]:
test = test[['date_block_num', 'shop_id', 'item_id']]

In [5]:
item_price = train.groupby(['item_id'])['item_price'].agg(['mean', 'count', 'min', 'max', 'std'])

In [6]:
item_price['diff'] = item_price['max']-item_price['min']
item_price.head()

Unnamed: 0_level_0,mean,count,min,max,std,diff
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,58.0,1,58.0,58.0,,0.0
1,4490.0,6,4490.0,4490.0,0.0,0.0
2,58.0,2,58.0,58.0,0.0,0.0
3,79.0,2,58.0,100.0,29.698485,42.0
4,58.0,1,58.0,58.0,,0.0


In [7]:
item_price.sort_values(by=['std', 'count'], axis=0, ascending=False, inplace=True)

In [9]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [8]:
x['date'] = pd.to_datetime(x['date'], format='%d.%m.%Y')

NameError: name 'x' is not defined

In [None]:
x.sort_values(by=['date'], axis=0, inplace=True, ascending=True)

In [None]:
plt.figure(figsize=(14, 7))
plt.plot(x.date, x.item_price)

### Join train and test

In [None]:
x = train.groupby(['date_block_num', 'shop_id', 'item_id'])[['item_cnt_day', 'item_price']].agg(['sum', 'mean', 'std', 'min', 'max'])

In [None]:
x = x[[('item_cnt_day', 'sum'), ('item_price', 'mean'), ('item_price', 'std')]]

In [None]:
x.fillna(0, inplace=True)

In [None]:
x.columns = ['cnt_sum', 'price_mean', 'price_std']

In [None]:
x = x.reset_index()

In [None]:
x = pd.concat((x, test))

In [None]:
x = x[['date_block_num', 'shop_id', 'item_id', 'price_mean', 'price_std', 'cnt_sum']]

In [None]:
x = pd.merge(x, shop, how='left')

In [None]:
x = pd.merge(x, item, how='left')

In [None]:
x.head()

In [None]:
x.tail()

### Sales (counts) per City per Month

In [None]:
u = x.groupby(['Type', 'date_block_num'])['cnt_sum'].sum()

In [None]:
for t in u.index.levels[0]:
    plt.plot(u[t].index, u[t], label=t)
plt.legend()

In [None]:
u = x.groupby(['City', 'date_block_num'])['cnt_sum'].sum()
plt.figure(figsize=(14, 7))
for t in u.index.levels[0]:
    plt.plot(u[t].index, u[t], label=t)
plt.legend(bbox_to_anchor=[1.1, 1])

In [None]:
le = LabelEncoder()

In [None]:
x.drop(['Name', 'item_name_translated', 'price_mean', 'price_std'], axis=1, inplace=True)

In [None]:
x['City'] = le.fit_transform(x['City'])
x['Type'] = le.fit_transform(x['Type'])
x['item_cat1'] = le.fit_transform(x['item_cat1'])

In [None]:
x['item_cat2'] = x['item_cat2'].apply(lambda x: str(x))

In [None]:
x['item_cat2'] = le.fit_transform(x['item_cat2'])

In [None]:
cols = ['City', 'Type', 'item_cat1', 'item_cat2']
for c in cols:
    freq = x[c].value_counts()/x[c].value_counts().sum()
    x[c+'_freq'] = x[c].map(freq)
    rank = pd.DataFrame(freq)
    rank[c] = ss.rankdata(freq)
    rank = rank.to_dict()[c]
    x[c+'_rank'] = x[c].map(rank)

In [None]:
x.head()

In [None]:
cnt = x.groupby('date_block_num')['cnt_sum'].sum()

In [None]:
plt.figure(figsize=(14, 7))
plt.plot(np.arange(1, 13), cnt[:12], label=2013)
plt.plot(np.arange(1, 13), cnt[12:24], label=2014)
plt.plot(np.arange(1, 11), cnt[24:-1], label=2015)
plt.legend()

In [None]:
x['month'] = x['date_block_num']%12

In [None]:
x.head()

In [None]:
# number of zero (shop, item) pairs every month
cnt0 = pd.Series(x[x['cnt_sum']==0].groupby('date_block_num')['cnt_sum'].count())
cnt0[34] = 0
x['cnt_zero_-1m'] = x['date_block_num'].map(cnt0.shift(1))

In [None]:
# u = x[['date_block_num', 'shop_id', 'item_id', 'cnt_sum']].sort_values(['shop_id', 'item_id', 'date_block_num'])

In [None]:
param = {'max_depth':10, 
         'subsample':1,
         'min_child_weight':0.5,
         'eta':0.3, 
         'num_round':1000, 
         'seed':1,
         'silent':0,
         'eval_metric':'rmse'}

progress = dict()
xgbtrain = xgb.DMatrix(train_cleaned_df.iloc[:,  (train_cleaned_df.columns != 33)].values, train_cleaned_df.iloc[:, train_cleaned_df.columns == 33].values)
watchlist  = [(xgbtrain,'train-rmse')]

bst = xgb.train(param, xgbtrain)
preds = bst.predict(xgb.DMatrix(train_cleaned_df.iloc[:,  (train_cleaned_df.columns != 33)].values))
from sklearn.metrics import mean_squared_error 
rmse = np.sqrt(mean_squared_error(preds,train_cleaned_df.iloc[:, train_cleaned_df.columns == 33].values))
print(rmse)