In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime, date

In [2]:
import os
print(os.listdir("./all"))

['items.csv', 'sample_submission.csv', 'train.csv', 'shops.csv', 'item_categories.csv', 'test.csv']


In [3]:
train_data = pd.read_csv("./all/train.csv")
test_data = pd.read_csv("./all/test.csv")

In [4]:
train_data.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,ID
0,28.12.2014,23,25,12179,959.0,1.0,2317582
1,23.04.2014,15,25,14042,149.0,1.0,1576962
2,03.03.2014,14,12,11373,106.615385,13.0,1547382
3,08.12.2013,11,47,12107,599.0,1.0,1255724
4,23.01.2015,24,29,3341,2599.0,1.0,2372492


In [5]:
test_data.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,ID
0,06.02.2015,25,31,11208,699.0,2472142
1,21.11.2013,10,31,19525,149.0,1044690
2,03.01.2015,24,42,16169,299.0,2367559
3,18.07.2014,18,42,2737,199.0,1793501
4,13.04.2014,15,2,15229,1199.0,1585639


In [6]:
train_data.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,ID
count,2925849.0,2925849.0,2925849.0,2925849.0,2925849.0,2925849.0
mean,14.56971,33.00144,10197.24,890.7781,1.24265,1467904.0
std,9.423075,16.22739,6324.133,1729.5,2.620965,847516.7
min,0.0,0.0,0.0,-1.0,-22.0,0.0
25%,7.0,22.0,4476.0,249.0,1.0,733930.0
50%,14.0,31.0,9343.0,399.0,1.0,1467866.0
75%,23.0,47.0,15684.0,999.0,1.0,2201902.0
max,33.0,59.0,22169.0,307980.0,2169.0,2935848.0


In [7]:
train_data.columns

Index(['date', 'date_block_num', 'shop_id', 'item_id', 'item_price',
       'item_cnt_day', 'ID'],
      dtype='object')

In [8]:
print(train_data.shape)
print(test_data.shape)
test_data.columns

(2925849, 7)
(10000, 6)


Index(['date', 'date_block_num', 'shop_id', 'item_id', 'item_price', 'ID'], dtype='object')

In [9]:
test_data.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,ID
0,06.02.2015,25,31,11208,699.0,2472142
1,21.11.2013,10,31,19525,149.0,1044690
2,03.01.2015,24,42,16169,299.0,2367559
3,18.07.2014,18,42,2737,199.0,1793501
4,13.04.2014,15,2,15229,1199.0,1585639


In [34]:
train_data.isnull().sum()

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
ID                0
dtype: int64

In [35]:
test_data.isnull().sum()

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
ID                0
dtype: int64

In [36]:
train_data.select_dtypes(exclude='object').columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_price', 'item_cnt_day',
       'ID'],
      dtype='object')

In [38]:
# Pull the target column (item_cnt_day) out of the training frame; a later cell
# drops it from train_data so that only feature columns remain there.
Y_train = train_data.item_cnt_day
Y_train.head()

0     1.0
1     1.0
2    13.0
3     1.0
4     1.0
Name: item_cnt_day, dtype: float64

In [39]:
# Remove the target and the raw date string from the feature frames.
# The frames are rebound instead of mutated with inplace=True, and
# errors='ignore' is passed, so running this cell a second time is a no-op
# rather than a KeyError on the already-removed columns.
# NOTE(review): 'ID' is kept, so it is part of the feature matrix passed to
# fit()/predict() later on -- confirm that a row identifier is meant to be a feature.
train_data = train_data.drop(columns=['item_cnt_day', 'date'], errors='ignore')
test_data = test_data.drop(columns=['date'], errors='ignore')

In [40]:
test_data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_price', 'ID'], dtype='object')

In [41]:
train_data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_price', 'ID'], dtype='object')

In [42]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split, KFold

In [43]:
n_folds = 5

def rmse_cv(model):
    """Return the cross-validated RMSE of `model` on the training data.

    The model is evaluated on `train_data.values` against `Y_train` with
    `n_folds` shuffled folds (fixed seed 42).

    Returns:
        Array with one RMSE value per fold.
    """
    # Hand the KFold splitter itself to cross_val_score. Calling
    # .get_n_splits() on it yields only the integer fold count, which made
    # cross_val_score build its own default (unshuffled) splitter and
    # silently discarded shuffle=True / random_state=42.
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, train_data.values, Y_train,
                              scoring='neg_mean_squared_error', cv=kfold)
    # The scorer reports negated MSE, so flip the sign before taking the root.
    return np.sqrt(-neg_mse)

In [44]:
# Baseline model: XGBRegressor with the library's default hyper-parameters.
# Two alternative configurations were disabled in this cell (one of them via a
# bare triple-quoted string, which is an evaluated no-op expression rather than
# a comment); their settings are recorded here for reference only:
#   a) learning_rate=.05, random_state=9, n_estimators=2500
#   b) colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, max_depth=3,
#      min_child_weight=1.7817, n_estimators=2200, reg_alpha=0.4640,
#      reg_lambda=0.8571, subsample=0.5213, silent=1, random_state=7, nthread=-1
xgb_model = XGBRegressor()

In [None]:
# rmse_cv returns one RMSE value per fold; print their mean followed by the
# standard deviation in parentheses.
score = rmse_cv(xgb_model)
print("XGBoost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [45]:
def rmse(y, y_pred):
    """Root-mean-squared error between the targets `y` and the predictions `y_pred`."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

In [46]:
# Fit on the full training frame: every column still present in train_data
# (including ID) is used as a feature, with Y_train as the target.
xgb_model.fit(train_data, Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [47]:
# In-sample check: the predictions are made on the same rows the model was
# fitted on, so this RMSE describes the fit to the training data and is not an
# estimate of the error on unseen rows.
xgb_train_pred = xgb_model.predict(train_data)
print(rmse(Y_train, xgb_train_pred))

2.1432067605241527


In [48]:
# Predict on the test features and use the values as they come out of the model.
# The disabled expm1 variant below would only be the correct inverse transform if
# the model had been fitted on log1p(target); Y_train is the raw item_cnt_day column.
#xgb_pred = np.expm1(xgb_model.predict(test_data))
xgb_pred = xgb_model.predict(test_data)

In [49]:
# Submission frame: the test IDs next to the predicted item_cnt_day, written
# to CSV without the index column.
output = test_data[['ID']].assign(item_cnt_day=xgb_pred)
output.to_csv('future_sales_prediction.csv', index=False)

In [50]:
p = pd.read_csv('future_sales_prediction.csv')
p.head()

Unnamed: 0,ID,item_cnt_day
0,2472142,1.223237
1,1044690,1.129992
2,2367559,1.076268
3,1793501,1.100224
4,1585639,1.102048


In [52]:
p.count()

ID              10000
item_cnt_day    10000
dtype: int64

In [53]:
sam = pd.read_csv("./all/sample_submission.csv")
sam.head()

Unnamed: 0,ID,item_cnt_day
0,2472142,0
1,1044690,0
2,2367559,0
3,1793501,0
4,1585639,0


In [54]:
# Pair each prediction with its sample_submission row by ID instead of by
# position, so a different row order in the two tables cannot silently match a
# prediction with the wrong row.
# NOTE(review): the sample_submission rows displayed by sam.head() are all 0,
# which looks like a placeholder rather than ground truth -- if so, this value
# is not a measure of model accuracy; confirm before reporting it.
aligned = sam.merge(output, on='ID', suffixes=('_sample', '_pred'))
assert len(aligned) == len(sam) == len(output), \
    "ID mismatch between sample_submission and the predictions"
y1 = aligned['item_cnt_day_sample']
y2 = aligned['item_cnt_day_pred'].to_numpy()
print(rmse(y1, y2))

1.3803870419886668
