In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read every CSV file used by the notebook from the data directory.
path = "data/"

# Sales records keep the default RangeIndex.
df_train = pd.read_csv(f"{path}sales_train.csv")

# The remaining tables are indexed by their own identifier column.
df_test = pd.read_csv(f"{path}test.csv", index_col="ID")
df_shops = pd.read_csv(f"{path}shops.csv", index_col="shop_id")
df_items = pd.read_csv(f"{path}items.csv", index_col="item_id")
df_itemcat = pd.read_csv(f"{path}item_categories.csv", index_col="item_category_id")
sample_submission = pd.read_csv(f"{path}sample_submission.csv", index_col="ID")

In [2]:
# Summarize the training frame: column names, dtypes and memory usage.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date            object 
 1   date_block_num  int64  
 2   shop_id         int64  
 3   item_id         int64  
 4   item_price      float64
 5   item_cnt_day    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB


In [3]:
# Summarize the test frame: column names, non-null counts and dtypes.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214200 entries, 0 to 214199
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   shop_id  214200 non-null  int64
 1   item_id  214200 non-null  int64
dtypes: int64(2)
memory usage: 4.9 MB


In [4]:
# Count missing values per column in the training frame.
df_train.isnull().sum()

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

In [5]:
# Count missing values per column in the test frame.
df_test.isnull().sum()

shop_id    0
item_id    0
dtype: int64

In [6]:
# Preview the first rows of the raw training frame.
df_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [7]:
from datetime import datetime

def date_processing(df):
    """Replace the ``date`` string column with ``year``, ``month`` and ``day`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column holding ``DD.MM.YYYY`` strings.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``date`` and with integer ``year``, ``month`` and
        ``day`` columns appended after the existing columns. The frame passed
        in is not modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``date`` column.
    ValueError
        If a value does not match the ``%d.%m.%Y`` format.
    """
    # One vectorised parse instead of a Python-level strptime call per row.
    parsed = pd.to_datetime(df["date"], format="%d.%m.%Y")

    # Build the result from a copy so the caller's frame keeps its ``date``
    # column; int64 matches the dtype the row-wise version produced.
    return df.drop(columns=["date"]).assign(
        year=parsed.dt.year.astype("int64"),
        month=parsed.dt.month.astype("int64"),
        day=parsed.dt.day.astype("int64"),
    )

In [8]:
# Swap the string date column for numeric year/month/day columns.
# NOTE: df_train is rebound to the result, so running this cell a second time
# raises KeyError because the new frame no longer has a `date` column.
df_train = date_processing(df_train)

In [9]:
# Inspect the training frame after date processing.
df_train

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,year,month,day
0,0,59,22154,999.00,1.0,2013,1,2
1,0,25,2552,899.00,1.0,2013,1,3
2,0,25,2552,899.00,-1.0,2013,1,5
3,0,25,2554,1709.05,1.0,2013,1,6
4,0,25,2555,1099.00,1.0,2013,1,15
...,...,...,...,...,...,...,...,...
2935844,33,25,7409,299.00,1.0,2015,10,10
2935845,33,25,7460,299.00,1.0,2015,10,9
2935846,33,25,7459,349.00,1.0,2015,10,14
2935847,33,25,7440,299.00,1.0,2015,10,22


In [10]:
# Units sold per (shop_id, item_id) pair, summed for each day-of-month value;
# combinations with no sales are filled with 0.
df_train.pivot_table(
    values="item_cnt_day",
    index=["shop_id", "item_id"],
    columns="day",
    aggfunc="sum",
    fill_value=0,
)

Unnamed: 0_level_0,day,1,2,3,4,5,6,7,8,9,10,...,22,23,24,25,26,27,28,29,30,31
shop_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,30,0,0,0,0,0,0,0,0,0,0,...,2,3,0,0,3,0,0,0,0,0
0,31,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,1,0,0,0,0,0
0,32,0,0,2,0,1,1,0,0,0,0,...,2,0,0,2,0,0,0,0,0,1
0,33,0,0,1,0,0,0,1,0,0,0,...,1,0,0,0,1,0,1,0,0,0
0,35,1,2,0,1,0,2,0,2,1,1,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,22154,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59,22155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59,22162,1,0,0,0,0,0,1,0,1,0,...,2,0,1,0,1,0,0,1,0,0
59,22164,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0


In [11]:
# One row per (shop_id, item_id) pair and one column per date_block_num,
# holding the summed item_cnt_day; missing combinations become 0.
df = (
    df_train
    .pivot_table(
        values="item_cnt_day",
        index=["shop_id", "item_id"],
        columns=["date_block_num"],
        aggfunc="sum",
        fill_value=0,
    )
    .reset_index()
)

In [12]:
# Inspect the per-month pivot (shop_id, item_id, then one column per date_block_num).
df

date_block_num,shop_id,item_id,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,0,30,0,31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,31,0,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,32,6,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,33,3,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,35,1,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424119,59,22154,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
424120,59,22155,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
424121,59,22162,0,0,0,0,0,0,0,0,...,0,9,4,1,1,0,0,1,0,0
424122,59,22164,0,0,0,0,0,0,0,0,...,0,2,1,2,0,0,1,0,0,0


In [13]:
# Attach the monthly sales history to every test (shop_id, item_id) pair.
# Pairs absent from the pivot get 0 for every month; the key columns are then
# dropped so only the monthly columns remain.
df_merge = (
    df_test
    .merge(df, how="left", on=["shop_id", "item_id"])
    .fillna(0)
    .drop(columns=["shop_id", "item_id"])
)
df_merge

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
214196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214197,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Training features: every monthly column except the last (months 0-32).
X_train = df_merge.iloc[:, :-1]
print("X_train : ", X_train.shape)

# Training target: the last monthly column (actual sales of month 33).
y_train = df_merge.iloc[:, -1]
print("y_train : ", y_train.shape)

# Forecast features: the same window shifted forward one month (months 1-33).
X_test = df_merge.iloc[:, 1:]
print("X_test : ", X_test.shape)

X_train :  (214200, 33)
y_train :  (214200,)
X_test :  (214200, 33)


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Seed reused by the splits and the models in the following cells.
random_state = 200

# Hold out 20% of the rows for validation.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=random_state,
)

print("X_train_split : ", X_train_split.shape)
print("X_val_split : ", X_val_split.shape)
print("y_train_split : ", y_train_split.shape)
print("y_val_split : ", y_val_split.shape)

X_train_split :  (171360, 33)
X_val_split :  (42840, 33)
y_train_split :  (171360,)
y_val_split :  (42840,)


In [16]:
def cnt_predict(model, X_train, y_train, kind="default", print_len=7):
    """Fit ``model`` on a hold-out split and print a short train/validation report.

    The data is split 80/20 with ``train_test_split``. The seed is read from the
    notebook-level ``random_state`` variable, which must exist before the call.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train
        Feature table handed to ``train_test_split``.
    y_train
        Targets aligned with ``X_train`` (pandas Series or array-like).
    kind : str
        Free-form label shown in the report header.
    print_len : int
        Number of leading targets/predictions to print.

    Returns
    -------
    The same ``model`` object, fitted on the training part of the split.
    """
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, random_state=random_state, test_size=0.2
    )

    model.fit(X_fit, y_fit)
    y_pred_train = model.predict(X_fit)

    # np.asarray handles Series and ndarray targets alike, so the report no
    # longer depends on the targets having a ``.values`` attribute.
    y_fit_values = np.asarray(y_fit)
    y_val_values = np.asarray(y_val)

    model_name = model.__class__.__name__
    print("#### " + model_name + " month prediction(" + kind + ") ####")
    print("y_train[:{}] : {}".format(print_len, np.round(y_fit_values[:print_len], 1)))
    print("y_pred_train[:{}] : {}".format(print_len, np.round(y_pred_train[:print_len], 1)))
    print()

    y_pred_val = model.predict(X_val)
    print("y_val[:{}] : {}".format(print_len, np.round(y_val_values[:print_len], 1)))
    print("y_pred_val[:{}] : {}".format(print_len, np.round(y_pred_val[:print_len], 1)))

    # scikit-learn metrics take (y_true, y_pred) in that order.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    print("rmse : {:.6f}".format(rmse))
    print("----------\n")
    return model

In [17]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyper-parameters (seeded), fitted and reported by cnt_predict.
rf_regressor = RandomForestRegressor(random_state=random_state)
rf_regressor = cnt_predict(rf_regressor, X_train, y_train)

#### RandomForestRegressor month prediction(default) ####
y_train[:7] : [0. 1. 0. 0. 1. 0. 0.]
y_pred_train[:7] : [0.1 0.1 0.1 0.4 0.8 0.1 0.2]

y_val[:7] : [0. 0. 0. 0. 0. 0. 0.]
y_pred_val[:7] : [0.4 0.1 0.1 0.1 0.1 0.1 0.1]
rmse : 10.525327
----------



In [18]:
from lightgbm import LGBMRegressor

# LightGBM regressor with default hyper-parameters (seeded), fitted and reported by cnt_predict.
lgbm_regressor = LGBMRegressor(random_state=random_state)
lgbm_regressor = cnt_predict(lgbm_regressor, X_train, y_train)

#### LGBMRegressor month prediction(default) ####
y_train[:7] : [0. 1. 0. 0. 1. 0. 0.]
y_pred_train[:7] : [0.1 0.1 0.1 0.4 0.2 0.1 0.8]

y_val[:7] : [0. 0. 0. 0. 0. 0. 0.]
y_pred_val[:7] : [0.3 0.1 0.1 0.1 0.1 0.1 0.1]
rmse : 10.510442
----------



In [19]:
from xgboost import XGBRegressor

# XGBoost regressor with default hyper-parameters (seeded), fitted and reported by cnt_predict.
xgb_regressor = XGBRegressor(random_state=random_state)
xgb_regressor = cnt_predict(xgb_regressor, X_train, y_train)

#### XGBRegressor month prediction(default) ####
y_train[:7] : [0. 1. 0. 0. 1. 0. 0.]
y_pred_train[:7] : [0.1 0.1 0.1 0.4 0.2 0.1 0.9]

y_val[:7] : [0. 0. 0. 0. 0. 0. 0.]
y_pred_val[:7] : [0.4 0.1 0.1 0.1 0.1 0.1 0.1]
rmse : 10.322351
----------



In [41]:
def average_model_evaluation(models, X_train, y_train, kind="default", print_len=7):
    """Fit each model on a shared hold-out split and score their averaged prediction.

    The data is split 80/20 with ``train_test_split`` on ``X_train.values`` and
    ``y_train.values``. The seed is read from the notebook-level
    ``random_state`` variable, which must exist before the call. A report is
    printed for every model, followed by the RMSE of the element-wise mean of
    all validation predictions.

    Parameters
    ----------
    models
        Non-empty sequence of estimators exposing ``fit(X, y)`` and
        ``predict(X)``; each one is fitted in place.
    X_train, y_train
        Objects exposing ``.values`` (e.g. pandas DataFrame / Series).
    kind : str
        Free-form label shown in each report header.
    print_len : int
        Number of leading targets/predictions to print per model.

    Returns
    -------
    tuple
        ``(models, pred_vals)``: the sequence that was passed in (now fitted)
        and the list of per-model validation predictions.

    Raises
    ------
    ValueError
        If ``models`` is empty, since there is nothing to average.
    """
    # Fail early with a clear message instead of averaging an empty list.
    if not models:
        raise ValueError("average_model_evaluation needs at least one model")

    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train.values, y_train.values, random_state=random_state, test_size=0.2
    )

    pred_vals = []
    print("####### average model #######\n\n")
    for model in models:
        model.fit(X_train_split, y_train_split)
        y_pred_train = model.predict(X_train_split)
        model_name = model.__class__.__name__
        print("#### " + model_name + "  prediction(" + kind + ") ####")
        print("y_train[:{}] : {}".format(print_len, np.round(y_train_split[:print_len], 1)))
        print("y_pred_train[:{}] : {}".format(print_len, np.round(y_pred_train[:print_len], 1)))
        print()
        y_pred_val = model.predict(X_val_split)
        pred_vals.append(y_pred_val)
        print("y_val[:{}] : {}".format(print_len, np.round(y_val_split[:print_len], 1)))
        print("y_pred_val[:{}] : {}".format(print_len, np.round(y_pred_val[:print_len], 1)))
        # scikit-learn metrics take (y_true, y_pred) in that order.
        rmse = np.sqrt(mean_squared_error(y_val_split, y_pred_val))
        print("rmse : {:.6f}".format(rmse))
        print("----------\n")
    print("seperated model evaluation ended\n")

    print("### average model evaluation ###")
    # Element-wise mean across models gives the ensemble prediction.
    y_pred = np.mean(pred_vals, axis=0)
    print("average model validation predictions : ", y_pred)
    rmse = np.sqrt(mean_squared_error(y_val_split, y_pred))
    print("average model rmse : {:.6f}".format(rmse))
    print("-------------------------------")
    return models, pred_vals

def average_model_prediction(models, X_test, kind="default", print_len=20):
    """Predict with every model in ``models`` and return the element-wise mean.

    For each model this prints a header with its class name and ``kind``, the
    shape of ``X_test``, and the first ``print_len`` predictions rounded to one
    decimal. Each model receives ``X_test.values`` as its input.

    Parameters
    ----------
    models
        Iterable of already-fitted estimators exposing ``predict(X)``.
    X_test
        Object exposing ``.shape`` and ``.values`` (e.g. a pandas DataFrame).
    kind : str
        Free-form label shown in each header.
    print_len : int
        Number of leading predictions to print per model.

    Returns
    -------
    The mean of the per-model predictions taken along axis 0.
    """
    print("####### average model #######\n\n")

    per_model = []
    for estimator in models:
        header = "".join(
            ["#### ", estimator.__class__.__name__, " prediction(", kind, ") ####"]
        )
        print(header)
        print(X_test.shape)

        forecast = estimator.predict(X_test.values)
        preview = np.round(forecast[:print_len], 1)
        print(f"y_pred_test[:{print_len}] : {preview}")
        per_model.append(forecast)

    return np.mean(per_model, axis=0)

In [38]:
# Fresh, unfitted instances of the three regressors, all sharing the same seed.
rf_regressor = RandomForestRegressor(random_state=random_state)
lgbm_regressor = LGBMRegressor(random_state=random_state)
xgb_regressor = XGBRegressor(random_state=random_state)

# average_model_evaluation fits every model in the list in place and also
# returns the per-model validation predictions.
default_models = [rf_regressor, lgbm_regressor, xgb_regressor]
default_models, pred_vals = average_model_evaluation(default_models, X_train, y_train)

####### average model #######


#### RandomForestRegressor  prediction(default) ####
y_train[:7] : [0. 1. 0. 0. 1. 0. 0.]
y_pred_train[:7] : [0.1 0.1 0.1 0.4 0.8 0.1 0.2]

y_val[:7] : [0. 0. 0. 0. 0. 0. 0.]
y_pred_val[:7] : [0.4 0.1 0.1 0.1 0.1 0.1 0.1]
rmse : 10.525327
----------

#### LGBMRegressor  prediction(default) ####
y_train[:7] : [0. 1. 0. 0. 1. 0. 0.]
y_pred_train[:7] : [0.1 0.1 0.1 0.4 0.2 0.1 0.8]

y_val[:7] : [0. 0. 0. 0. 0. 0. 0.]
y_pred_val[:7] : [0.3 0.1 0.1 0.1 0.1 0.1 0.1]
rmse : 10.510442
----------

#### XGBRegressor  prediction(default) ####
y_train[:7] : [0. 1. 0. 0. 1. 0. 0.]
y_pred_train[:7] : [0.1 0.1 0.1 0.4 0.2 0.1 0.9]

y_val[:7] : [0. 0. 0. 0. 0. 0. 0.]
y_pred_val[:7] : [0.4 0.1 0.1 0.1 0.1 0.1 0.1]
rmse : 10.322351
----------

seperated model evaluation ended

### average model evaluation ###
average model validation predictions :  [0.34460424 0.10219104 0.10650432 ... 0.14587304 0.12487024 0.24563038]
average model rmse : 10.438851
----------------------

In [39]:
# Inspect the training feature window (monthly columns 0-32).
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
214196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214197,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# Average the fitted models' predictions on the shifted feature window.
y_pred = average_model_prediction(default_models, X_test)
print(y_pred)

####### average model #######


#### RandomForestRegressor prediction(default) ####
(214200, 33)
y_pred_test[:20] : [0.4 0.1 0.4 0.1 0.1 0.8 1.1 0.6 0.7 0.1 3.3 0.2 0.2 0.3 1.7 3.5 0.1 0.5
 1.7 0.1]
#### LGBMRegressor prediction(default) ####
(214200, 33)
y_pred_test[:20] : [0.5 0.1 0.7 0.2 0.1 0.5 0.9 0.1 1.3 0.1 3.7 0.2 0.1 0.3 1.5 4.5 0.1 0.1
 1.5 0.1]
#### XGBRegressor prediction(default) ####
(214200, 33)
y_pred_test[:20] : [0.6 0.1 0.6 0.2 0.1 0.5 0.9 0.1 1.1 0.1 4.7 0.2 0.1 0.3 1.8 6.  0.1 0.1
 1.1 0.1]
[0.48811457 0.12487024 0.5661169  ... 0.13379021 0.12487024 0.12238952]


In [47]:
# Store the averaged predictions in the submission frame; y_pred is a plain
# array, so the values are assigned by position rather than by index label.
sample_submission["item_cnt_month"] = y_pred

In [48]:
# Inspect the filled submission frame.
sample_submission

Unnamed: 0_level_0,item_cnt_month
ID,Unnamed: 1_level_1
0,0.488115
1,0.124870
2,0.566117
3,0.153360
4,0.124870
...,...
214195,0.333773
214196,0.124870
214197,0.133790
214198,0.124870


In [None]:
# Write the submission to disk. An empty string is not a usable file path, so
# a concrete file name is given; to_csv keeps the ID index as the first column
# by default.
sample_submission.to_csv("submission.csv")

In [49]:
# Collect the first tenth of the rows of each split, in the order
# [X_train_split, X_val_split, y_train_split, y_val_split].
smaller_datasets = []

# Feature tables: slice rows and keep every column.
datasets = [X_train_split, X_val_split]
for dataset in datasets:
    smaller_datasets.append(dataset.iloc[: dataset.shape[0] // 10, :])

# Target series: a single positional slice is enough.
datasets = [y_train_split, y_val_split]
for dataset in datasets:
    smaller_datasets.append(dataset.iloc[: dataset.shape[0] // 10])

In [50]:
# The import was left unfinished (a syntax error). GridSearchCV is assumed to be
# the intended name because a param_grid is defined right after — TODO confirm.
from sklearn.model_selection import GridSearchCV
# Candidate hyper-parameter values for a tree-ensemble search.
param_grid = {
    "max_depth": [5, 10, 20, None],
    # An integer min_samples_split must be at least 2 in scikit-learn, so the
    # former candidate 1 is dropped (it would make every fit fail).
    "min_samples_split": [2, 4, 8],
    "min_samples_leaf": [1, 2, 4, 8],
}

[         0    1    2    3    4    5    6    7    8    9   ...   23   24   25  \
 206514  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 93693   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 98210   0.0  0.0  0.0  0.0  0.0  1.0  1.0  1.0  0.0  0.0  ...  0.0  0.0  1.0   
 83218   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 135467  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  1.0   
 ...     ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
 131218  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  1.0  0.0   
 103055  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  8.0  2.0  3.0   
 153647  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 23673   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 131058  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 
          26   27   28   2