[0] import libs

In [90]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import warnings
# NOTE(review): this ignores every warning category for the rest of the
# session, so library deprecation / data-conversion warnings stay hidden.
warnings.filterwarnings('ignore')

# CSV read by the data-loading cell below.
train_file = "data/fc_1302/fc_1302_train.csv"
# Not referenced in the visible part of the notebook — presumably consumed by
# a later section; TODO confirm.
payload_file = "data/fc_1302/fc_1302_payload.csv"

[1] load the training data set

In [79]:
# Load the training CSV indexed by BKG_DATE and add the target column "y":
# ITEM_CD_COUNT shifted up by one row, so each row is paired with the next
# row's count (the final row gets NaN because it has no successor).
df = (
    pd.read_csv(train_file, parse_dates=["BKG_DATE"])
    .set_index("BKG_DATE")
    .assign(y=lambda frame: frame["ITEM_CD_COUNT"].shift(-1))
)

df

Unnamed: 0_level_0,INV_AMT,INV_AMT_DIFF,ITEM_AMT,ITEM_AMT_DIFF,weekday,holiday,holiday_diff,ITEM_CD_COUNT,y
BKG_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-03-01,1.0,1.0,1.0,1.0,1,1,0,2190,3822.0
2021-03-02,1.0,1.0,1.0,1.0,0,0,0,3822,1844.0
2021-03-03,1.0,1.0,1.0,1.0,0,0,0,1844,1698.0
2021-03-04,1.0,1.0,1.0,1.0,0,0,0,1698,1501.0
2021-03-05,1.0,1.0,1.0,1.0,0,0,0,1501,1159.0
...,...,...,...,...,...,...,...,...,...
2021-05-27,1.0,1.0,1.0,1.0,0,0,0,2568,2053.0
2021-05-28,1.0,1.0,1.0,1.0,0,0,0,2053,1695.0
2021-05-29,1.0,1.0,1.0,1.0,0,0,0,1695,3588.0
2021-05-30,1.0,1.0,1.0,1.0,0,0,0,3588,3295.0


- train: 0.8 // test: 0.2 (chronological split: the last 20% of rows are held out as the test set)

In [91]:
# Chronological hold-out: the last 20% of rows form the test set and the rest
# the training set (row order is kept, nothing is shuffled).
size = int(df.shape[0] * 0.2)

# Slice on an explicit boundary instead of `df[:-size]` / `df[-size:]`: when
# size == 0 those negative slices yield an empty training set and put every
# row into the test set.
split_at = df.shape[0] - size
df_train = df.iloc[:split_at]
df_test = df.iloc[split_at:]

print(f"shape(df_train): {df_train.shape}")
print(f"shape(df_test): {df_test.shape}")

shape(df_train): (74, 9)
shape(df_test): (18, 9)


[2] build models

- Decision Tree Regression
- Gradient Boosting Regressor
- Linear regression model

In [81]:
# Features are every column except the current count and the target.
non_feature_cols = ["ITEM_CD_COUNT", "y"]

X_train = df_train.drop(columns=non_feature_cols)
# Target as a column vector of shape (n_samples, 1).
y_train = df_train["y"].values.reshape(-1, 1)

# Remove test rows that have no target ("y" is NaN on the last row because it
# was built with shift(-1)). Filtering on NaN instead of dropping "the last
# row" keeps this cell idempotent: re-running it does not strip one more
# valid row from df_test each time.
df_test = df_test.dropna(subset=["y"])

X_test = df_test.drop(columns=non_feature_cols)

# Separate copy that collects each model's predictions as extra columns.
df_result = df_test.copy()

In [82]:
# Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor

# fit() returns the fitted estimator, so construction and training are chained.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)



In [83]:
# Attach the decision-tree predictions to the results frame and display it.
df_result = df_result.assign(dt_pred=dt_pred)
df_result

Unnamed: 0_level_0,INV_AMT,INV_AMT_DIFF,ITEM_AMT,ITEM_AMT_DIFF,weekday,holiday,holiday_diff,ITEM_CD_COUNT,y,dt_pred
BKG_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-05-14,1.0,1.0,1.0,1.0,0,0,0,2014,1829.0,2652.473684
2021-05-15,1.0,1.0,1.0,1.0,0,0,0,1829,10773.0,2652.473684
2021-05-16,2.0,1.0,2.0,1.0,0,0,0,10773,23466.0,3999.0
2021-05-17,2.0,2.0,2.0,2.0,1,0,0,23466,6529.0,6568.9
2021-05-18,1.0,2.0,1.0,2.0,0,0,1,6529,3636.0,2094.0
2021-05-19,1.0,1.0,1.0,1.0,0,1,0,3636,5437.0,2246.0
2021-05-20,1.0,1.0,1.0,1.0,0,0,0,5437,4284.0,2652.473684
2021-05-21,1.0,1.0,1.0,1.0,0,0,0,4284,3060.0,2652.473684
2021-05-22,1.0,1.0,1.0,1.0,0,0,0,3060,2949.0,2652.473684
2021-05-23,1.0,1.0,1.0,1.0,0,0,0,2949,6211.0,2652.473684


In [84]:
def mape(y_obs, y_pred):
    """Return the mean absolute percentage error, in percent, rounded to 2 decimals.

    Both inputs are converted to flat float arrays and compared element by
    element, in order. Flattening prevents a 1-D array and an (n, 1) column
    vector from being broadcast into an (n, n) matrix, which would silently
    produce a wrong score; it also lets plain lists be passed.

    Args:
        y_obs: Observed values (array-like); used as the denominator.
        y_pred: Predicted values (array-like) with as many elements as y_obs.

    Returns:
        mean(|(y_obs - y_pred) / y_obs|) * 100, rounded to two decimal places.
        A zero in y_obs makes its ratio infinite (NaN for 0/0), and a NaN in
        either input propagates to the result.

    Raises:
        ValueError: Raised by NumPy when the flattened inputs have
            incompatible lengths.
    """
    observed = np.asarray(y_obs, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    return round(np.mean(np.abs((observed - predicted) / observed)) * 100, 2)


In [85]:
# Score the decision-tree predictions against the observed targets.
dt_mape = mape(y_obs=df_result["y"], y_pred=df_result["dt_pred"])
print(f'Decision Tree: {dt_mape}%')

Decision Tree: 40.56%


In [86]:
# Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor

gbr_model = GradientBoostingRegressor(random_state=42)
# GradientBoostingRegressor expects a 1-D target. y_train is a column vector,
# which scikit-learn flattens itself while emitting a DataConversionWarning;
# flattening it here hands the estimator the expected shape (same data).
gbr_model.fit(X_train, np.ravel(y_train))
gbr_pred = gbr_model.predict(X_test)

# Store the predictions next to the observed targets for scoring.
df_result["gbr_pred"] = gbr_pred

In [87]:
# Score the gradient-boosting predictions against the observed targets.
gbr_mape = mape(y_obs=df_result["y"], y_pred=df_result["gbr_pred"])
print(f'Gradient Boosting Regressor: {gbr_mape}%')

Gradient Boosting Regressor: 40.05%


In [88]:
# Linear regression model
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression(fit_intercept = True)
linear_model.fit(X_train, y_train)
linear_pred = linear_model.predict(X_test)

# y_train is a column vector, so predict() returns an (n, 1) array. Flatten it
# explicitly so the result column is assigned from a plain 1-D array instead
# of relying on pandas accepting a single-column 2-D array.
df_result["linear_pred"] = np.ravel(linear_pred)

In [89]:
# Score the linear-regression predictions against the observed targets.
linear_mape = mape(y_obs=df_result["y"], y_pred=df_result["linear_pred"])
print(f'Linear regression model: {linear_mape}%')

Linear regression model: 44.27%


## 2. Train model using SageMaker