In [1]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA

In [2]:
with open("pipeline_preprocessing.pkl", "rb") as file:
    pipeline_preprocessing  = pickle.load(file )

In [3]:
df_train = pd.read_csv('data/train.csv')

In [4]:
df_train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [5]:
X = df_train
y = df_train.sales

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


# baseline model : simple linear regression

In [25]:
pipe_linear = Pipeline([
    ('pipeline_preprocessing', pipeline_preprocessing),
    ('linear_regression', LinearRegression())
], 
    verbose=False)

In [26]:
pipe_linear.fit(X_train,y_train)

Pipeline(steps=[('pipeline_preprocessing',
                 Pipeline(steps=[('transformer_preprocessing',
                                  ColumnTransformer(transformers=[('preprocessing_tranformer',
                                                                   PreprocessingTranformer(),
                                                                   ['date']),
                                                                  ('one_hot_encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False),
                                                                   ['family'])])),
                                 ('standard_scaler', StandardScaler())])),
                ('linear_regression', LinearRegression())])

In [27]:
y_pred = pipe_linear.predict(X_val)
y_pred_positives = np.amax(np.vstack((y_pred, np.zeros(y_pred.size))),0)

In [28]:
rmsle = np.sqrt(mean_squared_log_error(y_val, y_pred_positives))
print(f"RMSLE : {rmsle:.2f}")

RMSLE : 2.40


# ARMA Model

In [6]:
X_train_proc = pd.DataFrame(
    pipeline_preprocessing.fit_transform(X_train), 
    columns=pipeline_preprocessing['transformer_preprocessing'].get_feature_names_out()
)

In [7]:
X_train_proc

Unnamed: 0,preprocessing_tranformer__year,preprocessing_tranformer__sin_month,preprocessing_tranformer__cos_month,preprocessing_tranformer__sin_day_of_week,preprocessing_tranformer__cos_day_of_week,one_hot_encoder__family_AUTOMOTIVE,one_hot_encoder__family_BABY CARE,one_hot_encoder__family_BEAUTY,one_hot_encoder__family_BEVERAGES,one_hot_encoder__family_BOOKS,...,one_hot_encoder__family_MAGAZINES,one_hot_encoder__family_MEATS,one_hot_encoder__family_PERSONAL CARE,one_hot_encoder__family_PET SUPPLIES,one_hot_encoder__family_PLAYERS AND ELECTRONICS,one_hot_encoder__family_POULTRY,one_hot_encoder__family_PREPARED FOODS,one_hot_encoder__family_PRODUCE,one_hot_encoder__family_SCHOOL AND OFFICE SUPPLIES,one_hot_encoder__family_SEAFOOD
0,-0.622903,1.356491,0.060327,-1.323361,-0.868793,-0.176825,-0.176844,-0.176751,-0.176551,-0.176907,...,-0.176781,-0.176695,-0.176802,-0.176701,-0.176541,-0.176642,-0.177098,-0.176915,-0.177037,-0.176782
1,0.863126,-1.486979,0.060327,1.321193,-0.868793,-0.176825,-0.176844,-0.176751,-0.176551,-0.176907,...,-0.176781,-0.176695,5.656038,-0.176701,-0.176541,-0.176642,-0.177098,-0.176915,-0.177037,-0.176782
2,1.606141,-0.065244,-1.351987,-1.323361,-0.868793,-0.176825,5.654710,-0.176751,-0.176551,-0.176907,...,-0.176781,-0.176695,-0.176802,-0.176701,-0.176541,-0.176642,-0.177098,-0.176915,-0.177037,-0.176782
3,-0.622903,1.166015,-0.645830,-0.001084,1.153472,-0.176825,-0.176844,-0.176751,5.664070,-0.176907,...,-0.176781,-0.176695,-0.176802,-0.176701,-0.176541,-0.176642,-0.177098,-0.176915,-0.177037,-0.176782
4,1.606141,-0.065244,-1.351987,-1.323361,-0.868793,-0.176825,-0.176844,-0.176751,-0.176551,-0.176907,...,-0.176781,-0.176695,-0.176802,-0.176701,5.664392,-0.176642,-0.177098,-0.176915,-0.177037,-0.176782
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2100616,-0.622903,-0.065244,1.472641,1.321193,-0.868793,-0.176825,-0.176844,-0.176751,-0.176551,-0.176907,...,5.656725,-0.176695,-0.176802,-0.176701,-0.176541,-0.176642,-0.177098,-0.176915,-0.177037,-0.176782
2100617,-0.622903,-0.065244,1.472641,1.321193,-0.868793,-0.176825,-0.176844,-0.176751,-0.176551,-0.176907,...,-0.176781,-0.176695,-0.176802,-0.176701,-0.176541,-0.176642,-0.177098,-0.176915,-0.177037,-0.176782
2100618,0.863126,-0.776112,1.283426,1.321193,0.479384,-0.176825,-0.176844,-0.176751,-0.176551,-0.176907,...,-0.176781,-0.176695,-0.176802,-0.176701,-0.176541,-0.176642,5.646580,-0.176915,-0.177037,-0.176782
2100619,-0.622903,1.166015,0.766484,-0.001084,-1.542881,-0.176825,-0.176844,-0.176751,-0.176551,-0.176907,...,-0.176781,-0.176695,-0.176802,-0.176701,-0.176541,-0.176642,-0.177098,5.652423,-0.177037,-0.176782


In [34]:
arma_model = ARIMA(X_train_proc, freq='D', order=(6,0,1))

ValueError: Frequency provided without associated index.

In [6]:
df_test = pd.read_csv('data/test.csv')

In [7]:
df_test_proc = pd.DataFrame(
    pipeline_preprocessing.fit_transform(df_test), 
    columns=pipeline_preprocessing['transformer_preprocessing'].get_feature_names_out()
)

df_test_proc.head()

Unnamed: 0,preprocessing_tranformer__year,preprocessing_tranformer__sin_month,preprocessing_tranformer__cos_month,preprocessing_tranformer__sin_day_of_week,preprocessing_tranformer__cos_day_of_week,one_hot_encoder__family_AUTOMOTIVE,one_hot_encoder__family_BABY CARE,one_hot_encoder__family_BEAUTY,one_hot_encoder__family_BEVERAGES,one_hot_encoder__family_BOOKS,...,one_hot_encoder__family_MAGAZINES,one_hot_encoder__family_MEATS,one_hot_encoder__family_PERSONAL CARE,one_hot_encoder__family_PET SUPPLIES,one_hot_encoder__family_PLAYERS AND ELECTRONICS,one_hot_encoder__family_POULTRY,one_hot_encoder__family_PREPARED FOODS,one_hot_encoder__family_PRODUCE,one_hot_encoder__family_SCHOOL AND OFFICE SUPPLIES,one_hot_encoder__family_SEAFOOD
0,0.0,3.232969e-13,-3.330669e-16,1.254363,-0.699287,5.656854,-0.176777,-0.176777,-0.176777,-0.176777,...,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777
1,0.0,3.232969e-13,-3.330669e-16,1.254363,-0.699287,-0.176777,5.656854,-0.176777,-0.176777,-0.176777,...,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777
2,0.0,3.232969e-13,-3.330669e-16,1.254363,-0.699287,-0.176777,-0.176777,5.656854,-0.176777,-0.176777,...,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777
3,0.0,3.232969e-13,-3.330669e-16,1.254363,-0.699287,-0.176777,-0.176777,-0.176777,5.656854,-0.176777,...,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777
4,0.0,3.232969e-13,-3.330669e-16,1.254363,-0.699287,-0.176777,-0.176777,-0.176777,-0.176777,5.656854,...,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777,-0.176777
