## Importation

In [3]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Time series analysis
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Feature engineering
from sklearn.preprocessing import StandardScaler

# Machine learning models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor  
from xgboost import XGBRegressor  

# Model evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Time series-specific models
from statsmodels.tsa.statespace.sarimax import SARIMAX


# Other utilities
from datetime import datetime
import os


# Ignore warnings
import warnings
warnings.filterwarnings('ignore')



In [12]:
data = pd.read_csv('../Assets/train.csv')

### Data Understanding

In [13]:
data

Unnamed: 0,date,store_id,category_id,target,onpromotion,nbr_of_transactions
0,365,store_1,category_24,0.000,0,0.0
1,365,store_1,category_21,0.000,0,0.0
2,365,store_1,category_32,0.000,0,0.0
3,365,store_1,category_18,0.000,0,0.0
4,365,store_1,category_26,0.000,0,0.0
...,...,...,...,...,...,...
2248879,1626,store_9,category_23,539.725,0,2141.0
2248880,1626,store_9,category_20,84.177,0,2141.0
2248881,1626,store_9,category_15,1973.760,3,2141.0
2248882,1626,store_9,category_29,2.000,0,2141.0


In [14]:
data.shape

(2248884, 6)

In [15]:
data.info

<bound method DataFrame.info of          date store_id  category_id    target  onpromotion  \
0         365  store_1  category_24     0.000            0   
1         365  store_1  category_21     0.000            0   
2         365  store_1  category_32     0.000            0   
3         365  store_1  category_18     0.000            0   
4         365  store_1  category_26     0.000            0   
...       ...      ...          ...       ...          ...   
2248879  1626  store_9  category_23   539.725            0   
2248880  1626  store_9  category_20    84.177            0   
2248881  1626  store_9  category_15  1973.760            3   
2248882  1626  store_9  category_29     2.000            0   
2248883  1626  store_9  category_10    27.076            0   

         nbr_of_transactions  
0                        0.0  
1                        0.0  
2                        0.0  
3                        0.0  
4                        0.0  
...                      ...  
224887

In [16]:
data.isna().sum()

date                   0
store_id               0
category_id            0
target                 0
onpromotion            0
nbr_of_transactions    0
dtype: int64

In [17]:
print(f'Columns Names: {list(data.columns)}')

Columns Names: ['date', 'store_id', 'category_id', 'target', 'onpromotion', 'nbr_of_transactions']


In [18]:
test = pd.read_csv('../Assets/test.csv')

In [19]:
test

Unnamed: 0,date,store_id,category_id,onpromotion
0,1627,store_1,category_24,0
1,1627,store_1,category_21,0
2,1627,store_1,category_32,0
3,1627,store_1,category_18,16
4,1627,store_1,category_26,0
...,...,...,...,...
99787,1682,store_9,category_23,0
99788,1682,store_9,category_20,1
99789,1682,store_9,category_15,7
99790,1682,store_9,category_29,8


In [31]:
test.isna().sum()

date           0
store_id       0
category_id    0
onpromotion    0
dtype: int64

In [21]:
str = pd.read_csv('../Assets/stores.csv')

In [22]:
str.head()

Unnamed: 0,store_id,city,type,cluster
0,store_1,0,0,0
1,store_2,0,0,0
2,store_3,0,0,1
3,store_4,0,0,2
4,store_5,1,0,3


In [23]:
str.info

<bound method DataFrame.info of     store_id  city  type  cluster
0    store_1     0     0        0
1    store_2     0     0        0
2    store_3     0     0        1
3    store_4     0     0        2
4    store_5     1     0        3
5    store_6     0     0        0
6    store_7     0     0        1
7    store_8     0     0        1
8    store_9     0     1        4
9   store_10     0     2        5
10  store_11     2     1        4
11  store_12     3     2        5
12  store_13     3     2        5
13  store_14     4     2        6
14  store_15     5     2        5
15  store_16     1     2        7
16  store_17     0     2        8
17  store_18     0     1        9
18  store_19     6     2        5
19  store_20     0     1        4
20  store_21     1     1        4
21  store_22     7     2        6
22  store_23     8     0        2
23  store_24     9     0       10
24  store_25    10     0       10
25  store_26     9     0       11
26  store_27    11     0       10
27  store_28    

In [25]:
print(f'Columns Names: {list(str.columns)}')

Columns Names: ['store_id', 'city', 'type', 'cluster']


In [27]:
dates = pd.read_csv('../Assets/dates.csv')

In [29]:
dates

Unnamed: 0,date,year,month,dayofmonth,dayofweek,dayofyear,weekofyear,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,year_weekofyear
0,365,1,1,1,2,1,1,1,True,False,True,False,True,False,101
1,366,1,1,2,3,2,1,1,False,False,False,False,False,False,101
2,367,1,1,3,4,3,1,1,False,False,False,False,False,False,101
3,368,1,1,4,5,4,1,1,False,False,False,False,False,False,101
4,369,1,1,5,6,5,1,1,False,False,False,False,False,False,101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1315,1680,4,8,11,4,223,32,3,False,False,False,False,False,False,432
1316,1681,4,8,12,5,224,32,3,False,False,False,False,False,False,432
1317,1682,4,8,13,6,225,32,3,False,False,False,False,False,False,432
1318,1683,4,8,14,0,226,33,3,False,False,False,False,False,False,433


In [30]:
print(f'Columns Names: {list(dates.columns)}')

Columns Names: ['date', 'year', 'month', 'dayofmonth', 'dayofweek', 'dayofyear', 'weekofyear', 'quarter', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'is_year_start', 'is_year_end', 'year_weekofyear']


In [39]:
dates.isna().sum()

date                0
year                0
month               0
dayofmonth          0
dayofweek           0
dayofyear           0
weekofyear          0
quarter             0
is_month_start      0
is_month_end        0
is_quarter_start    0
is_quarter_end      0
is_year_start       0
is_year_end         0
year_weekofyear     0
dtype: int64

In [33]:
holiday= pd.read_csv('../Assets/holidays.csv')

In [35]:
holiday.head()

Unnamed: 0,date,type
0,1,0
1,5,4
2,12,4
3,42,0
4,43,0


In [36]:
print(f'Columns Names: {list(holiday.columns)}')

Columns Names: ['date', 'type']


In [38]:
holiday.isna().sum()

date    0
type    0
dtype: int64