In [1]:
import pycaret as pc
import pandas as pd

In [2]:
# Read the 'Sell' sheet of the HistoryOfSell_Rev02 workbook into a DataFrame.
workbook_path = './HistoryOfSell_Rev02.xlsx'
sell_data = pd.read_excel(workbook_path, sheet_name='Sell')

# Preview the first rows to confirm the layout (a Date column plus A1..A24).
sell_data.head()

Unnamed: 0,Date,A1,A2,A3,A4,A5,A6,A7,A8,A9,...,A15,A16,A17,A18,A19,A20,A21,A22,A23,A24
0,2014-03-01,2,2,0,1,1,0,2,0,1,...,0,0,1,0,1,0,1,1,0,0
1,2014-04-01,6,4,0,2,4,0,1,1,0,...,0,1,0,2,6,3,5,5,2,2
2,2014-05-01,9,3,0,3,6,0,0,1,0,...,0,0,2,0,4,1,3,1,0,0
3,2014-06-01,8,3,2,6,6,0,2,2,4,...,0,0,2,0,6,2,6,3,0,0
4,2014-07-01,11,4,0,6,4,0,2,3,3,...,0,0,2,0,9,0,6,5,0,0


In [3]:
# Summary statistics (count, mean, std, quartiles, extremes) for the A3 column.
sell_data.loc[:, 'A3'].describe()

count    111.000000
mean       2.567568
std        2.258807
min        0.000000
25%        1.000000
50%        2.000000
75%        3.500000
max       14.000000
Name: A3, dtype: float64

In [4]:
import plotly.express as px

# Trailing 12-row (i.e. 12-month, since the data is monthly) mean of A3.
# The original computed this from column A2, so the chart compared A3 against
# another column's rolling mean. With the default min_periods the first 11
# values are NaN because the window is not yet full.
sell_data['A3_Mean'] = sell_data['A3'].rolling(12).mean()

# Overlay the raw monthly A3 series and its rolling mean.
fig = px.line(sell_data, x='Date', y=['A3', 'A3_Mean'], template='plotly_dark')
fig.show()

# A3

In [5]:
import numpy as np

# Derive the calendar features from the Date column with the vectorised `.dt`
# accessor (instead of a Python-level loop over Timestamps), add a running
# index, and keep only the modelling columns.
#
# Building the result in a single expression replaces the former
# `drop([...], inplace=True)` followed by a column re-selection: the drop was
# redundant (the selection already discards every other column) and raised a
# KeyError whenever 'A3_Mean' had not been created by the plotting cell.
sell_data = sell_data.assign(
    Month=sell_data['Date'].dt.month,
    Year=sell_data['Date'].dt.year,
    # 1-based running index of the rows: 1, 2, ..., len(sell_data)
    Series=np.arange(1, len(sell_data) + 1),
)[['Series', 'Year', 'Month', 'A3']]

# check the head of the dataset
sell_data.head()

Unnamed: 0,Series,Year,Month,A3
0,1,2014,3,0
1,2,2014,4,0
2,3,2014,5,0
3,4,2014,6,2
4,5,2014,7,0


In [6]:
# Chronological split on the Year feature: everything before 2022 is used for
# training, 2022 onwards is held out for testing.
year = sell_data['Year']
train = sell_data.loc[year.lt(2022)]
test = sell_data.loc[year.ge(2022)]

# Row/column counts of both partitions.
train.shape, test.shape


((94, 4), (17, 4))

In [7]:
# Bring the PyCaret regression API into scope.
# NOTE(review): the wildcard import is kept because the following cells call
# setup / compare_models / predict_model / finalize_model through it.
from pycaret.regression import *

# Configure the experiment: explicit train/test frames, A3 as the target,
# time-series cross-validation with 20 folds, a transformed target, and a
# fixed session id (presumably the random seed - confirm in the PyCaret docs).
s = setup(
    data=train,
    test_data=test,
    target='A3',
    fold_strategy='timeseries',
    numeric_features=['Year', 'Series'],
    fold=20,
    transform_target=True,
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,A3
2,Target type,Regression
3,Original data shape,"(111, 4)"
4,Transformed data shape,"(111, 4)"
5,Transformed train set shape,"(94, 4)"
6,Transformed test set shape,"(17, 4)"
7,Numeric features,2
8,Preprocess,1
9,Imputation type,simple


In [8]:
# Train and cross-validate the candidate regressors, ranking the leaderboard by MSE;
# `best` is the model that compare_models returns (the top-ranked row of the table below).
best = compare_models(sort = 'MSE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,1.5949,5.6626,2.0858,-0.6578,0.5919,0.4749,0.135
ada,AdaBoost Regressor,1.5871,5.8123,2.0862,-1.0594,0.5858,0.4662,0.238
lasso,Lasso Regression,1.5951,5.9052,2.1,-0.5184,0.5909,0.4897,0.137
en,Elastic Net,1.6703,6.111,2.1528,-0.6263,0.6113,0.5614,0.138
dummy,Dummy Regressor,1.5791,6.1156,2.1225,-0.4723,0.5813,0.4474,0.1775
llar,Lasso Least Angle Regression,1.5791,6.1156,2.1225,-0.4723,0.5813,0.4474,0.141
lightgbm,Light Gradient Boosting Machine,1.6482,6.1811,2.1637,-0.7624,0.5916,0.5192,0.2195
br,Bayesian Ridge,1.6909,6.5388,2.1988,-0.6279,0.6119,0.5648,0.1515
rf,Random Forest Regressor,1.7381,6.6471,2.2136,-2.8178,0.6199,0.601,0.3045
et,Extra Trees Regressor,1.8587,7.1281,2.355,-2.08,0.6686,0.6778,0.299


Processing:   0%|          | 0/77 [00:00<?, ?it/s]

In [9]:
# Score the selected model without passing `data`; per the PyCaret docs this
# evaluates on the hold-out/test set registered in setup() - confirm.
prediction_holdout = predict_model(best)

In [10]:
# generate predictions on the original dataset (training and test rows together)
predictions = predict_model(best, data=sell_data)

# add a date column: one month-start timestamp per row, beginning at the first
# observation; sized from the frame so it cannot fall out of step with the row count
predictions['Date'] = pd.date_range(start='2014-03-01', periods=len(predictions), freq='MS')

# line plot of the actual values against the model output
fig = px.line(predictions, x='Date', y=["A3", "prediction_label"], template='plotly_dark')

# add a vertical rectangle marking the test set; the train/test split is
# Year >= 2022, so the shaded region must start in January 2022
# (it previously started at 2022-07-01 and hid the first six test months)
fig.add_vrect(x0="2022-01-01", x1="2023-06-01", fillcolor="grey", opacity=0.25, line_width=0)

fig.show()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,K Neighbors Regressor,1.511,4.8208,2.1956,0.0466,0.5702,0.486


In [11]:
# Display the scored frame: features, A3, prediction_label and the added Date column
# (the rendered output below elides the middle rows).
predictions

Unnamed: 0,Series,Year,Month,A3,prediction_label,Date
0,1,2014,3,0.000000,0.256472,2014-03-01
1,2,2014,4,0.000000,0.256472,2014-04-01
2,3,2014,5,0.000000,0.256472,2014-05-01
3,4,2014,6,1.152985,0.444188,2014-06-01
4,5,2014,7,0.000000,0.657177,2014-07-01
...,...,...,...,...,...,...
106,107,2023,1,0.714534,1.769477,2023-01-01
107,108,2023,2,1.152985,1.769477,2023-02-01
108,109,2023,3,0.714534,1.769477,2023-03-01
109,110,2023,4,0.000000,1.769477,2023-04-01


In [12]:
# Produce the final model used for forecasting below.
# NOTE(review): per the PyCaret docs finalize_model refits on train + hold-out data - confirm.
final_best = finalize_model(best)

In [13]:
# Twelve month-start timestamps covering June 2023 through May 2024.
future_dates = pd.date_range(start='2023-06-01', end='2024-05-01', freq='MS')

# Feature frame for the forecast horizon, assembled in one step.
# Column order (Month, Year, Series) is kept as before; Series continues the
# running index from 112.
first_series = 112
future_df = pd.DataFrame({
    'Month': future_dates.month.tolist(),
    'Year': future_dates.year.tolist(),
    'Series': np.arange(first_series, first_series + len(future_dates)),
})

future_df.head()

Unnamed: 0,Month,Year,Series
0,6,2023,112
1,7,2023,113
2,8,2023,114
3,9,2023,115
4,10,2023,116


In [14]:
# Score the finalized model on the future feature frame; the forecast is
# returned in the `prediction_label` column alongside the input features.
predictions_future = predict_model(final_best, data=future_df)
# Preview the first forecast rows.
predictions_future.head()

Unnamed: 0,Month,Year,Series,prediction_label
0,6,2023,112,1.486762
1,7,2023,113,1.486762
2,8,2023,114,1.859492
3,9,2023,115,1.859492
4,10,2023,116,1.318885


In [15]:
# Month-start timestamps spanning the observed history plus the forecast horizon.
concat_df_i = pd.date_range(start='2014-03-01', end='2024-05-01', freq='MS')

# Stack the history on top of the forecast rows and index the result by date.
# History rows carry A3 only and forecast rows carry prediction_label only,
# so the two lines in the chart do not overlap.
concat_df = pd.concat([sell_data, predictions_future], axis=0).set_index(concat_df_i)

fig = px.line(
    concat_df,
    x=concat_df.index,
    y=["A3", "prediction_label"],
    template='plotly_dark',
)
fig.show()