In [2]:
import pandas as pd 
dataset = pd.read_csv('catfish_sales_1986_2001.csv', parse_dates= [0])
dataset.head()

#print the dataset table

Unnamed: 0,Date,Total
0,1986-01-01,9034
1,1986-02-01,9596
2,1986-03-01,10558
3,1986-04-01,9002
4,1986-05-01,9239


In [7]:
#line chart
import plotly.express as px
mask = (dataset['Date'] <= '2000-01-01')
dataset = dataset.loc[mask]

fig = px.line(dataset, x ='Date', y='Total', title='catfish_sales_1986_2001')
fig.show()

#print the dataset in a graph 


In [9]:
#box plot 
# DOUBLE check if there any anomalies
import plotly.express as px 

fig = px.box(dataset, x='Date', y= 'Total', title= "catfish_sales_1986_2001")
fig.show()


In [14]:
#re indexing dataset
datetime_series = pd.to_datetime(dataset['Date'])
datetime_index = pd.DatetimeIndex(datetime_series.values)
period_index = pd.PeriodIndex(datetime_index, freq = 'M')
dataset = dataset.set_index(period_index)
dataset.drop('Date', axis=1, inplace = True)
dataset.head()

Unnamed: 0,Total
1986-01,9034
1986-02,9596
1986-03,10558
1986-04,9002
1986-05,9239


In [30]:
import plotly.graph_objects as go 
from sktime.forecasting.model_selection import temporal_train_test_split
y_train, y_test = temporal_train_test_split(dataset, test_size = 12)
fig = go.Figure()
fig.add_trace(go.Scatter(name = "Train DataSet", x=y_train.index.astype(str), y=y_train["Total"]))
fig.add_trace(go.Scatter(name = "Test DataSet", x=y_test.index.astype(str), y=y_test['Total']))
fig.update_layout(title = "Splitted dataset")

fig.show()

In [32]:
#price prediction using SARIMAX model
from statsmodels.tsa.statespace.sarimax import SARIMAX
model = SARIMAX(y_train['Total'], order= (1,1,1), seasonal_order = (1,0,1,12))
model_fit = model.fit()
y_pred = model_fit.predict(start = len(y_train), end= len(y_train)+11, exog = None, dynamic= True)


Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.


Non-invertible starting MA parameters found. Using zeros as starting parameters.



In [37]:
fig= go.Figure()
fig.add_trace(go.Scatter(name = "Train Dataset", x=y_train.index.astype(str), y=y_train['Total']))
fig.add_trace(go.Scatter(name= "Test Dataset", x=y_test.index.astype(str), y=y_test["Total"]))
fig.add_trace(go.Scatter(name = "Prediction", x=y_pred.index.astype(str), y=y_pred.values))
fig.update_layout(title="Predicted vs actual values")

fig.show()

In [39]:
 #Error estimation
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
mae = mean_absolute_error(list(y_test['Total']), list(y_pred))
mape = mean_absolute_percentage_error(list(y_test['Total']), list(y_pred))
print('MAE:%.3f' %mae)
print('MAPE: %.3f' %mape)

MAE:1010.310
MAPE: 0.039


In [45]:
#Breaking Dataset
from datetime import datetime
#cloning good dataset
broken_dataset = dataset.copy()
# breaking clonned dataset with random anomaly
broken_dataset.loc[datetime(1998,12,1),['Total']]= 1000


#Data visualization
import plotly.express as px
#Plotting DataFrame 
fig = px.line(
    broken_dataset,
    x= broken_dataset.index.astype(str),
    y= broken_dataset['Total']
)
fig.update_layout(
    yaxis_title = 'Total',
    xaxis_title = 'Date',
    title = 'Catfish sales 1986-2000 (broken)'
)
fig.show()


In [47]:
#box plot to see the outlier

import plotly.express as px
fig = px.box(broken_dataset, y="Total")
fig.update_layout(title = 'Catfish sales 1986-2000 (broken)')
fig.show()

In [49]:
#splotting dataset
import plotly.graph_objects as go
from sktime.forecasting.model_selection import temporal_train_test_split
#splitting dataset (test dataset size is last 12 periods/months)
y_train, y_test = temporal_train_test_split(broken_dataset, test_size=12)
#visualizing train/test dataset
fig = go.Figure()
fig.add_trace(go.Scatter( name= "Train Dataset", x=y_train.index.astype(str), y=y_train["Total"]))
fig.add_trace(go.Scatter(name = "Test Dataset", x=y_test.index.astype(str), y= y_test['Total']))
fig.update_layout(title = "Splitted dataset")
fig.show()

In [51]:
#SARIMA ALGORITHM
#price predicting 
from statsmodels.tsa.statespace.sarimax import SARIMAX
model = SARIMAX(y_train["Total"],order = (1,1,1), seasonal_order=(1,0,1,12))
model_fit= model.fit()
y_pred = model_fit.predict(start = len(y_train), end = len(y_train)+11,exog = None, dynamic= True) 


Non-stationary starting seasonal autoregressive Using zeros as starting parameters.



In [63]:
#visualization 
import plotly.graph_objects as go 
fig = go.Figure ()
fig.add_trace(go.Scatter(name = "Train Dataset", x= y_train.index.astype(str), y= y_train['Total']))
fig.add_trace(go.Scatter(name = "Test Dataset", x= y_test.index.astype(str), y= y_test['Total']))
fig.add_trace(go.Scatter(name = "Prediction", x= y_pred.index.astype(str), y= y_pred.values))

fig.update_layout(
    yaxis_title = 'Total',
    xaxis_title = 'Date',
    title = 'Catfish sales 1986-2000 incorrect predictions'
    )
fig.show()


In [2]:
#error estimation 
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
mea = mean_absolute_error(list(y_test['Total']), list(y_pred))
mape = mean_absolute_percentage_error(list(y_test['Total']), list(y_pred))
print('MAE: %.3f' %mae)
print('MAPE: %.3f' %mape)


MAE: 1010.310
MAPE: 0.345
