In [None]:
!unzip /content/rossmann-store-sales.zip

<h2>Importing the required libraries</h2>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.metrics import r2_score
from tqdm import tqdm
import datetime
import warnings
warnings.filterwarnings('ignore')
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.statespace.sarimax import SARIMAXResults

<h2>Loading the data</h2>

In [None]:
# Read the three tables (expected in the current working directory) in one
# pass and unpack them in the order training / test / store metadata.
train, test, store = (
  pd.read_csv(csv_name)
  for csv_name in ('train.csv', 'test.csv', 'store.csv')
)

In [None]:
# Column names, non-null counts and dtypes of the training table
train.info()

In [None]:
# Column names, non-null counts and dtypes of the store table
store.info()

In [None]:
# First rows of the training table
train.head()

In [None]:
# First rows of the store table
store.head()

In [None]:
# The time-series analysis is done for a single store: keep only the rows of store 1
store_1 = train.loc[train['Store'] == 1]

In [None]:
# Put the observations in ascending date order. Sorting by the date column
# (instead of reversing the row order) does not rely on how train.csv happens
# to be ordered and gives the same result when this cell is run again.
# Assumes 'Date' holds ISO-formatted (YYYY-MM-DD) strings or datetimes, so that
# the sort order is chronological - TODO confirm against train.info().
store_1 = store_1.sort_values('Date')

In [None]:
# Number the rows 1..N in their current (chronological) order.
# `assign` returns a new frame rather than writing a column into a slice of
# `train`, which pandas can flag with SettingWithCopyWarning (silenced in this
# notebook by the global warnings filter). It also keeps the cell re-runnable.
store_1 = store_1.assign(day_number=np.arange(1, len(store_1) + 1))
store_1

In [None]:
# Regressor and target for the trend checks below:
#   X - the day index, kept as a one-column DataFrame (2-D input for the estimators)
#   y - the sales of the same rows
X, y = store_1[['day_number']], store_1['Sales']

In [None]:
def plot_trend(X,y):
  '''
  Scatter-plot `y` against `X` on a wide figure and show it.

  X : values for the horizontal axis (labelled 'per day')
  y : values for the vertical axis (labelled 'Sales')

  Nothing is returned; labels and title are fixed strings.
  '''
  _, ax = plt.subplots(figsize=(20,5))
  ax.scatter(X,y,color='darkblue')
  ax.set_xlabel('per day',fontsize=20)
  ax.set_ylabel('Sales',fontsize=20)
  ax.set_title('Sales over days',fontsize=20)
  ax.grid(True)
  plt.show()

In [None]:
# Sales of store 1 against the day index
plot_trend(X,y)

<h2>Checking a linear regression fit</h2>

In [None]:
# Fit a straight line to the sales as a function of the day index and
# overlay the fitted line on the observations.
lr = LinearRegression()
lr.fit(X,y)
lr_pred = lr.predict(X)

plt.figure(figsize=(23,5))
# Each artist carries its own label. Passing a bare list to plt.legend() pairs
# labels with artists by position, which does not reliably match the drawing
# order here (the scatter of true values is drawn first).
plt.scatter(X,y,color='darkblue',label='True')
plt.plot(X,lr_pred,color='red',label='Predicted')
plt.legend()
plt.xlabel('per day',fontsize=20)
plt.ylabel('Sales',fontsize=20)
plt.title('Lr predicted vs truth',fontsize=20)
plt.grid(True)
plt.show()

In [None]:
# Count the rows of store 1 that fall on a Sunday (DayOfWeek == 7) and have
# zero sales.
# Both conditions are combined into a single mask. Chaining two boolean
# selections would apply a mask built on the full frame to an already filtered
# frame, which pandas has to re-align and warns about.
sunday_without_sales = (store_1['DayOfWeek']==7) & (store_1['Sales']==0)
sale_when_store_was_closed = store_1[sunday_without_sales].shape[0]
print(
      f'''Sales has to be 0 when the day is sunday, 
          so total rows with 0 sales and week day 7 : 
          {sale_when_store_was_closed}
      '''
    )

*Removing the rows whose week day is 7 (Sunday).*

In [None]:
# Drop every Sunday row (DayOfWeek == 7). The filter looks only at the week
# day, not at the sales value.
# NOTE(review): X and y were extracted before this filter and still contain
# the Sunday rows; only code reading store_1 from here on sees the filtered data.
store_1 = store_1.loc[store_1['DayOfWeek'].ne(7)]
store_1.shape

In [None]:
#This function compares the original and predicted values.
def compare_with_truth(X,y,y_pred,degree):
  '''
  Draw observed values as points and predictions as a line on one figure.

  X      : horizontal-axis values shared by `y` and `y_pred`
  y      : observed values, drawn as dark blue points
  y_pred : predicted values, drawn as a red line
  degree : text placed after 'Degree = ' in the figure title

  Nothing is returned; the figure is shown immediately.
  '''
  plt.figure(figsize=(23,5))
  # Labels are attached to the artists themselves: a bare list passed to
  # plt.legend() is matched to artists by position, which does not reliably
  # follow the drawing order (points first, line second).
  plt.scatter(X,y,color='darkblue',label='True')
  plt.plot(X,y_pred,color='red',label='Predicted')
  plt.legend()
  plt.xlabel('per day',fontsize=20)
  plt.ylabel('Sales',fontsize=20)
  plt.title(f'Degree = {degree}',fontsize=20)
  plt.grid(True)
  plt.show()

<h1>Experimenting with lag</h1>

In [None]:
# Day-index frame shifted down by one row (the first row becomes NaN).
# NOTE(review): lag_1 is not read by any later cell visible in this notebook.
lag_1 = X.shift(1)

In [None]:
# Expand the day index into polynomial terms up to degree n.
# With PolynomialFeatures' default include_bias=True, column 0 of X_poly is the
# constant term and column i holds day_number**i.
# NOTE(review): `n` is reassigned by the moving-average loop further down, so
# its value depends on which cell ran last.
n = 11
poly_lr = PolynomialFeatures(degree=n)
X_poly = poly_lr.fit_transform(X)
X_poly.shape

In [None]:
#Checking how well each single power of the day index explains the sales
# Column 0 of X_poly is the constant term, so the powers start at column 1;
# the upper bound follows the width of X_poly instead of a hard-coded 12.
for i in range(1,X_poly.shape[1]):
  power_i = X_poly[:,i].reshape(-1,1)
  lr = LinearRegression()
  lr.fit(power_i,y)
  lr_pred = lr.predict(power_i)
  # r2_score takes (y_true, y_pred); the score is not symmetric, so the
  # observed values have to come first.
  r_square = r2_score(y,lr_pred)
  # compare_with_truth already prefixes the title with 'Degree = '
  compare_with_truth(X_poly[:,i],y,lr_pred,f'{i} with R2 score : {r_square}')

In [None]:
#Checking for the lag dependency: regress each row's sales on the previous row's sales
lr = LinearRegression()
# The first row has no predecessor, so it is dropped from both series by
# position; this keeps regressor and target aligned row for row.
lagged_sales = y.shift(1).iloc[1:]
current_sales = y.iloc[1:]
x = np.array(lagged_sales).reshape(-1,1)
lr.fit(x,current_sales)
lr_pred = lr.predict(x)
# r2_score takes (y_true, y_pred); the score is not symmetric in its arguments
r_square = r2_score(current_sales,lr_pred)
compare_with_truth(lagged_sales,current_sales,lr_pred,f'lag = {1} with R2 score : {r_square}')

In [None]:
def getting_the_best_window(x,y_x,y_pred,n):

  '''
  Plot a moving average against the observed values and score it.

  A moving average could be a useful feature, but its window length has to
  be chosen; calling this for several windows lets the fits be compared.

  x      : horizontal-axis values (one per observation)
  y_x    : observed values, drawn as dark blue points
  y_pred : moving-average values aligned with `y_x`, drawn as a red line
  n      : window length, used only in the figure title

  Nothing is returned; the R2 score of `y_pred` against `y_x` is shown in
  the title.
  '''
  # r2_score takes (y_true, y_pred); the score is not symmetric in its arguments
  r_square = r2_score(y_x,y_pred)
  title = f'Moving average with window = {n} and with R2 score : {r_square}'
  plt.figure(figsize=(23,5))
  # Labels are attached to the artists themselves: a bare list passed to
  # plt.legend() is matched by position and does not reliably follow the
  # drawing order.
  plt.scatter(x,y_x,color='darkblue',label='True')
  # Plot the series that was passed in and scored, not a notebook-level global
  plt.plot(x,y_pred,color='red',label='Predicted')
  plt.legend()
  plt.xlabel('per day',fontsize=20)
  plt.ylabel('Sales',fontsize=20)
  plt.title(title,fontsize=20)
  plt.grid(True)
  plt.show()

In [None]:
# Compare moving averages of the sales for window lengths 3 to 9.
# The names x, y_x, MA and n stay notebook-level because other cells read them.
y_x = store_1['Sales']
for n in range(3,10):
  # The first n-1 rolling means are undefined, so every series starts at
  # position n-1 to stay aligned.
  MA = y_x.rolling(window=n).mean().iloc[n-1:]
  x = store_1['day_number'].iloc[n-1:]
  getting_the_best_window(x,y_x[n-1:],MA,n)

<h2>Model Training</h2>

In [None]:
#Training the SARIMA model
# Hyper-parameter grids (one candidate each at the moment). Every combination
# is fitted on the training slice and scored on the hold-out slice; the same
# (P,D,G) triple is used for the non-seasonal and the seasonal part.
p = [1] #AR order : number of lagged observations
d = [1] #Integration : order of differencing
g = [3] #MA order : number of lagged forecast errors (not a rolling-window length)
# Seasonal period = number of observations per seasonal cycle
# (e.g. 12 for monthly data or 4 for quarterly data with a yearly cycle).
# NOTE(review): store_1 holds daily rows with Sundays removed, so a weekly
# cycle would be 6 observations - confirm that 12 is the intended period.
S = [12]
t_start = datetime.datetime.now()

metrics = {}
x_tr = store_1['Sales'].iloc[0:700]
x_test = store_1['Sales'].iloc[700:800]
# The forecast positions directly follow the training sample; deriving them
# from the slices keeps predictions and x_test the same length.
first_step = len(x_tr)
last_step = first_step + len(x_test) - 1

best_score, best_fit, best_predictions = None, None, None
for P in tqdm(p,colour='#db7501'):
  for D in d:
    for G in g:
      for s in S:

        s_mod = SARIMAX(
                        x_tr, 
                        order=(P,D,G), 
                        seasonal_order=(P,D,G,s),
                        enforce_stationarity=False,
                        enforce_invertibility=False
                        )
        s_fit = s_mod.fit(disp=0)
        predictions = s_fit.predict(start=first_step, end=last_step, exog=None, dynamic=False)

        r_square = r2_score(x_test,predictions)
        metrics[f'({P},{D},{G},{s})'] = r_square
        print(f'params : ({P},{D},{G},{s}), score : {r_square}')
        if best_fit is None or r_square > best_score:
          best_score, best_fit, best_predictions = r_square, s_fit, predictions

# Later cells plot `predictions`, quote `r_square` and save `s_fit`. Point
# these names at the best-scoring combination rather than at whichever
# combination happened to be fitted last.
if best_fit is not None:
  s_fit, predictions, r_square = best_fit, best_predictions, best_score
t_end = datetime.datetime.now()

print(f'Total time taken for training : {t_end - t_start}')

<h2>Model Testing</h2>

In [None]:
# Hold-out comparison: observed sales (line plus markers) against the SARIMA
# forecast. The title names the model that is actually plotted; the previous
# text was the moving-average title and quoted a leftover window length.
title = f'SARIMA forecast vs truth with R2 score : {r_square}'
test_days = store_1['day_number'].iloc[700:800]
plt.figure(figsize=(23,5))
# Only the two lines carry a label, so the legend shows exactly one entry per
# series and each entry is tied to the right artist.
plt.plot(test_days,x_test,color='darkblue',label='True')
plt.scatter(test_days,x_test,color='blue',s=100)
plt.plot(test_days,predictions,color='red',label='Predicted')
plt.legend()
plt.xlabel('per day',fontsize=20)
plt.ylabel('Sales',fontsize=20)
plt.title(title,fontsize=20)
plt.grid(True)
plt.show()

<h2>Saving the model</h2>

In [None]:
# Write the Sales column of store_1 (Sunday rows already removed above) to
# sales.csv, without the index.
store_1['Sales'].to_csv('sales.csv',index=False)

In [None]:
# Persist the fitted results object held in s_fit to model.pkl.
# NOTE(review): the file is presumably a pickle (per its extension and the
# statsmodels save() API) - only load it back from a trusted location.
s_fit.save('model.pkl')