In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by the plotting and modelling libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Location of the source workbook. Kept in one named constant so the notebook can be
# pointed at another copy of the file without editing the loading call itself.
DATA_PATH = '/content/Airlines+Data.xlsx'
df = pd.read_excel(DATA_PATH)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Summarise the frame's columns (dtypes and non-null counts) via DataFrame.info().
df.info()

In [None]:
# The 'Month' column holds the observation timestamp, so it is relabelled 'Date'.
df = df.rename(columns={'Month': 'Date'})

In [None]:
# (rows, columns) of the frame.
df.shape

In [None]:
# Plot the raw passenger counts. The frame still carries the default positional index
# here, so the x axis is the observation number rather than a date.
plt.plot(df['Passengers'], color='red')
plt.title('Passengers per observation')
plt.xlabel('Observation number')
plt.ylabel('Passengers')
plt.show()  # also suppresses the [<Line2D ...>] repr the bare plot call would echo

In [None]:
# Derive the calendar year and the abbreviated month name from the Date column.
# (A stray mid-cell df.head() was removed: it was not the last expression, so it
# displayed nothing.)
df['Year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.strftime('%b')  # abbreviated month name, e.g. 'Jan'

In [None]:
# Preview the frame again to inspect the columns added so far.
df.head()

In [None]:
# Heatmap of mean passengers for every (Year, month) combination.
plt.figure(figsize=(12, 10))
Pivot_data = pd.pivot_table(data=df, values='Passengers', index='Year', columns='month', aggfunc='mean')
# pivot_table sorts the month labels alphabetically (Apr, Aug, Dec, ...), which scrambles
# the seasonal pattern. Put the columns back into calendar order. The reference labels are
# produced with strftime('%b'), the same formatting that built the 'month' column, so they
# match whatever locale is active.
calendar_months = pd.date_range('2000-01-01', periods=12, freq='MS').strftime('%b')
Pivot_data = Pivot_data[[m for m in calendar_months if m in Pivot_data.columns]]
sns.heatmap(Pivot_data, annot=True, cmap='plasma', fmt='g')
plt.show()

In [None]:

# Date-indexed version of the frame; set_index returns a new object, so df is untouched.
airline = df.set_index('Date')
# Display the year component of the new index as a quick sanity check.
airline.index.year

In [None]:
# Passenger counts against the Date index, with a star marker on every observation.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(airline['Passengers'], marker="*", color='red')
plt.show()

In [None]:
# Year-wise passenger increment

plt.figure(figsize=(14, 6))
# Columns are referenced by name; the frame is already supplied through `data`.
sns.lineplot(data=df, x='Year', y='Passengers', color='pink')
plt.xlabel("Year")
plt.ylabel("Passengers increment")
plt.show()

In [None]:
# Month-wise passenger increment

plt.figure(figsize=(14, 6))
# Columns are referenced by name; the frame is already supplied through `data`.
sns.lineplot(data=df, x='month', y='Passengers', color='pink')
plt.xlabel("Month")
plt.ylabel("Passengers increment")
plt.show()

In [None]:
# Outlier check: one box of passenger counts per month label.

plt.figure(figsize=(14, 8))
sns.boxplot(data=df, x='month', y='Passengers')
plt.show()

In [None]:

# Outlier check: one box of passenger counts per year.
plt.figure(figsize=(14, 8))
sns.boxplot(data=df, x='Year', y='Passengers')
plt.show()

In [None]:
# Distribution of passengers data
# sns.distplot is deprecated; its warning is invisible here because warnings are
# silenced at the top of the notebook. histplot on a density scale with a KDE overlay
# is the documented replacement (cut=3 reproduces distplot's KDE extent).
plt.figure(figsize=(8, 6))
sns.histplot(df['Passengers'], kde=True, stat='density', kde_kws=dict(cut=3), color='purple')
plt.show()

In [None]:
from pandas.plotting import lag_plot

# One lag scatter plot per lag: lag 1, then every fourth lag from 4 up to 40.
for i in [1, *range(4, 41, 4)]:
    fig, ax = plt.subplots(figsize=(10, 8))
    lag_plot(df['Passengers'], lag=i, ax=ax)
    plt.show()

In [None]:
# Original series overlaid with its 4-, 8- and 12-observation rolling means on one figure.
plt.figure(figsize=(14, 6))
df['Passengers'].plot(label="org")
for i in (4, 8, 12):
    df['Passengers'].rolling(window=i).mean().plot(label=str(i))
plt.legend(loc='best')
plt.show()

In [None]:
# First figure: the untouched series.
plt.figure(figsize=(14, 4))
df['Passengers'].plot(label="org")
plt.title("Original Series")
# Then one separate figure per even rolling window from 2 to 12 observations.
for i in (2, 4, 6, 8, 10, 12):
    plt.figure(figsize=(14, 4))
    df['Passengers'].rolling(window=i).mean().plot(label=str(i))
    plt.title(f"Moving Average {i}")
    plt.legend(loc='best')
    plt.show()

In [None]:

from math import sqrt
import plotly.express as px
from scipy.stats import boxcox
import plotly.graph_objects as go
from pandas.plotting import lag_plot
import statsmodels.formula.api as smf
from plotly.subplots import make_subplots
from statsmodels.tsa.holtwinters import Holt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [None]:
# Decompose the passenger series using a 12-observation seasonal period
# (seasonal_decompose is left on its default model).
decompose_ts_add = seasonal_decompose(df['Passengers'], period=12)
# The enlarged figure size applies only inside this context and is restored afterwards.
with plt.rc_context({"figure.figsize": (12, 10)}):
    decompose_ts_add.plot()
    plt.show()

In [None]:

import statsmodels.graphics.tsaplots as tsa_plots

# Autocorrelation of the passenger series out to 90 lags; the figure size is scoped
# to this context only.
with plt.rc_context({"figure.figsize": (14, 6)}):
    tsa_plots.plot_acf(df['Passengers'], lags=90)
    plt.show()

In [None]:

import statsmodels.graphics.tsaplots as tsa_plots

# Partial autocorrelation of the passenger series out to 45 lags; the figure size is
# scoped to this context only.
with plt.rc_context({"figure.figsize": (14, 4)}):
    tsa_plots.plot_pacf(df['Passengers'], lags=45)
    plt.show()

In [None]:
# Create the regressors used by the different models: t (the time period index), t_squared (its square), one dummy column per month, and log_passengers (the log of the Passengers column).

In [None]:
# Build the regression feature table from the Date-indexed frame.
airline2 = airline.reset_index()  # returns a new frame with Date back as a column
airline2['t'] = np.arange(1, len(airline2) + 1)         # 1-based time period index
airline2['t_squared'] = np.square(airline2['t'])
airline2['month'] = airline2['Date'].dt.strftime('%b')  # month extraction
airline2['year'] = airline2['Date'].dt.strftime('%Y')   # year extraction (string)

month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# dtype=int keeps the dummies numeric on every pandas version (pandas >= 2.0 would
# otherwise produce bool columns). reindex puts them in calendar order and adds an
# all-zero column for any month missing from the data instead of raising KeyError.
months = pd.get_dummies(airline2['month'], dtype=int).reindex(columns=month_names, fill_value=0)
airline2 = pd.concat([airline2, months], axis=1)
airline2['log_passengers'] = np.log(airline2['Passengers'])
# Show a preview rather than dumping the whole table into the cell output.
airline2.head()

In [None]:
# Dividing the data into training and testing

In [None]:
# Hold out the most recent observations for testing and train on everything before
# them. Both slices come from one constant, so the sets stay disjoint and together
# cover the whole frame whatever its length; with 96 rows this is the 77/19 split.
TEST_SIZE = 19
train = airline2.iloc[:-TEST_SIZE]
test = airline2.iloc[-TEST_SIZE:]

In [None]:
# Creating models with smf ols method

In [None]:
# Linear trend model: Passengers regressed on the time index t.
import statsmodels.formula.api as smf

linear_model = smf.ols(formula='Passengers~t', data=train).fit()
pred_linear = pd.Series(linear_model.predict(test[['t']]))
# Root mean squared error of the forecast on the held-out rows.
rmse_linear = np.sqrt(np.mean(np.square(test['Passengers'].to_numpy() - pred_linear.to_numpy())))
rmse_linear

In [None]:
# Exponential model: log of Passengers regressed on the time index t.

Exp = smf.ols(formula='log_passengers~t', data=train).fit()
pred_Exp = pd.Series(Exp.predict(test[['t']]))
# Predictions are on the log scale, so exponentiate before comparing with Passengers.
rmse_Exp = np.sqrt(np.mean(np.square(test['Passengers'].to_numpy() - np.exp(pred_Exp).to_numpy())))
rmse_Exp

In [None]:
# Quadratic trend model: Passengers regressed on t and t_squared.

Quad = smf.ols(formula='Passengers~t+t_squared', data=train).fit()
trend_terms = ['t', 't_squared']
pred_Quad = pd.Series(Quad.predict(test[trend_terms]))
# Root mean squared error of the forecast on the held-out rows.
rmse_Quad = np.sqrt(np.mean(np.square(test['Passengers'].to_numpy() - pred_Quad.to_numpy())))
rmse_Quad

In [None]:

# Additive seasonality: Passengers regressed on eleven month dummies
# (Dec has no dummy of its own, so it is absorbed by the intercept).

seasonal_terms = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']
add_sea = smf.ols(formula='Passengers~' + '+'.join(seasonal_terms), data=train).fit()
pred_add_sea = pd.Series(add_sea.predict(test[seasonal_terms]))
# Root mean squared error of the forecast on the held-out rows.
rmse_add_sea = np.sqrt(np.mean(np.square(test['Passengers'].to_numpy() - pred_add_sea.to_numpy())))
rmse_add_sea

In [None]:

# Additive seasonality with quadratic trend: t, t_squared and eleven month dummies.

seasonal_terms = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']
add_sea_Quad = smf.ols(formula='Passengers~t+t_squared+' + '+'.join(seasonal_terms), data=train).fit()
# The formula looks regressors up by column name, so the column order passed here is irrelevant.
pred_add_sea_quad = pd.Series(add_sea_Quad.predict(test[['t', 't_squared', *seasonal_terms]]))
rmse_add_sea_quad = np.sqrt(np.mean(np.square(test['Passengers'].to_numpy() - pred_add_sea_quad.to_numpy())))
rmse_add_sea_quad

In [None]:

# Multiplicative seasonality: log of Passengers regressed on eleven month dummies.

seasonal_terms = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']
Mul_sea = smf.ols(formula='log_passengers~' + '+'.join(seasonal_terms), data=train).fit()
pred_Mult_sea = pd.Series(Mul_sea.predict(test))
# Predictions are on the log scale, so exponentiate before comparing with Passengers.
rmse_Mult_sea = np.sqrt(np.mean(np.square(test['Passengers'].to_numpy() - np.exp(pred_Mult_sea).to_numpy())))
rmse_Mult_sea

In [None]:

# Multiplicative additive seasonality: log of Passengers regressed on the linear
# trend t plus eleven month dummies.

seasonal_terms = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']
Mul_Add_sea = smf.ols(formula='log_passengers~t+' + '+'.join(seasonal_terms), data=train).fit()
pred_Mult_add_sea = pd.Series(Mul_Add_sea.predict(test))
# Predictions are on the log scale, so exponentiate before comparing with Passengers.
rmse_Mult_add_sea = np.sqrt(np.mean(np.square(test['Passengers'].to_numpy() - np.exp(pred_Mult_add_sea).to_numpy())))
rmse_Mult_add_sea

In [None]:

# Compare the results: one row per model, displayed from lowest to highest RMSE.

rmse_by_model = {
    "rmse_linear": rmse_linear,
    "rmse_Exp": rmse_Exp,
    "rmse_Quad": rmse_Quad,
    "rmse_add_sea": rmse_add_sea,
    "rmse_add_sea_quad": rmse_add_sea_quad,
    "rmse_Mult_sea": rmse_Mult_sea,
    "rmse_Mult_add_sea": rmse_Mult_add_sea,
}
data = {
    "MODEL": pd.Series(list(rmse_by_model.keys())),
    "RMSE_Values": pd.Series(list(rmse_by_model.values())),
}
table_rmse = pd.DataFrame(data)
table_rmse.sort_values(by='RMSE_Values')

In [None]:

# Since the multiplicative additive seasonality model gives the lowest RMSE value, we use this model to make the predictions.

In [None]:
# Test-set predictions of the selected model, still on the log scale
# (the model was fitted on log_passengers).
pred_Mult_add_sea

In [None]:
# Undo the log transform to express the predictions in passenger units.
np.exp(pred_Mult_add_sea)

In [None]:

# Training data, held-out data and the back-transformed OLS forecast on a single axis.
fig, ax = plt.subplots(figsize=(14, 6))
curves = (
    (train['Passengers'], 'Train'),
    (test['Passengers'], 'Test'),
    (np.exp(pred_Mult_add_sea), 'OLS Forecast'),
)
for series, name in curves:
    ax.plot(series, label=name)
ax.legend(loc='best')
plt.show()