In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings("ignore")


df = pd.read_csv('df.csv',parse_dates=True,index_col='date')
df = pd.DataFrame(df.groupby(df.index.strftime('%Y-%m')).sum()['amount'])
df.columns=['Value']
df

Unnamed: 0_level_0,Value
date,Unnamed: 1_level_1
1993-01,702157.6
1993-02,2726925.3
1993-03,4730318.6
1993-04,7378367.8
1993-05,11680753.4
...,...
1998-08,150639332.7
1998-09,148297693.4
1998-10,151889623.6
1998-11,148842093.6


<h1> Part 1 : Differencing

In [2]:
def adf_check(time_series):
    """
    Pass in a time series, returns ADF report
    """
    result = adfuller(time_series)
    print (result)
    print('Augmented Dickey-Fuller Test:')
    labels = ['ADF Test Statistic','p-value','Number of Lags Used','Number of Observations Used', "Critical Values"]

    for value,label in zip(result,labels):
        print(label+' : '+str(value) )
    
    if result[1] < min([result[4]["1%"],result[4]["5%"],result[4]["10%"]]):
        print("strong evidence against the null hypothesis, reject the null hypothesis. Data has no unit root and is stationary")
    else:
        print("weak evidence against null hypothesis, time series has a unit root, indicating it is non-stationary \n")

In [3]:
def I(df,order1,order2):
    df_testing = pd.DataFrame(np.log(df.Value).diff().diff(12))
    adf_check(df_testing.Value.dropna())

(-5.225226609713747, 7.80717692014732e-06, 3, 55, {'1%': -3.5552728880540942, '5%': -2.9157312396694217, '10%': -2.5956695041322315}, -195.51743097379642)
Augmented Dickey-Fuller Test:
ADF Test Statistic : -5.225226609713747
p-value : 7.80717692014732e-06
Number of Lags Used : 3
Number of Observations Used : 55
Critical Values : {'1%': -3.5552728880540942, '5%': -2.9157312396694217, '10%': -2.5956695041322315}
weak evidence against null hypothesis, time series has a unit root, indicating it is non-stationary 



(-5.225226609713747, 7.80717692014732e-06, 3, 55, {'1%': -3.5552728880540942, '5%': -2.9157312396694217, '10%': -2.5956695041322315}, -195.51743097379642)
Augmented Dickey-Fuller Test:
ADF Test Statistic : -5.225226609713747
p-value : 7.80717692014732e-06
Number of Lags Used : 3
Number of Observations Used : 55
Critical Values : {'1%': -3.5552728880540942, '5%': -2.9157312396694217, '10%': -2.5956695041322315}
weak evidence against null hypothesis, time series has a unit root, indicating it is non-stationary 



<br></br>
<h1>Part 2: AR Auto-Regressive

In [4]:
def AR(p,df):
    df_temp=df.copy()
    
    for i in range(1,p+1):
        df_temp[f'Value_Lag {i}']=df_temp["Value"].shift(i)
    
    df_temp=df_temp.dropna()
    train_size= (int)(.8*df_temp.shape[0])
    train=df_temp[0:train_size]
    x=train.iloc[:,1:]
    y=train.iloc[:,0]
    
    test=df_temp[train_size::]
    y_t=test.iloc[:,0]
    x_t=test.iloc[:,1:]
    lr=LinearRegression()
    lr.fit(x,y)
    theta=lr.coef_
    intercept=lr.intercept_
    train['Predicted_Values'] = x.dot(lr.coef_.T) + lr.intercept_
    test['Predicted_Values']=x_t.dot(theta)+intercept
  
    RMSE=np.sqrt(mean_squared_error(test['Value'],test['Predicted_Values']))
    print(type(train["Predicted_Values"]))
    residuals=pd.DataFrame(df_temp['Value']-pd.concat([train['Predicted_Values'],test['Predicted_Values']]),columns=['Residuals'])
    print(RMSE)
    return [train,test,RMSE,residuals]


<class 'pandas.core.series.Series'>
11159273.635675408


10

<h1>Part 3 - MA : Moving Average

In [5]:
def MA(q,res):#residuals
    res_temp=res.copy()
    
    for i in range(1,q+1):
        res_temp[f'Residual_Lag {i}']=res_temp["Value"].shift(i)
    
    res_temp=res_temp.dropna()
    train_size= (int)(.8*res_temp.shape[0])
    train=res_temp[0:train_size]
    x=train.iloc[:,1:]
    y=train.iloc[:,0]
    
    test=res_temp[train_size::]
    y_t=test.iloc[:,0]
    x_t=test.iloc[:,1:]
    lr=LinearRegression()
    lr.fit(x,y)
    theta=lr.coef_
    intercept=lr.intercept_
    train['Predicted_Values'] = x.dot(lr.coef_.T) + lr.intercept_
    test['Predicted_Values']=x_t.dot(theta)+intercept
    res_temp['Residue']=pd.concat([train['Predicted_Values'],test["Predicted_Values"]])
    RMSE=np.sqrt(mean_squared_error(test['Value'],test['Predicted_Values']))
    
    print(RMSE)
    return


11159273.635675408


10