# Table of Contents
### - Subset data and set date as index
### - Look for stationarity
### - Dickey-Fuller test
### - Autocorrelation chart
### - Differencing data

# Setting up Notebook

In [None]:
# Import relevant libraries
import quandl
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm
import os
import warnings

In [None]:
# Set path for visualization export.
# The project folder can be overridden with the EDA_PROJECT_PATH environment
# variable so the notebook also runs on machines other than the author's;
# when the variable is unset, the original location is used.
path = os.environ.get(
    'EDA_PROJECT_PATH',
    r'C:\Users\mmreg\OneDrive\Desktop\Data Analytics Course Work\Data Immersion\Tasks\08-2022 Exploratory Analytics Project\04',
)

In [None]:
# Notebook-wide presentation settings: silence every warning category,
# then switch matplotlib to the FiveThirtyEight theme
warnings.simplefilter("ignore")
plt.style.use('fivethirtyeight')

In [None]:
# Set API Key
# The key is read from the QUANDL_API_KEY environment variable so the secret is
# never stored in the notebook; a missing variable fails loudly with a KeyError.
# The key that used to be hardcoded here should be treated as leaked and rotated.
quandl.ApiConfig.api_key = os.environ['QUANDL_API_KEY']

In [None]:
# Download the UMICH/SOC37 dataset from Quandl and preview its first rows
df = quandl.get('UMICH/SOC37')
df.head(n=5)

# Question 3
## Make a subset of your data set if necessary. Think about the historical domain of the series to decide if you need to (if there are irrelevant historical events in your data that would bias your analysis, for example).

In [None]:
# First step towards the subset ('Good time to Buy' only, date as index):
# move the current index into a regular column so it can be selected by name
df_2 = df.reset_index(drop=False)
df_2.head(n=5)

In [None]:
# Keep only the date and the 'Good time to Buy' series.
# .copy() makes df_3 an independent frame, so adding a column to it in a later
# cell does not trigger pandas' SettingWithCopyWarning.
df_3 = df_2[['Date', 'Good time to Buy']].copy()
df_3.head()

In [None]:
# Reset date as the index
# NOTE(review): `datetime` is not used in this cell — confirm no other cell needs
# it before removing the import.
from datetime import datetime

# Build the indexed frame in one chain starting from df_2, so the cell gives the
# same result no matter how many times it is run. The previous version added a
# column to a slice and dropped 'Date' in place, which raised a KeyError on a
# second run.
df_3 = (
    df_2[['Date', 'Good time to Buy']]
    .assign(datetime=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('datetime')
    .drop(columns=['Date'])
)
df_3.head()

# Question 4
## Create a line chart of your data.

In [None]:
# Draw the series as a line chart on a wide, high-resolution matplotlib figure
car = plt.figure(dpi=250, figsize=(15, 5))
car.gca().plot(df_3)

In [None]:
# Save visualization
# Create the target folder first: savefig fails if the directory does not exist.
output_dir = os.path.join(path, '04 Analysis', 'Visualization')
os.makedirs(output_dir, exist_ok=True)
car.savefig(os.path.join(output_dir, 'car_insight.png'))

# Question 5
## Decompose the data’s components.

In [None]:
# Set a wide default figure size for the upcoming plots, then decompose the
# series with an additive model
from pylab import rcParams
rcParams['figure.figsize'] = (18, 7)
decomposition = sm.tsa.seasonal_decompose(df_3, model='additive')

In [None]:
# Plot the decomposition result and display the figure
decomposition.plot()
plt.show()

### The presence of both a trend and seasonality indicates that the series is non-stationary. The overall trend of the data appears fairly neutral, which may mean the non-stationarity is weak. We will run further statistical tests to clarify.

# Question 6
## Conduct a Dickey-Fuller test to check for stationarity. Interpret the results in a markdown cell and reference your null hypothesis.

In [None]:
# Conduct Dickey-Fuller test
from statsmodels.tsa.stattools import adfuller # Import the adfuller() function

In [None]:
def dickey_fuller(timeseries):
    """Run an augmented Dickey-Fuller test on `timeseries` and print the results.

    The lag length is chosen with autolag='AIC'. The printed summary is a pandas
    Series holding the test statistic, p-value, number of lags used, number of
    observations used, and one entry per critical value. Nothing is returned.
    """
    print ('Dickey-Fuller Stationarity test:')
    adf_output = adfuller(timeseries, autolag='AIC')
    labels = ['Test Statistic','p-value','Number of Lags Used','Number of Observations Used']
    # First four elements are the headline numbers; element 4 maps each
    # significance level to its critical value
    summary = dict(zip(labels, adf_output[:4]))
    summary.update(
        {'Critical Value (%s)' % level: threshold for level, threshold in adf_output[4].items()}
    )
    print (pd.Series(summary))

In [None]:
# Run the Dickey-Fuller test on the undifferenced series
dickey_fuller(df_3)

### Null hypothesis: the series has a unit root, i.e. it is non-stationary. If the test statistic is larger than the critical value at 5%, we cannot reject the null hypothesis; if it is smaller than the critical value, we can reject it. Based on the DF test, the test statistic is larger than the 5% critical value. This means we cannot reject the null hypothesis, so we treat the data as non-stationary. We will confirm this with an autocorrelation plot.

In [None]:
# Create autocorrelation plot
# NOTE(review): plot_pacf is imported but not called in the cells visible here
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Autocorrelation of the undifferenced series
plot_acf(df_3)
plt.show()

### There are 22 bars that fall outside the confidence interval, well above the commonly used rule-of-thumb threshold of 9, so the series is strongly autocorrelated. We will need to make this data stationary.

# Question 7
## If your data isn’t stationary, conduct a round of differencing on it and repeat step 6.

In [None]:
# First-order differencing: each observation minus the one before it
df_dif = df_3.diff(periods=1)

In [None]:
# Remove the rows left as NaN by the differencing step
df_dif = df_dif.dropna()

In [None]:
# Preview the first rows to confirm the differencing was applied
df_dif.head()

In [None]:
# Line chart of the once-differenced series (the Dickey-Fuller test follows in a later cell)
plt.figure(dpi=250, figsize=(15, 5))
plt.gca().plot(df_dif)

In [None]:
# Much more stationary looking. Use statistics to confirm
# dickey_fuller() is already defined earlier in this notebook; the byte-identical
# redefinition that used to live in this cell was removed so the test has a
# single definition to maintain.

In [None]:
# Run the Dickey-Fuller test on the once-differenced series
dickey_fuller(df_dif)

In [None]:
# NOTE(review): same import as in the earlier autocorrelation cell; plot_pacf is
# not called in the cells visible here
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Autocorrelation of the once-differenced series
plot_acf(df_dif)
plt.show()

### Even though the autocorrelation chart now looks acceptable, the Dickey-Fuller test still does not indicate stationarity. I will therefore run another round of differencing.

# Question 8
## If your data still isn’t stationary, perform another round of differencing and repeat step 6.

In [None]:
# Second round of differencing; the NaN rows it introduces are dropped right away
df_dif_2 = df_dif.diff(periods=1).dropna()

In [None]:
# Preview the first rows of the twice-differenced series
df_dif_2.head()

In [None]:
# Line chart of the twice-differenced series
plt.figure(dpi=250, figsize=(15, 5))
plt.gca().plot(df_dif_2)

In [None]:
# dickey_fuller() is already defined earlier in this notebook; the byte-identical
# redefinition that used to live in this cell was removed so the test has a
# single definition to maintain.

In [None]:
# Run the Dickey-Fuller test on the twice-differenced series
dickey_fuller(df_dif_2)

In [None]:
# Autocorrelation of the twice-differenced series
plot_acf(df_dif_2)
plt.show()