# Advanced Certification in AIML
## A Program by IIIT-H and TalentSprint

##Not for Grading

## Comparison of various methods in Timeseries Analysis

In [1]:
#@title Case Study Walkthrough
#@markdown  Comparison of various methods in Timeseries Analysis
from IPython.display import HTML

HTML("""<video width="320" height="240" controls>
  <source src="https://cdn.talentsprint.com/talentsprint/archives/sc/aiml/aiml_2018_b7_hyd/preview_videos/time_series_analysis.mp4">
</video>
""")

## Time Series Models

In this experiment, we will build different  time-series forecasting models to get a forecast for Onion prices. 

Basic models: 

1. Mean Constant Model
2. Linear Trend Model
3. Random Walk Model

Advanced models:

1.   Simple Exponential Smoothing Model
2.   ARIMA Model
3.   MLP Model



#### Setup Steps

## Imports: All the imports are defined here

We are downgrading the packages and installing the packages as defined below to perform some functionalities.

pandas = 0.22.0

numpy = 1.14.3


* When you run the below code for uninstalling and downgrading the pandas and numpy version, you will get a button "Restart Runtime" below. 

* Just click on the "Restart Runtime" and select "Yes" when it prompts. This Restart runtime will only reset the pandas and numpy package. 

  **NOTE:** DONOT GO TO RUNTIME  -> RESTART RUNTIME. This will restart all packages then you need to repeat all the steps from begining.

* Simply continue with the next code cell 



In [None]:
!pip install numpy==1.14.3 --upgrade
!pip install pandas==0.22.0

In [13]:
#@title Please enter your registration id to start: (e.g. P181900101) { run: "auto", display-mode: "form" }
Id = "2100121" #@param {type:"string"}


In [14]:
#@title Please enter your password (normally your phone number) to continue: { run: "auto", display-mode: "form" }
password = "5142192291" #@param {type:"string"}


In [15]:
#@title Run this cell to complete the setup for this Notebook
from IPython import get_ipython

ipython = get_ipython()
  
notebook= "U3W16_CS_Timeseries_Analysis" #name of the notebook
Answer = "Ungraded"
def setup():
    ipython.magic("sx wget https://cdn.talentsprint.com/aiml/Casestudies_slides/Week6/MonthWiseMarketArrivals_Clean.csv")
    ipython.magic("sx wget https://cdn.talentsprint.com/aiml/Casestudies_slides/Week6/ts_mlpr.joblib")
    ipython.magic("sx pip3 install statsmodels")
    ipython.magic("sx pip install scipy==1.2 --upgrade")
    from IPython.display import HTML, display
    display(HTML('<script src="https://dashboard.talentsprint.com/aiml/record_ip.html?traineeId={0}&recordId={1}"></script>'.format(getId(),submission_id)))
    print("Setup completed successfully")
    return

def submit_notebook():
    
    ipython.magic("notebook -e "+ notebook + ".ipynb")
    
    import requests, json, base64, datetime

    url = "https://dashboard.talentsprint.com/xp/app/save_notebook_attempts"
    if not submission_id:
      data = {"id" : getId(), "notebook" : notebook, "mobile" : getPassword()}
      r = requests.post(url, data = data)
      r = json.loads(r.text)

      if r["status"] == "Success":
          return r["record_id"]
      elif "err" in r:        
        print(r["err"])
        return None        
      else:
        print ("Something is wrong, the notebook will not be submitted for grading")
        return None

    elif getAnswer() and getComplexity() and getAdditional() and getConcepts():
      f = open(notebook + ".ipynb", "rb")
      file_hash = base64.b64encode(f.read())

      data = {"complexity" : Complexity, "additional" :Additional, 
              "concepts" : Concepts, "record_id" : submission_id, 
              "answer" : Answer, "id" : Id, "file_hash" : file_hash,
              "feedback_experiments_input" : Comments, "notebook" : notebook}

      r = requests.post(url, data = data)
      r = json.loads(r.text)
      if "err" in r:        
        print(r["err"])
        return None   
      else:
        print("Your submission is successful.")
        print("Ref Id:",submission_id)
        print("Date of submission: ",r["date"])
        print("Time of submission: ", r["time"])
        print("View your submissions: https://aiml.iiith.talentsprint.com/notebook_submissions")
        # print("For any queries/discrepancies, please connect with mentors through the chat icon in LMS dashboard.")
      return submission_id
    else: submission_id
    

def getAdditional():
  try:
    if not Additional: 
      raise NameError
    else:
      return Additional  
  except NameError:
    print ("Please answer Additional Question")
    return None
def getComments():
  try:
    if not Comments:
      raise NameError
    else:
      return Comments
  except NameError:
    print ("Please answer Comments Question")
    return None

def getComplexity():
  try:
    if not Complexity:
      raise NameError
    else:
      return Complexity
  except NameError:
    print ("Please answer Complexity Question")
    return None
  
def getConcepts():
  try:
    if not Concepts:
      raise NameError
    else:
      return Concepts
  except NameError:
    print ("Please answer Concepts Question")
    return None

def getAnswer():
  try:
    if not Answer:
      raise NameError 
    else: 
      return Answer
  except NameError:
    print ("Please answer Question")
    return None

def getId():
  try: 
    return Id if Id else None
  except NameError:
    return None

def getPassword():
  try:
    return password if password else None
  except NameError:
    return None

submission_id = None
### Setup 
if getPassword() and getId():
  submission_id = submit_notebook()
  if submission_id:
    setup()
    
else:
  print ("Please complete Id and Password cells before running setup")


Setup completed successfully


### Import the required packages

In [17]:
pip install statsmodels 



In [18]:
# Import the library we need, which is Pandas and Matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodel
from statsmodels import api as sm
import statsmodels.formula.api as smf
from statsmodels.tsa.stattools import adfuller
import warnings
warnings.filterwarnings("ignore")

ModuleNotFoundError: ignored

In [None]:
# Set some parameters to get good visuals - style to ggplot and size to 15,10
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 10)

### Load the Monthwise Quantity and Price csv file using Pandas

In [None]:
# Read the csv file of Monthwise Quantity and Price csv file we have.
df = pd.read_csv('MonthWiseMarketArrivals_Clean.csv')

### Preprocess the loaded data to get the required tidy dataframe

In [None]:
df.head()

In [9]:
# Changing the date column to a Time Interval columnn
df.date = pd.DatetimeIndex(df.date)
df.head()

NameError: ignored

In [None]:
# Change the index to the date column
df.index = pd.PeriodIndex(df.date, freq='M')
df.head()

In [8]:
# Sort the data frame by date
df = df.sort_values(by = "date")

NameError: ignored

### Select the city with maximum count for a market to get better data for time series analysis

In [None]:
cities, counts = np.unique(df.city.values, return_counts=True)

In [None]:
city = cities[np.argmax(counts)]

### Get the priceMod of this city  by dropping redundant columns

In [None]:
dfls = df.loc[df.city == city].copy()

In [None]:
dfls.columns

In [None]:
# Drop redundant columns
dfls = dfls.drop(["market", "month", "year", "state", "city", "priceMin", "priceMax"], axis = 1)

In [None]:
dfls.head()

### Transformation - Log

Transformations such as logarithms can help to stabilize the variance of a time series. 


### Apply the Log transformation to 'priceMod' column to stabilize the variance of a time series and add as a separate column

Plot the bar plots for  'priceMod' and 'priceModlog' 

In [None]:
dfls.priceMod.plot(kind = "hist", bins = 30)

In [None]:
dfls['priceModLog'] = np.log(dfls.priceMod)
dfls.head()

In [None]:
dfls.priceModLog.plot(kind = "hist", bins = 30)

### Visualize the line plot for the 'priceModLog' column

In [None]:
dfls.priceModLog.plot()



## Mean  Model

This very simple forecasting model will be called the "mean model"

### Applying mean to 'priceModLog' and take exponential and add as a separate column

In [None]:
model_mean_pred = dfls.priceModLog.mean()

In [None]:

# Let us store this as our Mean Predication Value
dfls["priceMean"] = np.exp(model_mean_pred)

### Plot the line plots of 'priceMod' and 'priceMean'

In [None]:
dfls.plot(kind="line", x="date", y = ["priceMod", "priceMean"])

### Use Root Mean Squared Error (RMSE) to calculate our error values

$RMSE = \Sigma \sqrt{ (\hat{y} - y)^2/n} $ , where $\hat{y}$ is  predicted value of y

### Define RMSE function

In [None]:
def RMSE(predicted, actual):
    mse = (predicted - actual)**2
    rmse = np.sqrt(mse.sum()/mse.count())
    return rmse

### Calculate RMSE of 'priceMean' and 'priceMod' and save it in a dataframe

In [None]:
model_mean_RMSE = RMSE(dfls.priceMean, dfls.priceMod)
model_mean_RMSE

In [None]:
# Save this in a dataframe
dflsResults = pd.DataFrame(columns = ["Model","RMSE"])
dflsResults.head()

In [None]:
dflsResults.loc[0,"Model"] = "Mean"
dflsResults.loc[0,"RMSE"] = model_mean_RMSE
dflsResults.head()

## Linear Trend Model

Let us start by plotting a linear trend model between priceModLog and time.
.

### Add a 'timeIndex' column in a month format

In [None]:

# What is the starting month of our data
dfls.date.min()

### Calculate the difference between the current date and initial / starting date

In [None]:
# Convert date in datetimedelta figure starting from zero
dfls["timeIndex"] = dfls.date - dfls.date.min()

In [None]:
dfls.head()

### Covert the format of timeIndex from days to months

In [None]:
# Convert to months using the timedelta function
dfls["timeIndex"] =  dfls["timeIndex"]/np.timedelta64(1, 'M')

In [None]:
dfls.head()

### Round the 'timeIndex' column to 0

In [None]:
# Round the number to 0
dfls["timeIndex"] = dfls["timeIndex"].round(0).astype(int)

### Calculate linear regression between 'priceModlog' and 'timeIndex'

In [None]:
## Now plot linear regression between priceModlog and timeIndex
model_linear = smf.ols('priceModLog ~ timeIndex', data = dfls).fit()

In [None]:
## Parameters for y = mx + c equation
model_linear.params
m = model_linear.params[0]
c = model_linear.params[1]

In [None]:
model_linear_pred = model_linear.predict()

In [None]:
# Plot the prediction line
dfls.plot(kind="line", x="timeIndex", y = "priceModLog")
plt.plot(dfls.timeIndex,model_linear_pred, '-')

In [None]:
dfls["priceLinear"] = np.exp(model_linear_pred)

In [None]:
dfls.head()

In [None]:
# Root Mean Squared Error (RMSE)
model_linear_RMSE = RMSE(dfls.priceLinear, dfls.priceMod)
model_linear_RMSE

In [None]:
#Storing the results
dflsResults.loc[1,"Model"] = "Linear"
dflsResults.loc[1,"RMSE"] = model_linear_RMSE
dflsResults.head()

In [None]:
dfls.plot(kind="line", x="timeIndex", y = ["priceMod", "priceMean", "priceLinear"])

## Random Walk Model

When faced with a time series that shows irregular growth, the best strategy may not be to try to directly predict the level of the series at each period (i.e., the quantity Yt). Instead, it may be better to try to predict the change that occurs from one period to the next (i.e., the quantity Yt - Yt-1). That is, it may be better to look at the first difference of the series, to see if a predictable pattern can be found there. For purposes of one-period-ahead forecasting, it is just as good to predict the next change as to predict the next level of the series, since the predicted change can be added to the current level to yield a predicted level. The simplest case of such a model is one that always predicts that the next change will be zero, as if the series is equally likely to go up or down in the next period regardless of what it has done in the past.


**Random Walk Model** $ \hat{Y_t} = Y_{t-1} + \epsilon \\$

**Random Walk Model with drift** $$ \hat{Y_t} = Y_{t-1} + c + \epsilon \\$$

In [None]:
dfls["priceModLogShift1"] = dfls.priceModLog.shift()

In [None]:
dfls.head()

In [None]:
dfls.plot(kind= "scatter", y = "priceModLog", x = "priceModLogShift1", s = 50)

In [None]:
# Lets plot the one-month difference curve
dfls["priceModLogDiff"] = dfls.priceModLog - dfls.priceModLogShift1

In [None]:
dfls["priceRandom"] = np.exp(dfls.priceModLogShift1)
dfls.head()

In [None]:
dfls.plot(kind="line", x="timeIndex", y = ["priceMod","priceRandom"])

In [None]:
# Root Mean Squared Error (RMSE)
model_random_RMSE = RMSE(dfls.priceRandom, dfls.priceMod)
model_random_RMSE

In [None]:
dflsResults.loc[2,"Model"] = "Random"
dflsResults.loc[2,"RMSE"] = model_random_RMSE
dflsResults.head()

In [None]:
dfls.plot(kind="line", x="timeIndex", y = ["priceMod", "priceMean", "priceLinear", "priceRandom"])


## Simple Exponential Smoothing Model (SES)

Instead of equally weighting each of the observation, in the SES model we give more weightage to the recent observations and less to the older ones. This is done by the using a smoothing variable like alpha

$$ \hat{y_t} = \alpha y_{t-1} + (1-\alpha)\hat{y_{t-1}} \\$$

In [None]:

dfls['priceModLogExp12'] = pd.ewma(dfls.priceModLog, halflife=12)

In [None]:
dfls.plot(kind ="line", y=["priceModLogExp12", "priceModLog"])

In [None]:
dfls["priceExp12"] = np.exp(dfls.priceModLogExp12)
dfls.tail()

In [None]:
# Root Mean Squared Error (RMSE)
model_Exp12_RMSE = RMSE(dfls.priceExp12, dfls.priceMod)
model_Exp12_RMSE

In [None]:
dflsResults.loc[3,"Model"] = "Exp Smoothing 12"
dflsResults.loc[3,"RMSE"] = model_Exp12_RMSE
dflsResults.head()

In [None]:
dfls.plot(kind="line", x="timeIndex", y = ["priceMod", "priceMean", "priceLinear", 
                                             "priceRandom", "priceExp12"])

## Auto Regressive Models - AR(p)

In an autoregression model, we forecast the variable of interest using a linear combination of past values of the variable. The term autoregression indicates that it is a regression of the variable against itself.

Thus an autoregressive model of order (p) can be written as

$$ y_t = c + m_1y_{t-1} + m_2y_{t-2} + m_3y_{t-3} + .. \\$$

**Random walk model is an AR(1) model with ** $$m_1=1,  c = 0\\$$
**Random walk model with drift model ** $$m_1=1,  c \not= 0\\$$

We normally restrict autoregressive models to stationary data, and then some constraints on the values of the parameters are required.

For an AR(1) model:   $$ −1<m_1<−1 \\$$
For an AR(2) model:   
$$ −1<m_2<−1, m_1 + m_2 < 1, m_2 - m_1 <1 \\$$

## Moving Average Model - MA(q)

Rather than use past values of the forecast variable in a regression, a moving average model uses past forecast errors in a regression-like model.

$$ y_t=c+e_t+l_1 e_{t−1}+l_2 e_{t−2} + ... + l_q e_{t-q} \\$$

where `e` is white noise. We refer to this as an MA(`q`) model. Of course, we do not observe the values of e(t), so it is not really regression in the usual sense.

Notice that each value of `y(t)` can be thought of as a weighted moving average of the past few forecast errors. However, moving average models should not be confused with moving average smoothing. A moving average model is used for forecasting future values while moving average smoothing is used for estimating the trend-cycle of past values.

## ARIMA Model

If we combine differencing with autoregression and a moving average model, we obtain a non-seasonal ARIMA model. ARIMA is an acronym for AutoRegressive Integrated Moving Average model (“integration” in this context is the reverse of differencing). The full model can be written as

- **Number of AR (Auto-Regressive) terms (p)**: AR terms are just lags of dependent variable. For instance if p is 5, the predictors for y(t) will be y(t-1)….y(t-5).
- **Number of MA (Moving Average) terms (q)**: MA terms are lagged forecast errors in prediction equation. For instance if q is 5, the predictors for y(t) will be e(t-1)….e(t-5) where e(i) is the difference between the moving average at ith instant and actual value.
- **Number of Differences (d)**: These are the number of nonseasonal differences, i.e. in this case we took the first order difference. So either we can pass that variable and put d=0 or pass the original variable and put d=1. Both will generate same results.



In [None]:
ts = dfls.priceModLog
ts_diff = dfls.priceModLogDiff
ts_diff.dropna(inplace = True)

### Running the ARIMA Model 

In [None]:
from statsmodels.tsa.arima_model import ARIMA

In [None]:
ts_diff.head()

In [None]:
# Running the ARIMA Model(1,0,1)
model_AR1MA = ARIMA(ts_diff, order=(1,0,1))

In [None]:
results_ARIMA = model_AR1MA.fit(disp = -1)

In [None]:
ts_diff.plot()
results_ARIMA.fittedvalues.plot()

In [None]:
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)

In [None]:
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()

In [None]:
predictions_ARIMA_log = pd.Series(ts.ix[0], index=ts.index)
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum,fill_value=0)
predictions_ARIMA_log.tail()

In [None]:
dfls['priceARIMA'] = np.exp(predictions_ARIMA_log)

In [None]:
model_arima_RMSE = RMSE(dfls.priceARIMA, dfls.priceMod)

In [None]:
dflsResults.loc[4,"Model"] = "Arima"
dflsResults.loc[4,"RMSE"] = model_arima_RMSE
dflsResults.head()

In [None]:
dfls.plot(kind="line", x="timeIndex", y = ["priceMod", "priceMean", "priceLinear", "priceRandom",
                                              "priceExp12", "priceARIMA"])

## MLP Model

In [None]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    
    n_vars = data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
        
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
            
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
n=10
Data = series_to_supervised(dfls.priceMod.values.reshape(-1,1),n)
X=Data.values[:, :n]
y=Data.values[:, n]

In [None]:
from joblib import dump, load
mlpr = load('ts_mlpr.joblib')
mlpr

In [None]:
ypred = mlpr.predict(X)

In [None]:
RMSE_MLP = RMSE(pd.Series(mlpr.predict(X)),pd.Series(y))

In [None]:
dfls['priceMLP'] = np.append(np.zeros(10),ypred)

In [None]:
dflsResults.loc[5,"Model"] = "MLP"
dflsResults.loc[5,"RMSE"] = RMSE_MLP
dflsResults

In [None]:
dfls.plot(kind="line", x="timeIndex", y = ["priceMod", "priceMean", "priceLinear", "priceRandom",
                                              "priceExp12", "priceARIMA","priceMLP"])

### Sort the values by RMSE

In [None]:
dflsResults = dflsResults.sort_values(by='RMSE')

In [None]:
dflsResults.plot(kind="bar", x="Model", y = ["RMSE"])
plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=20)

In [None]:
dfls.plot(kind="line", x="timeIndex", y = ["priceMod", "priceMLP"]) 

In [None]:
dfls.plot(kind="line", x="timeIndex", y = ["priceMod", "priceARIMA"])

## Please answer the questions below to complete the experiment:

In [None]:
#@title How was the experiment? { run: "auto", form-width: "500px", display-mode: "form" }
Complexity = "" #@param ["","Too Simple, I am wasting time", "Good, But Not Challenging for me", "Good and Challenging for me", "Was Tough, but I did it", "Too Difficult for me"]


In [None]:
#@title If it was too easy, what more would you have liked to be added? If it was very difficult, what would you have liked to have been removed? { run: "auto", display-mode: "form" }
Additional = "" #@param {type:"string"}


In [None]:
#@title Can you identify the concepts from the lecture which this experiment covered? { run: "auto", vertical-output: true, display-mode: "form" }
Concepts = "" #@param ["","Yes", "No"]

In [None]:
#@title  Text and image description/explanation and code comments within the experiment: { run: "auto", vertical-output: true, display-mode: "form" }
Comments = "" #@param ["","Very Useful", "Somewhat Useful", "Not Useful", "Didn't use"]


In [None]:
#@title Run this cell to submit your notebook  { vertical-output: true }
try:
  if submission_id:
      return_id = submit_notebook()
      if return_id : submission_id =return_id
  else:
      print("Please complete the setup first.")
except NameError:
  print ("Please complete the setup first.")