# Initialization
---

### Loading the required Libraries for Time-Series Analysis

In [0]:
%pip install FBProphet

Python interpreter will be restarted.
Collecting FBProphet
  Downloading fbprophet-0.7.1.tar.gz (64 kB)
Collecting cmdstanpy==0.9.5
  Downloading cmdstanpy-0.9.5-py3-none-any.whl (37 kB)
Building wheels for collected packages: FBProphet
  Building wheel for FBProphet (setup.py): started
  Building wheel for FBProphet (setup.py): still running...
  Building wheel for FBProphet (setup.py): finished with status 'done'
  Created wheel for FBProphet: filename=fbprophet-0.7.1-py3-none-any.whl size=9534407 sha256=c6969c7343a4ed7402d11eb79514880f3fe115c42715e399ec963fdb43a73935
  Stored in directory: /root/.cache/pip/wheels/d0/d2/ae/c579b7fd160999d35908f3cb8ebcad7ef64ecaca7b78e4c3c8
Successfully built FBProphet
Installing collected packages: cmdstanpy, FBProphet
  Attempting uninstall: cmdstanpy
    Found existing installation: cmdstanpy 0.9.68
    Not uninstalling cmdstanpy at /databricks/python3/lib/python3.8/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-cd6ef

In [0]:
import logging

# Raise the py4j logger's threshold so only ERROR-and-above records are emitted.
py4j_logger = logging.getLogger('py4j')
py4j_logger.setLevel(logging.ERROR)

### Loading the data

In [0]:
# DBFS location of df1_sample.csv, the sample file loaded by the read cell below.
path = "dbfs:/FileStore/shared_uploads/sgoswam@gmu.edu/df1_sample.csv"

In [0]:
# Read the CSV at `path`, treating the first row as the header and letting
# Spark infer each column's type.
df1 = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(path)
)

In [0]:
# Print the column names and inferred types to confirm how the CSV was parsed.
df1.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- SPEED: double (nullable = true)
 |-- DATA_AS_OF: string (nullable = true)
 |-- LINK_ID: integer (nullable = true)
 |-- NewDateTime: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- weekday: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- day_of_year: integer (nullable = true)



In [0]:
# Rename the raw columns in one pass. `ds` (timestamp) and `y` (target) are
# the column names Prophet looks for when fitting.
df1 = (
    df1
    .withColumnRenamed("_c0", "ID")
    .withColumnRenamed("NewDateTime", "ds")
    .withColumnRenamed("SPEED", "y")
    .withColumnRenamed("LINK_ID", "link_id")
)

In [0]:
from pyspark.sql.types import StringType

# Store link_id as a string (it was inferred as an integer) so it behaves as
# a label rather than a numeric quantity.
df1 = df1.withColumn("link_id", df1["link_id"].cast(StringType()))

In [0]:
# Keep only the timestamp, link identifier and target columns.
df = df1.select(['ds', 'link_id', 'y'])

In [0]:
from pyspark.sql.functions import count, col

# Rows are kept only when both conditions hold:
#   * timestamp in [2018-01-01, 2022-01-01)
#   * y strictly between 0 and 68
in_study_window = (df["ds"] >= '2018-01-01') & (df["ds"] < '2022-01-01')
plausible_speed = (df["y"] < 68) & (df["y"] > 0)
cleaning_filters = in_study_window & plausible_speed

In [0]:
# Drop every row that fails the cleaning predicate defined above.
df = df.where(cleaning_filters)

In [0]:
# Mark the cleaned frame for caching, then inspect its schema and first rows.
df.cache().printSchema()
df.show(n=10)

root
 |-- ds: timestamp (nullable = true)
 |-- link_id: string (nullable = true)
 |-- y: double (nullable = true)

+-------------------+-------+-----+
|                 ds|link_id|    y|
+-------------------+-------+-----+
|2018-05-26 22:08:04|4616204|41.63|
|2020-12-27 11:47:04|4763652| 9.94|
|2018-07-06 13:18:06|4362252|32.31|
|2021-06-07 01:39:11|4616197|52.81|
|2020-04-23 05:33:04|4616204|44.11|
|2019-09-15 00:57:11|4616281|56.54|
|2020-07-03 03:48:04|4616364|64.62|
|2019-02-16 13:18:03|4616340|27.96|
|2018-11-27 09:38:37|4616226|44.11|
|2020-06-26 19:33:09|4620314|21.12|
+-------------------+-------+-----+
only showing top 10 rows



In [0]:
# Register the cleaned frame as the temporary view `data` (replacing any
# existing view of that name) so it can be queried with Spark SQL.
df.createOrReplaceTempView('data')

# Time-Series Analysis | Sample Dataset
NOTE: [Due to the memory and compute resource limitations of Databricks Community Edition, Cross-Validation and Hyperparameter Tuning are carried out on Sample Data]

## Training Model and performing Cross-Validation with Hyper-Parameter Tuning on Sample Data

In [0]:
# Convert the cleaned Spark DataFrame to a pandas DataFrame; the Prophet fit
# below is given this pandas frame. toPandas() collects every row to the
# driver, so this relies on the sample being small enough to fit in memory.
df_pandas = df.toPandas()

In [0]:
import itertools
import numpy as np
import pandas as pd
# Import the model from the same `prophet` package as the diagnostics helpers,
# so the object handed to cross_validation is the class they were written for
# (previously the model came from the separate legacy `fbprophet` package).
from prophet import Prophet
from prophet.diagnostics import cross_validation
from prophet.diagnostics import performance_metrics


# Grid of prior scales to search: one controls trend-changepoint flexibility,
# the other seasonality flexibility.
param_grid = {
    'changepoint_prior_scale': [0.01, 0.1, 0.5],
    'seasonality_prior_scale': [0.01, 0.1],
}

# Generate all combinations of parameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
rmses = []  # Store the RMSEs for each params here

# NOTE(review): df_pandas still carries the link_id column, but no regressor or
# grouping is configured, so every link is fitted as one pooled ds/y series.
# Confirm this is intended.

# Use cross validation to evaluate all parameters
for params in all_params:
    m = Prophet(**params).fit(df_pandas)  # Fit model with given params
    df_cv = cross_validation(m, horizon='180 days', parallel="threads")
    # rolling_window=1 aggregates over the whole horizon, yielding a single row.
    df_p = performance_metrics(df_cv, rolling_window=1)
    rmses.append(df_p['rmse'].values[0])

# Tabulate each parameter combination alongside its cross-validated RMSE
tuning_results = pd.DataFrame(all_params)
tuning_results['rmse'] = rmses
print(tuning_results)

INFO:prophet:Making 9 forecasts with cutoffs between 2019-07-15 02:14:13 and 2021-07-04 02:14:13
INFO:prophet:Applying in parallel with <concurrent.futures.thread.ThreadPoolExecutor object at 0x7fa92b139dc0>
INFO:prophet:Making 9 forecasts with cutoffs between 2019-07-15 02:14:13 and 2021-07-04 02:14:13
INFO:prophet:Applying in parallel with <concurrent.futures.thread.ThreadPoolExecutor object at 0x7fa92aad31c0>
INFO:prophet:Making 9 forecasts with cutoffs between 2019-07-15 02:14:13 and 2021-07-04 02:14:13
INFO:prophet:Applying in parallel with <concurrent.futures.thread.ThreadPoolExecutor object at 0x7fa92b281340>
INFO:prophet:Making 9 forecasts with cutoffs between 2019-07-15 02:14:13 and 2021-07-04 02:14:13
INFO:prophet:Applying in parallel with <concurrent.futures.thread.ThreadPoolExecutor object at 0x7fa9147f5fa0>
INFO:prophet:Making 9 forecasts with cutoffs between 2019-07-15 02:14:13 and 2021-07-04 02:14:13
INFO:prophet:Applying in parallel with <concurrent.futures.thread.Threa

In [0]:
# Render the parameter grid and its RMSE column as a notebook table.
display(tuning_results)

changepoint_prior_scale,seasonality_prior_scale,rmse
0.01,0.01,15.796658470816157
0.01,0.1,15.857775895599811
0.1,0.01,15.935684051985374
0.1,0.1,16.00211449559187
0.5,0.01,16.052532924229794
0.5,0.1,15.93716293179397


In [0]:
# Show the row(s) of the tuning table that achieved the lowest RMSE.
lowest_rmse = tuning_results['rmse'].min()
print(tuning_results.loc[tuning_results['rmse'] == lowest_rmse])

   changepoint_prior_scale  seasonality_prior_scale       rmse
0                     0.01                     0.01  15.796658


## Note the best parameter values shown above for use in further analysis