### MAST30034: Applied Data Science Project 1
---
# Data Analysis: Generating the Model for Trip Distances
#### Xavier Travers (1178369)

In this notebook, an Ordinary Least Squares (OLS) linear model and a Gamma generalized linear model (GLM) are fitted,
both using an additive model with interaction terms.

Trip distance is modelled against:
- Borough
- Preceding Week's index
- COVID-19 Case Rate (per 100'000 people)
    - Interaction with the Borough
- Influenza Case Rate (per 100'000 people)
    - Interaction with the Borough

In [None]:
# imports used throughout this notebook
from pyspark.sql import functions as F
import statsmodels.api as sm
from statsmodels.formula.api import ols, glm
from statsmodels.genmod.families.family import Gamma
from statsmodels.genmod.families import links
import sys

# add homemade helpers
sys.path.insert(1, '../../scripts')
import helpers.join_helpers as jh
import helpers.plot_helpers as ph

# relative path to the directory where the data files are stored
# (the parquet reads in this notebook use it as their path prefix)
DATA_PATH = '../../data'

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session that runs this notebook's Spark jobs.
# The options are gathered in a dict and applied to the builder in the
# order they are listed.
spark_options = {
    "spark.sql.session.timeZone": "Etc/UTC",
    'spark.sql.repl.eagerEval.enabled': True,
    'spark.sql.parquet.cacheMetadata': 'true',
    "spark.executor.memory": "2g",
    "spark.driver.memory": "4g",
}

session_builder = SparkSession.builder.appName('MAST30034 XT Project 1')
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

In [None]:
# load the aggregated COVID-19 cases-by-week parquet data
covid_cases_path = f'{DATA_PATH}/curated/virals/covid/aggregated/cases_by_week'
covid_df = spark.read.parquet(covid_cases_path)

# preview the first 5 rows
covid_df.limit(5)

In [None]:
# load the aggregated flu cases-by-week parquet data
flu_cases_path = f'{DATA_PATH}/curated/virals/flu/aggregated/cases_by_week'
flu_df = spark.read.parquet(flu_cases_path)

# preview the first 5 rows
flu_df.limit(5)

In [None]:
# load the aggregated yellow TLC parquet data (the `by_pu` aggregation)
tlc_pu_path = f'{DATA_PATH}/curated/tlc/aggregated/yellow/by_pu'
tlc_pu_df = spark.read.parquet(tlc_pu_path)

# preview the first 5 rows
tlc_pu_df.limit(5)

In [None]:
# join the TLC data with the COVID-19 data first, then join that result
# with the flu data (both joins go through the same helper)
joined_pu_df = jh.join_by_week_by_borough(
    jh.join_by_week_by_borough(tlc_pu_df, covid_df, 'covid'),
    flu_df,
    'flu',
)

In [None]:
# sanity check: preview the first 5 joined rows to confirm the data
# doesn't look weird before selecting the model columns
joined_pu_df.limit(5)

In [None]:
# keep only the response and the predictors that are fitted below;
# pre_week_index is derived as week_index - 1
# NOTE: joined_pu_df is overwritten, so week_index is no longer available
# afterwards and this cell cannot be re-run without re-running the join cell
fitted_columns = [
    F.col('avg_trip_distance'),
    (F.col('week_index') - 1).alias('pre_week_index'),
    F.col('pu_borough'),
    F.col('covid_tot_p100k_cases'),
    F.col('flu_tot_p100k_cases'),
]
joined_pu_df = joined_pu_df.select(*fitted_columns)

In [None]:
# convert the Spark dataframe to pandas type; the same name is reused, so
# from this point on joined_pu_df refers to the pandas dataframe
# NOTE(review): re-running this cell on its own would presumably fail, since
# the reassigned pandas object is not expected to offer .toPandas()
joined_pu_df = joined_pu_df.toPandas()

### OLS Linear Model

In [None]:
# fit the OLS model described above: the preceding week's index, plus the
# borough interacted with each of the COVID-19 and flu case rates
ols_formula = 'avg_trip_distance ~ pre_week_index + pu_borough * covid_tot_p100k_cases + pu_borough * flu_tot_p100k_cases'
gaussian_model = ols(formula=ols_formula, data=joined_pu_df).fit()

In [None]:
# print the plain-text summary of the fitted OLS model
print(gaussian_model.summary())

In [None]:
# generate an ANOVA table for the fitted OLS model
# (typ=2 selects the Type II ANOVA test in statsmodels' anova_lm)
table = sm.stats.anova_lm(gaussian_model, typ=2)
# the bare name is the cell's last expression, so the notebook displays it
table

The ANOVA table above suggests that the interaction terms between the borough and the COVID-19 case rate are not statistically significant.

In [None]:
# show and plot the observed-vs-fitted diagnostic for the OLS model
# NOTE(review): the final argument looks like the human-readable label used
# for the response variable -- confirm against helpers/plot_helpers.py
ph.diagnostic_observed_fitted(joined_pu_df, gaussian_model, 'avg_trip_distance',
    'Average Weekly Trip Distance (Miles)')

### Gamma GLM

In [None]:
# generate the Gamma GLM using the same formula as the OLS model
# NOTE: Gamma() is left on its default link, which is the inverse power link.
# This fits the same model as passing links.inverse_power() explicitly, but
# avoids the lowercase link alias that statsmodels deprecated in 0.14 in
# favour of links.InversePower (the alias warns and is slated for removal).
gamma_model = glm(
    formula = 'avg_trip_distance ~ pre_week_index + pu_borough * covid_tot_p100k_cases + pu_borough * flu_tot_p100k_cases',
    data = joined_pu_df,
    family = Gamma()
).fit()

In [None]:
# print the plain-text summary of the fitted Gamma GLM
print(gamma_model.summary())

In [None]:
# show and plot the observed-vs-fitted diagnostic for the Gamma GLM
# NOTE(review): the final argument looks like the human-readable label used
# for the response variable -- confirm against helpers/plot_helpers.py
ph.diagnostic_observed_fitted(joined_pu_df, gamma_model, 'avg_trip_distance',
    'Average Weekly Trip Distance (Miles)')