In [1]:
# imports used throughout this notebook
from pyspark.sql import DataFrame, Column, Window
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pandas as pd
import os
import sys

# add homemade helpers
sys.path.insert(1, '../../scripts')
import helpers.join_helpers as jh
import helpers.plot_helpers as ph

# path where the data files are stored
DATA_PATH = '../../data'

In [2]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName('MAST30034 XT Project 1')
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.sql.repl.eagerEval.enabled', True) 
    .config('spark.sql.parquet.cacheMetadata', 'true')
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

22/08/16 23:45:18 WARN Utils: Your hostname, Polaris resolves to a loopback address: 127.0.1.1; using 172.22.169.117 instead (on interface eth0)
22/08/16 23:45:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/16 23:45:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# read in the aggregated covid data
covid_df = spark.read.parquet(f'{DATA_PATH}/curated/virals/covid/aggregated/cases_by_week')
covid_df.limit(5)

week_year,week_month,week_ending,week_index,timeline,borough,population,tot_cases,tot_pc_cases,tot_p100k_cases
2021,12,2022-01-01,209,0,Staten Island,493494.0,18157.0,0.03679274722691664,3679.274722691664
2021,5,2021-05-29,178,2,Staten Island,493494.0,177.0,3.586669746744641...,35.866697467446414
2020,4,2020-04-04,118,1,Staten Island,495522.0,2188.0,0.004415545626632118,441.5545626632117
2020,7,2020-07-25,134,2,Staten Island,495522.0,158.0,3.188556713929956...,31.885567139299567
2021,3,2021-03-13,167,2,Staten Island,493494.0,1532.0,0.003104394379668...,310.4394379668243


In [5]:
# read in the aggregated covid data
flu_df = spark.read.parquet(f'{DATA_PATH}/curated/virals/flu/aggregated/cases_by_week')
flu_df.limit(5)

week_year,week_month,week_ending,week_index,timeline,borough,population,tot_cases,tot_pc_cases,tot_p100k_cases
2019,2,2019-02-16,59,1,Bronx,1418207.0,934.0,6.585780496077089E-4,65.85780496077089
2019,1,2019-01-26,56,1,Bronx,1418207.0,857.0,6.042841418777372E-4,60.42841418777372
2019,1,2019-01-12,54,1,Bronx,1418207.0,493.0,3.476220326087799...,34.762203260878
2019,11,2019-11-30,100,1,Bronx,1418207.0,217.0,1.530101036026475...,15.301010360264756
2019,3,2019-03-16,63,1,Bronx,1418207.0,554.0,3.906340893818744...,39.06340893818744


In [3]:
# read in the aggregated yellow tlc data
tlc_pu_df = spark.read.parquet(f'{DATA_PATH}/curated/tlc/aggregated/yellow/by_pu')
tlc_pu_df.limit(5)

                                                                                

week_year,week_month,week_ending,week_index,timeline,pu_borough,population,num_trips,num_pc_trips,num_p100k_trips,avg_passengers,avg_trip_distance
2020,9,2020-09-19,142,2,Brooklyn,2727393.0,2290,0.0008396296389995868,83.96296389995868,1.3458515283842796,2.9860742358078607
2021,2,2021-02-20,164,2,Manhattan,1576876.0,291970,0.1851572349379406,18515.72349379406,1.4186286262287222,2.1355327944652136
2020,9,2020-09-19,142,2,Staten Island,495522.0,39,7.870488091346096e-05,7.870488091346096,1.0,31.242307692307687
2021,1,2021-01-09,158,2,Staten Island,493494.0,55,0.0001114501898705962,11.14501898705962,1.0363636363636364,30.25254545454545
2020,9,2020-09-26,143,2,Bronx,1466438.0,904,0.0006164597480425358,61.64597480425357,1.3152654867256637,3.758019911504428


In [6]:
# join the datasets by week_index
joined_pu_df = jh.join_by_week_by_borough(tlc_pu_df, covid_df, 'covid')
joined_pu_df = jh.join_by_week_by_borough(joined_pu_df, flu_df, 'flu')

In [None]:
# filter for only timeline 2
joined_pu_df = joined_pu_df.where(F.col('timeline') == 2)

In [9]:
joined_pu_df.limit(5)

week_year,week_month,week_ending,week_index,timeline,pu_borough,population,num_trips,num_pc_trips,num_p100k_trips,avg_passengers,avg_trip_distance,covid_week_year,covid_week_month,covid_week_ending,covid_week_index,covid_timeline,covid_borough,covid_population,covid_tot_cases,covid_tot_pc_cases,covid_tot_p100k_cases,flu_week_year,flu_week_month,flu_week_ending,flu_week_index,flu_timeline,flu_borough,flu_population,flu_tot_cases,flu_tot_pc_cases,flu_tot_p100k_cases


In [10]:
# remove all columns except those being fitted
joined_pu_df = joined_pu_df.select(
    'avg_trip_distance',
    'week_index',
    'pu_borough',
    'covid_tot_p100k_cases',
    'flu_tot_p100k_cases'
)

In [11]:
normal_lm = ols(
    formula = 'avg_trip_distance ~ week_index + pu_borough * covid_tot_p100k_cases + pu_borough * flu_tot_p100k_cases',
    data = joined_pu_df.toPandas()
).fit()


ValueError: negative dimensions are not allowed

In [None]:
print(normal_lm.summary())

In [None]:
# table = sm.stats.anova_lm(moore_lm, typ=2)
table = sm.stats.anova_lm(normal_lm, typ=2)
table

In [None]:
# test for interaction
