In [1]:
# imports used throughout this notebook
from pyspark.sql import DataFrame, Column, Window
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pandas as pd
import os
import sys

# Make the project's own helper package importable. '../scripts' is a relative
# path, so it is resolved against the kernel's current working directory.
sys.path.insert(1, '../scripts')
import helpers.join_helpers as jh
import helpers.plot_helpers as ph

# Debug switch; it is not referenced by any cell visible in this part of the notebook.
# NOTE(review): presumably consumed further down or by the helpers — confirm, otherwise remove.
DEBUGGING = False

In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that runs every Spark job in this notebook.
# Each setting is applied to the builder in turn before the session is created.
session_builder = SparkSession.builder.appName('MAST30034 XT Project 1')
for option_name, option_value in (
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ('spark.sql.repl.eagerEval.enabled', True),  # DataFrames ending a cell are rendered as tables
    ('spark.sql.parquet.cacheMetadata', 'true'),
    ("spark.executor.memory", "2g"),
    ("spark.driver.memory", "4g"),
):
    session_builder = session_builder.config(option_name, option_value)
spark = session_builder.getOrCreate()

22/08/15 19:29:01 WARN Utils: Your hostname, Polaris resolves to a loopback address: 127.0.1.1; using 172.22.165.153 instead (on interface eth0)
22/08/15 19:29:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/15 19:29:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/15 19:29:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Load the aggregated yellow-taxi (TLC) data stored under 'by_pu' in the curated
# layer, then preview five rows (the preview shows one pu_borough per row).
tlc_pu_df = (
    spark.read
    .parquet('../data/curated/tlc/aggregated/yellow/by_pu')
)
tlc_pu_df.limit(5)

                                                                                

week_year,week_month,week_ending,week_index,timeline,pu_borough,population,num_trips,num_pc_trips,num_p100k_trips,avg_passengers,avg_trip_distance
2020,4,2020-04-25,121,keep for graphing,Queens,2395791.0,1612,0.0006728466715168394,67.28466715168393,1.1457816377171215,6.7971650124069445
2020,4,2020-04-04,118,keep for graphing,Queens,2395791.0,2342,0.0009775477076255817,97.75477076255817,1.231426131511529,7.541665243381725
2020,7,2020-07-25,134,post,Brooklyn,2727393.0,1930,0.0007076354599428832,70.76354599428832,1.2279792746113989,2.8218341968911957
2020,6,2020-06-13,128,post,Brooklyn,2727393.0,1313,0.0004814121030595884,48.14121030595884,1.300837776085301,3.505689261233818
2020,4,2020-04-04,118,keep for graphing,Manhattan,1687834.0,51640,0.0305954258534903,3059.5425853490333,1.2797637490317584,2.3961254841208284


In [4]:
# Load the aggregated COVID case counts ('cases-by-week') from the curated
# layer, then preview five rows.
covid_df = (
    spark.read
    .parquet('../data/curated/virals/covid/aggregated/cases-by-week')
)
covid_df.limit(5)

week_year,week_month,week_ending,week_index,timeline,borough,population,tot_cases,tot_pc_cases,tot_p100k_cases
2020,4,2020-04-18,120,keep for graphing,Staten Island,495522.0,1688.0,0.003406508691844156,340.6508691844156
2021,3,2021-03-20,168,post,Staten Island,493494.0,1514.0,0.003067919772074...,306.791977207423
2020,5,2020-05-23,125,keep for graphing,Staten Island,495522.0,293.0,5.912956437857451E-4,59.129564378574514
2020,12,2020-12-19,155,post,Staten Island,495522.0,1954.0,0.003943316341151352,394.3316341151352
2021,2,2021-02-20,164,post,Staten Island,493494.0,1244.0,0.002520800658164...,252.08006581640305


In [5]:
# Load the aggregated flu case counts ('cases-by-week') from the curated
# layer, then preview five rows.
flu_df = (
    spark.read
    .parquet('../data/curated/virals/flu/aggregated/cases-by-week')
)
flu_df.limit(5)

week_year,week_month,week_ending,week_index,timeline,borough,population,tot_cases,tot_pc_cases,tot_p100k_cases
2020,3,2020-03-21,116,keep for graphing,Staten Island,495522.0,30.0,6.054221608727766E-5,6.054221608727766
2019,5,2019-05-11,71,pre,Bronx,1418207.0,18.0,1.269208232648689...,1.2692082326486895
2019,12,2019-12-14,102,keep for graphing,Manhattan,1628706.0,259.0,1.590219474846903E-4,15.90219474846903
2021,12,2021-12-18,207,neither,Staten Island,493494.0,207.0,4.194579873311529...,41.945798733115296
2020,10,2020-10-31,148,post,Bronx,1466438.0,2.0,1.363849000094105...,0.1363849000094105


In [6]:
# Attach the covid and then the flu case data to the trip data, matching on
# week and borough. The last argument shows up as the prefix of the added
# columns (covid_*, flu_*) in the joined result.
# NOTE(review): in the preview below the matched covid/flu week_index is one
# less than the trip week_index, so the helper appears to pair trips with the
# previous week's cases — confirm in helpers/join_helpers.
joined_pu_df = jh.join_by_week_by_borough(
    jh.join_by_week_by_borough(tlc_pu_df, covid_df, 'covid'),
    flu_df,
    'flu',
)

In [7]:
# Preview five rows of the joined frame to check the covid_* / flu_* columns came through
joined_pu_df.limit(5)

22/08/15 19:29:10 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


week_year,week_month,week_ending,week_index,timeline,pu_borough,population,num_trips,num_pc_trips,num_p100k_trips,avg_passengers,avg_trip_distance,covid_week_year,covid_week_month,covid_week_ending,covid_week_index,covid_timeline,covid_borough,covid_population,covid_tot_cases,covid_tot_pc_cases,covid_tot_p100k_cases,flu_week_year,flu_week_month,flu_week_ending,flu_week_index,flu_timeline,flu_borough,flu_population,flu_tot_cases,flu_tot_pc_cases,flu_tot_p100k_cases
2020,4,2020-04-25,121,keep for graphing,Queens,2395791.0,1612,0.0006728466715168394,67.28466715168393,1.1457816377171215,6.7971650124069445,2020,4,2020-04-18,120,keep for graphing,Queens,2395791.0,7313.0,0.003052436543922237,305.2436543922237,,,,,,,,,,
2020,4,2020-04-04,118,keep for graphing,Queens,2395791.0,2342,0.0009775477076255817,97.75477076255817,1.231426131511529,7.541665243381725,2020,3,2020-03-28,117,keep for graphing,Queens,2395791.0,9269.0,0.003868868361221826,386.8868361221826,2020.0,3.0,2020-03-28,117.0,keep for graphing,Queens,2395791.0,33.0,1.377415642683355...,1.377415642683356
2020,7,2020-07-25,134,post,Brooklyn,2727393.0,1930,0.0007076354599428832,70.76354599428832,1.2279792746113989,2.8218341968911957,2020,7,2020-07-18,133,post,Brooklyn,2727393.0,693.0,2.540887946841544...,25.40887946841544,,,,,,,,,,
2020,6,2020-06-13,128,post,Brooklyn,2727393.0,1313,0.0004814121030595884,48.14121030595884,1.300837776085301,3.505689261233818,2020,6,2020-06-06,127,post,Brooklyn,2727393.0,959.0,3.516178269871632E-4,35.16178269871632,,,,,,,,,,
2020,4,2020-04-04,118,keep for graphing,Manhattan,1687834.0,51640,0.0305954258534903,3059.5425853490333,1.2797637490317584,2.3961254841208284,2020,3,2020-03-28,117,keep for graphing,Manhattan,1687834.0,3722.0,0.002205193164730...,220.5193164730655,2020.0,3.0,2020-03-28,117.0,keep for graphing,Manhattan,1687834.0,14.0,8.29465456911047E-6,0.8294654569110469


In [8]:
# Keep only the rows whose `timeline` label is 'post'
# (apparently what the project calls "timeline 3" — confirm the numbering).
joined_pu_df = joined_pu_df.filter(F.col('timeline') == 'post')

In [9]:
# Preview five rows after the filter; every row should now have timeline == 'post'
joined_pu_df.limit(5)

week_year,week_month,week_ending,week_index,timeline,pu_borough,population,num_trips,num_pc_trips,num_p100k_trips,avg_passengers,avg_trip_distance,covid_week_year,covid_week_month,covid_week_ending,covid_week_index,covid_timeline,covid_borough,covid_population,covid_tot_cases,covid_tot_pc_cases,covid_tot_p100k_cases,flu_week_year,flu_week_month,flu_week_ending,flu_week_index,flu_timeline,flu_borough,flu_population,flu_tot_cases,flu_tot_pc_cases,flu_tot_p100k_cases
2020,7,2020-07-25,134,post,Brooklyn,2727393.0,1930,0.0007076354599428832,70.76354599428832,1.2279792746113989,2.8218341968911957,2020,7,2020-07-18,133,post,Brooklyn,2727393.0,693.0,2.540887946841544...,25.40887946841544,,,,,,,,,,
2020,6,2020-06-13,128,post,Brooklyn,2727393.0,1313,0.0004814121030595884,48.14121030595884,1.300837776085301,3.505689261233818,2020,6,2020-06-06,127,post,Brooklyn,2727393.0,959.0,3.516178269871632E-4,35.16178269871632,,,,,,,,,,
2020,7,2020-08-01,135,post,Brooklyn,2727393.0,2150,0.0007882985693664242,78.82985693664243,1.2237209302325582,2.4419441860465145,2020,7,2020-07-25,134,post,Brooklyn,2727393.0,488.0,1.789254427213093...,17.892544272130934,,,,,,,,,,
2020,7,2020-07-18,133,post,Manhattan,1687834.0,151282,0.089630852323155,8963.0852323155,1.37563622902923,2.318460491003537,2020,7,2020-07-11,132,post,Manhattan,1687834.0,494.0,2.926828112243265...,29.268281122432654,,,,,,,,,,
2020,6,2020-06-20,129,post,Manhattan,1687834.0,104550,0.0619432953714642,6194.3295371464255,1.3489813486370157,2.385740985174549,2020,6,2020-06-13,128,post,Manhattan,1687834.0,367.0,2.174384447759673E-4,21.74384447759673,,,,,,,,,,


In [10]:
# Narrow the frame to the columns named in the regression formula fitted next.
# Note: `timeline` is dropped here, so the 'post' filter cell cannot be re-run
# after this one without rebuilding joined_pu_df from the join.
joined_pu_df = joined_pu_df.select([
    'avg_trip_distance',      # response
    'week_index',             # time trend term
    'pu_borough',             # categorical predictor
    'covid_tot_p100k_cases',  # covid cases per 100k
    'flu_tot_p100k_cases',    # flu cases per 100k
])

In [11]:
# Ordinary least squares fit of average trip distance on a week trend plus
# borough-specific covid and flu effects (`a * b` expands to a + b + a:b).
# The Spark frame is collected to pandas because statsmodels works in memory.
# NOTE(review): the flu columns are null for some 'post' rows in the preview
# above; the formula API drops rows with missing values by default, so the fit
# likely uses only the weeks that have flu data — confirm this is intended.
normal_lm = ols(
    formula=(
        'avg_trip_distance ~ week_index'
        ' + pu_borough * covid_tot_p100k_cases'
        ' + pu_borough * flu_tot_p100k_cases'
    ),
    data=joined_pu_df.toPandas(),
).fit()


In [12]:
# Print the plain-text regression summary (fit statistics and coefficient table)
print(normal_lm.summary())

                            OLS Regression Results                            
Dep. Variable:      avg_trip_distance   R-squared:                       0.989
Model:                            OLS   Adj. R-squared:                  0.988
Method:                 Least Squares   F-statistic:                     936.6
Date:                Mon, 15 Aug 2022   Prob (F-statistic):          6.30e-143
Time:                        19:29:12   Log-Likelihood:                -235.95
No. Observations:                 170   AIC:                             503.9
Df Residuals:                     154   BIC:                             554.1
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                                        coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------

In [15]:
# Type II ANOVA table (typ=2) for the fitted model, displayed as the cell output
table = sm.stats.anova_lm(normal_lm, typ=2)
table

Unnamed: 0,sum_sq,df,F,PR(>F)
pu_borough,12684.293728,4.0,3056.4685,1.5370029999999999e-145
week_index,0.97451,1.0,0.93929,0.3339801
covid_tot_p100k_cases,38.206126,1.0,36.825328,9.635711e-09
pu_borough:covid_tot_p100k_cases,24.558254,4.0,5.917675,0.0001862774
flu_tot_p100k_cases,0.637668,1.0,0.614622,0.4342569
pu_borough:flu_tot_p100k_cases,1.469776,4.0,0.354164,0.8408334
Residual,159.774363,154.0,,


In [14]:
# test for interaction
