### MAST30034: Applied Data Science Project 1
---
# Data Analysis: Geospatial Mapping of Average Trip Radiuses/Distances
#### Xavier Travers (1178369)

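This notebook joins the aggregated yellow-taxi trip data (grouped by pickup borough) with the weekly COVID-19 and influenza case counts, then maps trip distances per borough: once for the weeks with the most cases of each virus, and once averaged over the whole time period.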

In [None]:
# imports used throughout this notebook
import sys
import geopandas as gpd

# make the project's helper scripts importable: the scripts directory must be
# put on the module search path *before* the two helper imports below run
sys.path.insert(1, '../../scripts')
import helpers.join_helpers as jh
import helpers.plot_helpers as ph

# root directory of the data files; the reads below are built relative to it
DATA_PATH = '../../data'

In [None]:
from pyspark.sql import SparkSession

# Settings applied to the Spark session that runs this notebook's Spark jobs.
spark_settings = {
    'spark.sql.session.timeZone': 'Etc/UTC',
    'spark.sql.repl.eagerEval.enabled': True,
    'spark.sql.parquet.cacheMetadata': 'true',
    'spark.executor.memory': '2g',
    'spark.driver.memory': '4g',
}

# Start from a named builder and apply each setting in turn.
session_builder = SparkSession.builder.appName('MAST30034 XT Project 1')
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()

In [None]:
# load the borough outlines (GeoJSON) into a GeoDataFrame and preview them
boroughs_path = f'{DATA_PATH}/raw/tlc_zones/boroughs.geojson'
borough_gj = gpd.read_file(boroughs_path)
borough_gj.head()

In [None]:
# summarise the borough frame: number of rows, the columns, their non-null
# counts and their dtypes
borough_gj.info()

In [None]:
# attach each borough's centroid as a (y, x) tuple, i.e. (latitude, longitude)
# if the geometries are stored in a lon/lat CRS — TODO confirm the CRS
def _centroid_yx(shape):
    """Return the centroid of `shape` as a `(y, x)` tuple."""
    centre = shape.centroid
    return (centre.y, centre.x)

borough_gj['centroid'] = borough_gj['geometry'].apply(_centroid_yx)
borough_gj.head()

In [None]:
# load the weekly aggregated COVID-19 cases and preview the first rows
covid_path = f'{DATA_PATH}/curated/virals/covid/aggregated/cases_by_week'
covid_df = spark.read.parquet(covid_path)
covid_df.limit(5)

In [None]:
# load the weekly aggregated influenza cases and preview the first rows
flu_path = f'{DATA_PATH}/curated/virals/flu/aggregated/cases_by_week'
flu_df = spark.read.parquet(flu_path)
flu_df.limit(5)

### By Pickup Borough

In [None]:
# load the yellow-taxi (TLC) data aggregated by pickup location and preview it
tlc_pu_path = f'{DATA_PATH}/curated/tlc/aggregated/yellow/by_pu'
tlc_pu_df = spark.read.parquet(tlc_pu_path)
tlc_pu_df.limit(5)

In [None]:
# Merge the TLC pickup data with the COVID-19 and flu data by preceding week
# (i.e. the COVID-19 and flu cases of week one are joined to the taxi data
# from week two).
# This shows whether there is an immediate weekly correlation due to the viruses.
# NOTE(review): the one-week lag is implemented inside
# jh.join_by_week_by_borough — confirm there if the offset matters.
tlc_covid_pu_df = jh.join_by_week_by_borough(tlc_pu_df, covid_df, 'covid')

# The joined frame is converted with .toPandas() by several plotting cells;
# cache it so the Spark joins are computed once instead of once per plot.
joined_pu_df = jh.join_by_week_by_borough(tlc_covid_pu_df, flu_df, 'flu').cache()

In [None]:
# Map the trip distances per borough for the maximum COVID-19 cases
# (the plot helper is noted as also saving the figure).
# Collect the joined Spark frame into pandas first, then hand it to the helper.
joined_pu_pdf = joined_pu_df.toPandas()
ph.geospatial_distances_when_max(
    joined_pu_pdf,
    borough_gj,
    'covid_tot_p100k_cases',
    'covid',
    'Maximum COVID-19 Cases Per 100k People (by MMWR Week)',
)

In [None]:
# Map the trip distances per borough for the maximum influenza cases
# (the plot helper is noted as also saving the figure).
# Collect the joined Spark frame into pandas first, then hand it to the helper.
joined_pu_pdf = joined_pu_df.toPandas()
ph.geospatial_distances_when_max(
    joined_pu_pdf,
    borough_gj,
    'flu_tot_p100k_cases',
    'flu',
    'Maximum Influenza Cases Per 100k People (by MMWR Week)',
)

In [None]:
# Map the average trip radius per borough over the whole time period,
# ignoring the viral case counts.
joined_pu_pdf = joined_pu_df.toPandas()
ph.geospatial_average_distance(joined_pu_pdf, borough_gj)