# Exploratory Data Analysis

In this section, we seek to find any significant confounding factor that may drive the demand in specific location. This section will help us develop deeper understanding of the data and narrow down the significant features for our predictions/classifications later.

In [29]:
import sys
sys.path.append('../')
from scripts.preprocess import preprocess, transform_demand
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import BooleanType
import shapefile as shp
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import datetime


%matplotlib inline
sns.set(style='whitegrid', palette='pastel', color_codes=True)
sns.mpl.rc('figure', figsize=(10,6))

In [2]:
# Starting a Spark session
spk = (
    SparkSession.builder.appName('Playground')
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

# Read the parquet dataset
df = spk.read.parquet('/Users/oliver/Downloads/MAST30034_Python-main/data/tlc_data')


# TLC Taxi Dataset

In [3]:
# Clean the raw TLC taxi dataset
df = preprocess(df)

# Get the daily demand data
pickup_daily_demand, dropoff_daily_demand = transform_demand(df)


# Get the hourly demand data
pickup_hourly_demand, dropoff_hourly_demand = transform_demand(df, 'hour')

In [5]:
pickup_daily_demand

pu_location_id,pickup_date,count
55,2022-03-01,3
229,2022-03-02,2360
113,2022-03-02,1493
225,2022-03-02,7
34,2022-03-03,3
191,2022-03-05,3
235,2022-03-05,6
141,2022-03-07,2339
129,2022-03-07,12
125,2022-03-11,594


## Summary Statistics

In [6]:
# Tabulate summary statistics
operations = [func.mean, func.stddev, func.min, func.max]
exprs = [f(func.col("count")) for f in operations]

location_hourly_statistics = pickup_hourly_demand.groupby(['pu_location_id', 'pickup_date']).agg(*exprs)
location_daily_statistics = pickup_daily_demand.groupby('pu_location_id').agg(*exprs)

In [9]:
# Examine the statistics of hourly demand focusing on standard deviation
location_hourly_statistics.sort(func.col("stddev_samp(count)").desc())

pu_location_id,pickup_date,avg(count),stddev_samp(count),min(count),max(count)
161,2022-03-23,242.0,198.01602921502837,3,597
237,2021-11-03,282.5833333333333,196.65173733557876,2,586
79,2021-11-07,166.91666666666666,192.23602265391648,18,938
237,2021-11-04,280.3333333333333,190.77614167364865,2,551
161,2022-03-24,243.91666666666663,190.27689586875564,3,567
161,2022-03-29,238.95833333333331,190.247545643008,3,605
237,2021-11-17,277.375,190.2471647506269,3,531
161,2022-03-15,223.45833333333331,190.1024847553154,1,595
236,2021-11-23,239.33333333333331,188.02674802010085,3,566
237,2021-11-18,279.375,187.8255000745662,2,516


In [10]:
# Examine the statistics of daily demand focusing on standard deviation
location_daily_statistics.sort(func.col("stddev_samp(count)").desc())

pu_location_id,avg(count),stddev_samp(count),min(count),max(count)
237,4683.061320754717,1307.846125931082,762,6907
161,3828.929245283018,1120.9565734317307,526,5854
79,2544.7971698113206,1117.1535517515056,758,5331
236,4235.278301886792,1107.4944621948969,779,6051
132,3939.3490566037735,892.9362878027387,420,5887
162,3157.627358490566,865.726562682048,562,4790
138,2717.8632075471696,713.2700047545486,50,4373
249,2105.0707547169814,671.2235765799452,631,3763
163,2783.9150943396226,666.7183495943195,550,3979
142,3225.221698113208,664.099813274643,1036,4552


Here we notice that most of the locations with high standard deviations are located at city center. We hypothesize that this is due to the confounding effects of day in the week (more crowded in the city during weekends and public holidays) as well as weather of the day.

To check if this is the case, we can split the daily demand dataset into weekdays and weekends to see if the standard deviation reduces significantly.

In [32]:
# Adding weekday check
# @func.udf(BooleanType)
# def weekday(date):
#     return date.weekday() < 5

weekday_udf = func.udf(lambda x: x.weekday() < 5)

pickup_daily_demand = pickup_daily_demand.withColumn(
    'weekday',
    weekday_udf('pickup_date')
)

In [42]:
import os
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda3/bin/python3'
os.environ["PYSPARK_DRIVER_PYTHON"] = '/opt/anaconda3/bin/python3'

In [41]:
pickup_daily_demand

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 473, in main
    raise Exception(("Python in worker has different version %s than that in " +
Exception: Python in worker has different version 3.8 than that in driver 3.7, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.


DataFrame[pu_location_id: bigint, pickup_date: date, count: bigint, weekday: string]

In [24]:
weekday(datetime.datetime(year=2022, month=3, day=4))

True

In [13]:
weekday('2022-2-15')

AttributeError: 'str' object has no attribute 'weekday'