### MAST30034: Applied Data Science Project 1
---
# Aggregating TLC Data
#### Xavier Travers (1178369)

Aggregate the TLC trip data by month.
For each taxi type, this means counting the trips between each pair of pickup and drop-off boroughs per month,
and computing the average trip distance of each of those groups.

In [1]:
# imports used throughout this notebook
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
import os
import re
from itertools import chain

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the spark session which will run the spark jobs.
# - eagerEval: lets the notebook render a DataFrame that ends a cell
# - cacheMetadata: parquet metadata caching setting
spark = SparkSession.builder\
    .appName('MAST30034 XT Project 1')\
    .config('spark.sql.repl.eagerEval.enabled', True)\
    .config('spark.sql.parquet.cacheMetadata', 'true')\
    .getOrCreate()

22/07/31 01:06:05 WARN Utils: Your hostname, Polaris resolves to a loopback address: 127.0.1.1; using 192.168.153.180 instead (on interface eth0)
22/07/31 01:06:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/07/31 01:06:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/07/31 01:06:06 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/07/31 01:06:06 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# read in the taxi zones dataset
# (kept at module level: the functions below join trip data against it
# to translate location ids into boroughs)
zones_df = spark.read.parquet('../data/raw/tlc_zones/zones')

In [4]:
def prefix_column_names(df: DataFrame, prefix: str) -> DataFrame:
    """ Add a prefix to the columns names of a `DataFrame`.

    All columns are renamed positionally in one step. Renaming them one
    after another by name could rename a column twice when a freshly
    prefixed name equals another existing column name (e.g. columns `a`
    and `PUa` with the prefix `PU`).

    Args:
    - df (`DataFrame`): The `DataFrame` for which to add prefixes
    - prefix (str): The prefix

    Returns:
        `DataFrame`: A new `DataFrame` with the same data, where every
        column name is `prefix` followed by the original name
    """

    return df.toDF(*[prefix + col for col in df.columns])

In [5]:
# define the tlc dataset names
# NOTE(review): this list is reassigned (without 'fhv') at the top of the
# final aggregation cell and is not read anywhere before that, so the value
# set here is effectively unused.
TLC_NAMES = ['yellow', 'green', 'fhv', 'fhvhv']

In [6]:
def add_borough_names(df: DataFrame) -> DataFrame:
    """ Attach the pickup and dropoff zone details to a trips `DataFrame`.

    The `LocationID`, `borough` and `zone` columns of `zones_df` are
    prefixed with `PU` and then with `DO`, and each prefixed copy is
    inner-joined to the trips on `PULocationID` / `DOLocationID`.
    Because the joins are inner joins, trips whose location id has no
    match in `zones_df` are not part of the result.

    Args:
    - df (`DataFrame`): Trips with `PULocationID` and `DOLocationID` columns

    Returns:
        `DataFrame`: The trips with the prefixed `borough` and `zone`
        columns added for both the pickup and the dropoff location
    """
    zone_lookup = zones_df.select(['LocationID', 'borough', 'zone'])

    with_zones = df
    for side in ('PU', 'DO'):
        with_zones = with_zones.join(
            prefix_column_names(zone_lookup, side),
            on=f'{side}LocationID',
            how='inner',
        )
    return with_zones


In [7]:
# Column layout shared by every aggregated output (in this order).
OUT_COL_NAMES = [
    # --- grouping keys ---
    'year',
    'month',
    'type',       # taxi type
    'PUborough',  # pickup borough
    'DOborough',  # dropoff borough
    # sharing configuration of the group
    # (1.0/0.0 flag for fhvhv and number of passengers for yellow/green)
    'shared',
    # --- aggregated values ---
    'total_trips',   # number of trips in the group
    'avg_distance',  # average trip distance of the group
]

In [8]:
def aggregate_trips_green_yellow(df: DataFrame, year:int, month:int, taxi_type:str) -> DataFrame:
    """ Aggregate one month of green or yellow taxi trips.

    Trips are inner-joined to `zones_df` on their pickup and dropoff
    location ids, then grouped by pickup borough, dropoff borough and
    passenger count. Each group gets its trip count and its average
    `trip_distance`.

    Args:
    - df (`DataFrame`): Trips with `PULocationID`, `DOLocationID`,
      `passenger_count` and `trip_distance` columns
    - year (int): Year written to the `year` column of every row
    - month (int): Month written to the `month` column of every row
    - taxi_type (str): Value written to the `type` column of every row

    Returns:
        `DataFrame`: One row per group with the columns of `OUT_COL_NAMES`
        (`shared` holds the passenger count). The result is already
        computed and held locally, it is empty if there are no trips.
    """
    group_keys = ['PUborough', 'DOborough', 'passenger_count']

    joined_df = df\
        .select('PULocationID', 'DOLocationID', 'passenger_count', 'trip_distance')\
        .join(prefix_column_names(zones_df, 'PU'), 'PULocationID', 'inner')\
        .join(prefix_column_names(zones_df, 'DO'), 'DOLocationID', 'inner')

    # compute both aggregates in a single pass over the trips
    summary_df = joined_df\
        .groupBy(group_keys)\
        .agg(
            F.count('*').alias('total_trips'),
            F.avg('trip_distance').alias('avg_distance'))

    # Groups with a null key are excluded. This keeps the output identical
    # to merging separately computed aggregates with an inner join on the
    # group keys, where null keys never match.
    for key in group_keys:
        summary_df = summary_df.where(F.col(key).isNotNull())

    summary_df = summary_df\
        .withColumn('type', F.lit(taxi_type))\
        .withColumn('year', F.lit(year))\
        .withColumn('month', F.lit(month))\
        .withColumnRenamed('passenger_count', 'shared')\
        .select(OUT_COL_NAMES)

    # Materialise the (small) aggregate once, so that later unions, counts
    # and writes do not re-scan the raw trip data. Passing the schema
    # explicitly keeps this working when there are no rows at all.
    return spark.createDataFrame(summary_df.collect(), schema=summary_df.schema)

In [9]:
def aggregate_trips_fhvhv(df: DataFrame, year:int, month:int) -> DataFrame:
    """ Aggregate one month of high volume for-hire vehicle (fhvhv) trips.

    Trips are inner-joined to `zones_df` on their pickup and dropoff
    location ids, then grouped by pickup borough, dropoff borough,
    license number and shared request flag. Each group gets its trip
    count and its average `trip_miles`.

    Args:
    - df (`DataFrame`): Trips with `PULocationID`, `DOLocationID`,
      `shared_request_flag`, `trip_miles` and `hvfhs_license_num` columns
    - year (int): Year written to the `year` column of every row
    - month (int): Month written to the `month` column of every row

    Returns:
        `DataFrame`: One row per group with the columns of `OUT_COL_NAMES`.
        `type` is the company name mapped from the license number and
        `shared` is 1.0 for a `Y` flag and 0.0 for an `N` flag (both are
        null for values missing from the mappings). The result is already
        computed and held locally, it is empty if there are no trips.
    """
    group_keys = ['PUborough', 'DOborough', 'hvfhs_license_num', 'shared_request_flag']

    joined_df = df\
        .select('PULocationID', 'DOLocationID', 'shared_request_flag', 'trip_miles', 'hvfhs_license_num')\
        .join(prefix_column_names(zones_df, 'PU'), 'PULocationID', 'inner')\
        .join(prefix_column_names(zones_df, 'DO'), 'DOLocationID', 'inner')

    # compute both aggregates in a single pass over the trips
    summary_df = joined_df\
        .groupBy(group_keys)\
        .agg(
            F.count('*').alias('total_trips'),
            F.avg('trip_miles').alias('avg_distance'))

    # Groups with a null key are excluded. This keeps the output identical
    # to merging separately computed aggregates with an inner join on the
    # group keys, where null keys never match.
    for key in group_keys:
        summary_df = summary_df.where(F.col(key).isNotNull())

    licenses_dict = {
        'HV0002': 'juno',
        'HV0003': 'uber',
        'HV0004': 'via',
        'HV0005': 'lyft'
    }

    flags_dict = {
        'Y': 1.0,
        'N': 0.0
    }

    # turn the dicts into map expressions that can be indexed by a column
    # from: https://stackoverflow.com/questions/42980704/pyspark-create-new-column-with-mapping-from-a-dict
    license_mapping_expr = F.create_map([F.lit(x) for x in chain(*licenses_dict.items())])
    flag_mapping_expr = F.create_map([F.lit(x) for x in chain(*flags_dict.items())])

    summary_df = summary_df\
        .withColumn('year', F.lit(year))\
        .withColumn('month', F.lit(month))\
        .withColumn('type', license_mapping_expr[F.col('hvfhs_license_num')])\
        .withColumn('shared', flag_mapping_expr[F.col('shared_request_flag')])\
        .select(OUT_COL_NAMES)

    # Materialise the (small) aggregate once, so that later unions, counts
    # and writes do not re-scan the raw trip data. Passing the schema
    # explicitly keeps this working when there are no rows at all.
    return spark.createDataFrame(summary_df.collect(), schema=summary_df.schema)


In [10]:
def aggregate_trips(df: DataFrame, year:int, month:int, taxi_type: str) -> DataFrame:
    """ Aggregate one month of trips with the routine matching the taxi type.

    Args:
    - df (`DataFrame`): The trips of a single month
    - year (int): Year of the trips
    - month (int): Month of the trips
    - taxi_type (str): `'fhvhv'` selects `aggregate_trips_fhvhv`, any other
      value is passed on to `aggregate_trips_green_yellow`

    Returns:
        `DataFrame`: The aggregated trips returned by the selected routine
    """
    if taxi_type != 'fhvhv':
        return aggregate_trips_green_yellow(df, year, month, taxi_type)
    return aggregate_trips_fhvhv(df, year, month)

In [11]:
# Spot check of the green/yellow aggregation on a single month.
# NOTE(review): `tlc_yellow_df` is not defined in the visible cells, so this
# raises a NameError unless a yellow trips DataFrame is loaded first.
aggregate_trips_green_yellow(tlc_yellow_df, 2019, 6, 'yellow').limit(5)

NameError: name 'tlc_yellow_df' is not defined

In [None]:
# Spot check of the fhvhv aggregation on a single month.
# NOTE(review): `tlc_fhvhv_df` is not defined in the visible cells; it
# presumably has to be loaded first - confirm before re-running.
aggregate_trips_fhvhv(tlc_fhvhv_df, 2019, 6).limit(5)

                                                                                

year,month,type,PUborough,DOborough,shared,total_trips,avg_distance
2019,6,uber,Manhattan,Manhattan,0.0,3875134,2.5829985801779927
2019,6,juno,Staten Island,Brooklyn,0.0,488,11.24405737704918
2019,6,via,Queens,Brooklyn,1.0,7666,7.53632663709886
2019,6,lyft,Brooklyn,Brooklyn,0.0,897851,2.7321963042865622
2019,6,lyft,Manhattan,Bronx,0.0,67456,6.804377875948789


In [12]:
# taxi types which get aggregated ('fhv' is not aggregated here)
TLC_NAMES = ['green', 'yellow', 'fhvhv']
aggregated_df = None

for name in TLC_NAMES:
    print(f'\nAGGREGATING "{name}" DATA')
    temp_df = None
    # sorted, so the files are processed in a reproducible order
    for filename in sorted(os.listdir(f'../data/raw/tlc/{name}')):
        # file names are expected to start with '<year>-<month>';
        # anything else (e.g. hidden or marker files) is skipped
        name_match = re.match(r'(\d+)[-.](\d+)(?:[-.]|$)', filename)
        if name_match is None:
            print(f'SKIPPING unexpected file "{filename}"')
            continue
        year, month = int(name_match.group(1)), int(name_match.group(2))

        tlc_df = spark.read.parquet(f'../data/raw/tlc/{name}/{filename}')
        month_df = aggregate_trips(tlc_df, year, month, name)
        temp_df = month_df if temp_df is None else temp_df.union(month_df)

    # nothing to report or append when the folder held no usable files
    if temp_df is None:
        print(f'NO DATA FILES FOUND FOR "{name}"')
        continue
    print(temp_df.count())

    aggregated_df = temp_df if aggregated_df is None else aggregated_df.union(temp_df)

print(aggregated_df.count())

# save the aggregated data
aggregated_df.write.mode('overwrite').parquet('../data/curated/tlc/aggregated')

# preview the result (kept as the last expression so the notebook shows it)
aggregated_df.limit(20)



AGGREGATING "green" DATA


                                                                                

3180

AGGREGATING "yellow" DATA


                                                                                

4284

AGGREGATING "fhvhv" DATA


                                                                                

3626


                                                                                

11090


                                                                                

22/07/31 01:17:51 WARN MemoryManager: Total allocation exceeds 95.00% (956,301,300 bytes) of heap memory
Scaling row group sizes to 89.06% for 8 writers
22/07/31 01:17:51 WARN MemoryManager: Total allocation exceeds 95.00% (956,301,300 bytes) of heap memory
Scaling row group sizes to 79.17% for 9 writers
22/07/31 01:17:51 WARN MemoryManager: Total allocation exceeds 95.00% (956,301,300 bytes) of heap memory
Scaling row group sizes to 71.25% for 10 writers
22/07/31 01:17:51 WARN MemoryManager: Total allocation exceeds 95.00% (956,301,300 bytes) of heap memory
Scaling row group sizes to 64.77% for 11 writers
22/07/31 01:17:51 WARN MemoryManager: Total allocation exceeds 95.00% (956,301,300 bytes) of heap memory
Scaling row group sizes to 59.37% for 12 writers
22/07/31 01:17:51 WARN MemoryManager: Total allocation exceeds 95.00% (956,301,300 bytes) of heap memory
Scaling row group sizes to 54.81% for 13 writers
22/07/31 01:17:51 WARN MemoryManager: Total allocation exceeds 95.00% (956,301

                                                                                