In [1]:
from pyspark.sql import SparkSession, functions as F

# Build (or reuse) the Spark session for this notebook.
# Settings are kept in one table and applied to the builder in listing order.
_spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "4g",
}

_session_builder = SparkSession.builder.appName("MAST30034 ASSIGNMENT 1 DUSTIN")
for _setting_name, _setting_value in _spark_settings.items():
    _session_builder = _session_builder.config(_setting_name, _setting_value)

spark = _session_builder.getOrCreate()

22/08/24 19:41:39 WARN Utils: Your hostname, DESKTOP-3ADPNV0 resolves to a loopback address: 127.0.1.1; using 172.25.24.22 instead (on interface eth0)
22/08/24 19:41:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/24 19:41:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Raw yellow-taxi trip records, read from the project's raw data folder
raw_yellow_dir = '../../mast30034-project-1-dustintano10/data/raw/yellow_taxi/'
yellow = spark.read.parquet(raw_yellow_dir)

                                                                                

In [22]:
# Row count of the raw data, taken before any preprocessing step
rows_before_preprocessing = yellow.count()
rows_before_preprocessing

54251562

In [4]:
# Add `is_weekend`: True when the pickup falls on a Saturday or Sunday.
# The abbreviated day name ("E" pattern) is materialised as a temporary
# `day_of_week` column, used for the weekend test, and then dropped again
# because only the weekend flag is needed.

# Star import kept: other cells in this notebook may use these names unqualified.
from pyspark.sql.functions import *
yellow = (
    yellow
    .withColumn("day_of_week", F.date_format(F.col("tpep_pickup_datetime"), "E"))
    .withColumn("is_weekend", F.col("day_of_week").isin(["Sat", "Sun"]).cast("boolean"))
    .drop("day_of_week")
)

In [5]:
# Columns that play no part in the analysis and are removed at the end of
# this step (passenger_count is removed only after it has been filtered on).
columns_not_needed = (
    "extra", "mta_tax", "congestion_surcharge", "airport_fee", "improvement_surcharge",
    "passenger_count", "store_and_fwd_flag",
)

yellow = (
    yellow
    # date-only versions of the pickup and dropoff timestamps
    .withColumn("pickup_date", F.to_date(F.col("tpep_pickup_datetime"), "yyyy-MM-dd"))
    .withColumn("dropoff_date", F.to_date(F.col("tpep_dropoff_datetime"), "yyyy-MM-dd"))
    # keep only trips that report more than zero passengers
    .where(F.col("passenger_count") > 0)
    .drop(*columns_not_needed)
)


In [6]:
# Row-level conditions that define the credit-card subset used for the analysis.
paid_by_credit_card = F.col("payment_type") == 1          # tips are only counted for credit card payments
picked_up_from_october = F.col("pickup_date") >= "2018-10-01"
common_rate_code = (F.col("RatecodeID") == 1) | (F.col("RatecodeID") == 2)  # other codes are a small share of the data
known_vendor = (F.col("VendorID") > 0) & (F.col("VendorID") < 3)            # i.e. VendorID 1 or 2

yellow_credit = (
    yellow
    .filter(paid_by_credit_card)
    .filter(picked_up_from_october)
    .where(common_rate_code)
    .where(known_vendor)
)



In [7]:
# Sanity filters on fare, distance and taxi zones.
fare_at_least_initial_charge = F.col("fare_amount") >= 2.5   # 2.5 is the initial charge
has_distance = F.col("trip_distance") > 0

# both the pickup and the dropoff zone must lie strictly between 0 and 264
zones_in_range = (
    (F.col("PULocationID") < 264) & (F.col("DOLocationID") < 264)
    & (F.col("PULocationID") > 0) & (F.col("DOLocationID") > 0)
)

yellow_credit = (
    yellow_credit
    .where(fare_at_least_initial_charge)
    .where(has_distance)
    .where(zones_in_range)
)

In [8]:
# Trip duration in minutes (dropoff minus pickup, rounded to 4 decimals).
trip_seconds = F.unix_timestamp("tpep_dropoff_datetime") - F.unix_timestamp("tpep_pickup_datetime")
yellow_credit = yellow_credit.withColumn("trip_length", F.round(trip_seconds / 60, 4))

# Keep only trips strictly longer than 2 minutes; this also removes
# negative durations.
yellow_credit = yellow_credit.where(F.col("trip_length") > 2)


In [14]:
# Remove fare_amount outliers with the 1.5 * IQR rule.
#
# The quartiles are read from the data itself rather than typed in by hand,
# so the fences always match the DataFrame being filtered. Previously the
# percentile query was built but never executed or displayed, and the
# quartiles were hard-coded constants.
# Note: collecting the quartiles triggers one Spark job over yellow_credit.
fare_quartiles = (
    yellow_credit
    .select(F.percentile_approx("fare_amount", [0.25, 0.75], 10000).alias("quantiles_fare"))
    .first()["quantiles_fare"]
)

# percentile_approx yields null when there are no rows to aggregate
if fare_quartiles is None:
    raise ValueError("cannot compute fare_amount quartiles: yellow_credit has no rows")

lower_q_fare, upper_q_fare = fare_quartiles

IQ_fare = upper_q_fare - lower_q_fare

# fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are treated as outliers
borderline_upper_fare = upper_q_fare + (1.5 * IQ_fare)
borderline_lower_fare = lower_q_fare - (1.5 * IQ_fare)

yellow_credit = yellow_credit.where( (F.col('fare_amount') <= borderline_upper_fare ) &
                                      (F.col('fare_amount') >= borderline_lower_fare) )




In [16]:
# Load the curated NBA attendance CSV with pandas, parse its Date column
# (stored as YYYYMMDD) into datetimes, then hand the frame to Spark with an
# explicit schema.
import pandas as pd
from pyspark.sql.types import *

nba_attendance_csv = '../../mast30034-project-1-dustintano10/data/curated/nba_attendance_new.csv'

nba_attendance = pd.read_csv(nba_attendance_csv).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y%m%d')
)

# (column name, Spark type) pairs; every field is nullable
_nba_columns = [
    ("Date", DateType()),
    ("Start(ET)", StringType()),
    ("Attendance", IntegerType()),
    ("Win", StringType()),
    ("margin_victory/loss", IntegerType()),
]
schema = StructType([StructField(column_name, column_type, True)
                     for column_name, column_type in _nba_columns])

nba_attendance_spark = spark.createDataFrame(nba_attendance, schema)

In [17]:
# Left-join the NBA attendance rows onto the trips, matching each trip's
# pickup_date with the game Date
same_day_as_game = yellow_credit["pickup_date"] == nba_attendance_spark["Date"]
yellow_credit = yellow_credit.join(nba_attendance_spark, on=same_day_as_game, how="left")

In [18]:
yellow_credit = (
    yellow_credit
    # a non-null Date means the left join found a game on the pickup date,
    # i.e. the Knicks were playing that day
    .filter(F.col("Date").isNotNull())
    # keep only trips that start and end on the same date: the focus is on
    # game days, and trips crossing midnight are a small share of the data
    .where(F.col("pickup_date") == F.col("dropoff_date"))
)


In [19]:
# Drop the columns that were only needed for filtering and joining.
# Note: the joined "Date" column is kept. The column removed as its duplicate
# is "pickup_date", which equals "Date" on every remaining row because the
# join matched pickup_date == Date and unmatched rows were filtered out.
yellow_credit = yellow_credit.drop("VendorID", "payment_type", "RatecodeID", "pickup_date", "dropoff_date")

In [21]:
# Row count once every preprocessing step above has been applied
rows_after_preprocessing = yellow_credit.count()
rows_after_preprocessing

                                                                                

6165834

In [20]:
# Persist the preprocessed yellow_credit DataFrame as parquet in the curated
# data folder, replacing any previous output at that location
curated_yellow_credit_dir = '../../mast30034-project-1-dustintano10/data/curated/yellow/yellow_credit'
yellow_credit.write.parquet(curated_yellow_credit_dir, mode='overwrite')

                                                                                