In [1]:
from pyspark.sql import SparkSession, functions as F

# Cell to create a spark session
spark = (
    SparkSession.builder.appName("MAST30034 ASSIGNMENT 1 DUSTIN")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

22/08/16 15:43:53 WARN Utils: Your hostname, DESKTOP-3ADPNV0 resolves to a loopback address: 127.0.1.1; using 192.168.154.173 instead (on interface eth0)
22/08/16 15:43:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/16 15:43:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
yellow = spark.read.parquet('../../mast30034-project-1-dustintano10/data/raw/yellow_taxi/')

                                                                                

In [3]:
yellow.show(1, vertical=True, truncate=100)
yellow.printSchema()

[Stage 1:>                                                          (0 + 1) / 1]

-RECORD 0------------------------------------
 VendorID              | 1                   
 tpep_pickup_datetime  | 2018-10-01 00:23:34 
 tpep_dropoff_datetime | 2018-10-01 00:44:50 
 passenger_count       | 1.0                 
 trip_distance         | 6.2                 
 RatecodeID            | 1.0                 
 store_and_fwd_flag    | N                   
 PULocationID          | 68                  
 DOLocationID          | 7                   
 payment_type          | 2                   
 fare_amount           | 20.5                
 extra                 | 0.5                 
 mta_tax               | 0.5                 
 tip_amount            | 0.0                 
 tolls_amount          | 0.0                 
 improvement_surcharge | 0.3                 
 total_amount          | 21.8                
 congestion_surcharge  | null                
 airport_fee           | null                
only showing top 1 row

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pic

                                                                                

In [4]:
# create a new column called day_of_week to get day of week for the trip and is_weekend which identify if record is a 
# weekend or not
from pyspark.sql.functions import *
yellow = yellow.withColumn("day_of_week", date_format(col("tpep_pickup_datetime"),"E"))

yellow = yellow.withColumn("is_weekend", col("day_of_week").isin(["Sat", "Sun"]).cast("boolean"))

In [5]:
# separates the date and time from both the pickup and dropoff date time columns
# creates new columns for pickup/dropoff date and pickup/dropoff time

yellow = yellow.withColumn("pickup_date",
                 to_date(col("tpep_pickup_datetime"),"yyyy-MM-dd"))

yellow = yellow.withColumn("dropoff_date",
                 to_date(col("tpep_dropoff_datetime"),"yyyy-MM-dd"))

yellow = yellow.withColumn("pickup_time", date_format('tpep_pickup_datetime', 'HH:mm:ss'))

yellow = yellow.withColumn("dropoff_time", date_format('tpep_dropoff_datetime', 'HH:mm:ss'))

#drop columns that are not important for analysis
yellow = yellow.drop("extra", "mta_tax", "congestion_surcharge", "airport_fee", "tolls_amount", "improvement_surcharge",
                    "passenger_count", "store_and_fwd_flag", )


In [6]:
# filter out all other payment_types as tips are only counted with credit card payment
yellow_credit = yellow.filter(F.col('payment_type') == 1)

# remove records that has trips starting before the month of october
yellow_credit = yellow_credit.filter(F.col('pickup_date') >= '2018-10-01')

# remove other RatecodeID's as they make up such a small amount of the total dataset
yellow_credit = yellow_credit.where( (F.col('RatecodeID') == 1) | (F.col('RatecodeID') == 2))

                                                                                

quantiles_distance,quantiles_fare
"[1.0, 1.69, 3.08]","[6.5, 9.5, 15.0]"


In [35]:
# remove records with negative values and 0 from fare_amount

yellow_credit.where(F.col('fare_amount') > 0)

# remove records where trip distance is 0

yellow_credit.where(F.col('trip_distance') > 0)
                                   



-RECORD 0------------------------------------
 VendorID              | 2                   
 tpep_pickup_datetime  | 2019-06-07 16:19:14 
 tpep_dropoff_datetime | 2019-06-07 16:25:44 
 trip_distance         | 0.66                
 RatecodeID            | 1.0                 
 PULocationID          | 68                  
 DOLocationID          | 90                  
 payment_type          | 1                   
 fare_amount           | -6.0                
 tip_amount            | 0.0                 
 total_amount          | -10.3               
 day_of_week           | Fri                 
 is_weekend            | false               
 pickup_date           | 2019-06-07          
 dropoff_date          | 2019-06-07          
 pickup_time           | 16:19:14            
 dropoff_time          | 16:25:44            
-RECORD 1------------------------------------
 VendorID              | 2                   
 tpep_pickup_datetime  | 2018-12-21 13:15:35 
 tpep_dropoff_datetime | 2018-12-2

                                                                                

In [None]:
# Here we load the curated nba_attendance and convert the Date column into date type
# Then we convert the whole pandas dataframe into a spark dataframe 
import pandas as pd
from pyspark.sql.types import *

nba_attendance = pd.read_csv('../../mast30034-project-1-dustintano10/data/curated/nba_attendance_new.csv')

nba_attendance['Date'] = pd.to_datetime(nba_attendance['Date'], format='%Y%m%d')

schema = StructType([
StructField("Date", DateType(), True),
StructField("Start(ET)", StringType(), True),
StructField("Attend.", StringType(), True),
StructField("Arena", StringType(), True),
StructField("Day_of_week", StringType(), True)
])

nba_attendance_spark = spark.createDataFrame(nba_attendance, schema)

In [None]:
# joins the nba attendance with the yellow_credit dataframe
yellow_credit = yellow_credit.join(nba_attendance_spark, yellow_credit.pickup_date == nba_attendance_spark.Date, 'left')

In [None]:
# saves the new yellow_credit dataframe
yellow_credit.write.mode('overwrite').parquet('../../mast30034-project-1-dustintano10/data/curated/yellow/yellow_credit')