In [1]:
import pyspark
from pyspark.sql import SparkSession 
from pyspark.sql import types

import pandas as pd
from fastparquet import write, ParquetFile

## Download and Prepare the Data  
The download and conversion cells below are commented out; uncomment them on the first run to fetch and prepare the data :)

**File size:**  
.parquet ~309 MB  
.csv ~2 GB

In [2]:
# Download Parquet Data

# !wget https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2021-01.parquet

In [3]:
# Convert parquet to csv (for tutorial purposes)
# I had to do this because the NYC data site no longer provides the .csv file :(

# df_pandas = pd.read_parquet('fhvhv_tripdata_2021-01.parquet')
# df_pandas.to_csv('fhvhv_tripdata_2021-01.csv')

In [4]:
# Load the csv file with pandas to inspect its first column (it's just an index without a header title):

# df_cut = pd.read_csv('fhvhv_tripdata_2021-01.csv')
# df_cut.head()

In [5]:
# Drop the first (index) column and rewrite the .csv file

# df_cut = df_cut.drop(['Unnamed: 0'], axis=1)
#df_cut.to_csv('fhvhv_tripdata_2021-01.csv', index=False)

## Start Spark Session and Read Data

In [6]:
# Get (or reuse) a local SparkSession named "Watson"; "local[*]" runs Spark
# in local mode using all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Watson")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/04 09:37:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Read the Parquet file.
# Parquet files carry their own column names and types, so the CSV-specific
# "header" option has no effect here and is omitted.
df_from_parquet = spark.read.parquet('fhvhv_tripdata_2021-01.parquet')

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [8]:
# Load the CSV file, taking the column names from its first line.
# No schema is supplied, so the reader decides the column types itself.
df_from_csv = spark.read.csv('fhvhv_tripdata_2021-01.csv', header=True)

In [9]:
# Print the first 20 rows as a text table (long cell values are truncated)
df_from_csv.show(n=20, truncate=True)

+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+----+----------+-------------------+-----------------+------------------+----------------+--------------+
|hvfhs_license_num|dispatching_base_num|originating_base_num|   request_datetime|  on_scene_datetime|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|trip_miles|trip_time|base_passenger_fare|tolls| bcf|sales_tax|congestion_surcharge|airport_fee|tips|driver_pay|shared_request_flag|shared_match_flag|access_a_ride_flag|wav_request_flag|wav_match_flag|
+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+--

23/05/04 09:37:29 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [10]:
# Fetch the first five rows as a list of Row objects
df_from_csv.take(5)

[Row(hvfhs_license_num='HV0003', dispatching_base_num='B02682', originating_base_num='B02682', request_datetime='2021-01-01 00:28:09', on_scene_datetime='2021-01-01 00:31:42', pickup_datetime='2021-01-01 00:33:44', dropoff_datetime='2021-01-01 00:49:07', PULocationID='230', DOLocationID='166', trip_miles='5.26', trip_time='923', base_passenger_fare='22.28', tolls='0.0', bcf='0.67', sales_tax='1.98', congestion_surcharge='2.75', airport_fee=None, tips='0.0', driver_pay='14.99', shared_request_flag='N', shared_match_flag='N', access_a_ride_flag=' ', wav_request_flag='N', wav_match_flag='N'),
 Row(hvfhs_license_num='HV0003', dispatching_base_num='B02682', originating_base_num='B02682', request_datetime='2021-01-01 00:45:56', on_scene_datetime='2021-01-01 00:55:19', pickup_datetime='2021-01-01 00:55:19', dropoff_datetime='2021-01-01 01:18:21', PULocationID='152', DOLocationID='167', trip_miles='3.65', trip_time='1382', base_passenger_fare='18.36', tolls='0.0', bcf='0.55', sales_tax='1.63',

In [11]:
# Inspect the schema of the CSV-backed DataFrame.
# The output below shows that every column read from the .csv is a StringType.
df_from_csv.schema

StructType([StructField('hvfhs_license_num', StringType(), True), StructField('dispatching_base_num', StringType(), True), StructField('originating_base_num', StringType(), True), StructField('request_datetime', StringType(), True), StructField('on_scene_datetime', StringType(), True), StructField('pickup_datetime', StringType(), True), StructField('dropoff_datetime', StringType(), True), StructField('PULocationID', StringType(), True), StructField('DOLocationID', StringType(), True), StructField('trip_miles', StringType(), True), StructField('trip_time', StringType(), True), StructField('base_passenger_fare', StringType(), True), StructField('tolls', StringType(), True), StructField('bcf', StringType(), True), StructField('sales_tax', StringType(), True), StructField('congestion_surcharge', StringType(), True), StructField('airport_fee', StringType(), True), StructField('tips', StringType(), True), StructField('driver_pay', StringType(), True), StructField('shared_request_flag', Strin

## Creating a schema with Pandas

In [12]:
# Let's use only the first 100 rows of data from the .csv to work out the schema with pandas.
# Save the first 101 lines to a new .csv file (the extra line is the header).
!head -n 101 fhvhv_tripdata_2021-01.csv > head.csv

In [13]:
# Show the number of lines in the sample file (header line included)
!wc -l head.csv

     101 head.csv


In [14]:
# Load the small sample .csv into a pandas DataFrame;
# pandas infers a dtype for each column from the values it reads.
df_pandas = pd.read_csv('head.csv')

In [15]:
# Show the dtype pandas assigned to each column
df_pandas.dtypes

hvfhs_license_num        object
dispatching_base_num     object
originating_base_num     object
request_datetime         object
on_scene_datetime        object
pickup_datetime          object
dropoff_datetime         object
PULocationID              int64
DOLocationID              int64
trip_miles              float64
trip_time                 int64
base_passenger_fare     float64
tolls                   float64
bcf                     float64
sales_tax               float64
congestion_surcharge    float64
airport_fee             float64
tips                    float64
driver_pay              float64
shared_request_flag      object
shared_match_flag        object
access_a_ride_flag       object
wav_request_flag         object
wav_match_flag           object
dtype: object

In [16]:
# Create a Spark DataFrame from the pandas DataFrame and show its schema.
# The output below shows that the numeric columns are now LongType / DoubleType
# instead of StringType.
spark.createDataFrame(df_pandas).schema

StructType([StructField('hvfhs_license_num', StringType(), True), StructField('dispatching_base_num', StringType(), True), StructField('originating_base_num', StringType(), True), StructField('request_datetime', StringType(), True), StructField('on_scene_datetime', StringType(), True), StructField('pickup_datetime', StringType(), True), StructField('dropoff_datetime', StringType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('trip_miles', DoubleType(), True), StructField('trip_time', LongType(), True), StructField('base_passenger_fare', DoubleType(), True), StructField('tolls', DoubleType(), True), StructField('bcf', DoubleType(), True), StructField('sales_tax', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True), StructField('airport_fee', DoubleType(), True), StructField('tips', DoubleType(), True), StructField('driver_pay', DoubleType(), True), StructField('shared_request_flag', StringType(

In [17]:
# Custom schema for the trip data: the same columns as the pandas-derived schema,
# but with IntegerType in place of LongType. Every column is nullable.
# NOTE(review): the four *_datetime columns are kept as StringType here;
# TimestampType may be more appropriate -- confirm before changing.
schema = (
    types.StructType()
    .add('hvfhs_license_num', types.StringType(), True)
    .add('dispatching_base_num', types.StringType(), True)
    .add('originating_base_num', types.StringType(), True)
    .add('request_datetime', types.StringType(), True)
    .add('on_scene_datetime', types.StringType(), True)
    .add('pickup_datetime', types.StringType(), True)
    .add('dropoff_datetime', types.StringType(), True)
    .add('PULocationID', types.IntegerType(), True)
    .add('DOLocationID', types.IntegerType(), True)
    .add('trip_miles', types.DoubleType(), True)
    .add('trip_time', types.IntegerType(), True)
    .add('base_passenger_fare', types.DoubleType(), True)
    .add('tolls', types.DoubleType(), True)
    .add('bcf', types.DoubleType(), True)
    .add('sales_tax', types.DoubleType(), True)
    .add('congestion_surcharge', types.DoubleType(), True)
    .add('airport_fee', types.DoubleType(), True)
    .add('tips', types.DoubleType(), True)
    .add('driver_pay', types.DoubleType(), True)
    .add('shared_request_flag', types.StringType(), True)
    .add('shared_match_flag', types.StringType(), True)
    .add('access_a_ride_flag', types.StringType(), True)
    .add('wav_request_flag', types.StringType(), True)
    .add('wav_match_flag', types.StringType(), True)
)

In [18]:
# Load the full .csv dataset into a Spark DataFrame, applying our custom schema
# instead of leaving every column as a string.
df = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv('fhvhv_tripdata_2021-01.csv')
)

In [19]:
# Fetch the first ten rows to check the values against the custom schema
df.take(10)

[Row(hvfhs_license_num='HV0003', dispatching_base_num='B02682', originating_base_num='B02682', request_datetime='2021-01-01 00:28:09', on_scene_datetime='2021-01-01 00:31:42', pickup_datetime='2021-01-01 00:33:44', dropoff_datetime='2021-01-01 00:49:07', PULocationID=230, DOLocationID=166, trip_miles=5.26, trip_time=923, base_passenger_fare=22.28, tolls=0.0, bcf=0.67, sales_tax=1.98, congestion_surcharge=2.75, airport_fee=None, tips=0.0, driver_pay=14.99, shared_request_flag='N', shared_match_flag='N', access_a_ride_flag=' ', wav_request_flag='N', wav_match_flag='N'),
 Row(hvfhs_license_num='HV0003', dispatching_base_num='B02682', originating_base_num='B02682', request_datetime='2021-01-01 00:45:56', on_scene_datetime='2021-01-01 00:55:19', pickup_datetime='2021-01-01 00:55:19', dropoff_datetime='2021-01-01 01:18:21', PULocationID=152, DOLocationID=167, trip_miles=3.65, trip_time=1382, base_passenger_fare=18.36, tolls=0.0, bcf=0.55, sales_tax=1.63, congestion_surcharge=0.0, airport_fee

In [None]:
# Stop the Spark session started earlier in this notebook
spark.stop()