In [1]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("ADS Project1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

24/08/10 19:54:54 WARN Utils: Your hostname, Hanshis-Laptop.local resolves to a loopback address: 127.0.0.1; using 10.12.202.130 instead (on interface en0)
24/08/10 19:54:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/10 19:54:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark.conf.set("spark.sql.parquet.compression.codec","gzip")

In [4]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium

In [5]:
import os
base_dir = '../project/'

In [6]:
data_folders = [
'data/raw/external',
'data/raw/tlc_data/2019',
'data/raw/tlc_data/2021',
'data/curate/tlc_data/first_clean',
'data/curate/tlc_data/final_data'
]

In [7]:
for folder in data_folders:
    path = os.path.join(base_dir, folder)
    if not os.path.exists(path):
        os.makedirs(path)
        print(f'Created folder: {path}')
# 创建 notebooks 文件夹
notebooks_folder = os.path.join(base_dir, 'notebooks')
if not os.path.exists(notebooks_folder):
    os.makedirs(notebooks_folder)
    print(f'Created folder: {notebooks_folder}')

In [8]:
import requests
import os

URL_TEMPLATE = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_"
YEAR = ['2019', '2021']
MONTH = range(1, 13)
output_relative_dir = '../project/data/raw/tlc_data/'

for year in YEAR:
    for month in MONTH:
        print(f'Starting download for {year}-{str(month).zfill(2)}')
        month_str = str(month).zfill(2)
        url = f'{URL_TEMPLATE}{year}-{month_str}.parquet'
        output_dir = f"{output_relative_dir}/{year}/{year}-{month_str}.parquet"
        
        response = requests.get(url, verify=True)
        with open(output_dir, 'wb') as file:
            file.write(response.content)
            
        print(f'Finished download for {year}-{month_str}')


Starting download for 2019-01
Finished download for 2019-01
Starting download for 2019-02
Finished download for 2019-02
Starting download for 2019-03
Finished download for 2019-03
Starting download for 2019-04
Finished download for 2019-04
Starting download for 2019-05
Finished download for 2019-05
Starting download for 2019-06
Finished download for 2019-06
Starting download for 2019-07


KeyboardInterrupt: 

In [9]:
# 读取2019年的Parquet文件
df_2019 = spark.read.parquet('../project/data/raw/tlc_data/2019/*.parquet')
# 读取2021年的Parquet文件
df_2021 = spark.read.parquet('../project/data/raw/tlc_data/2021/*.parquet') 

                                                                                

In [11]:
path = "../project/data/raw/tlc_data/"
df_2019_1 = spark.read.parquet(path + "/2019/2019-01.parquet") 
df_2021_1 = spark.read.parquet(path + "/2021/2021-01.parquet")

In [14]:
# 显示数据 
df_2019_1.show(10) 
df_2021_1.show(10)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2019-01-01 00:46:40|  2019-01-01 00:53:20|            1.0|          1.5|       1.0|                 N|         151|         239|           1|        7.0|  0.5|    0.5|      1.6

In [16]:
df_2019_1.printSchema() 
df_2021_1.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: integer (nullable = true)

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime:

In [17]:
# 过滤数据
df_2019_filtered = df_2019_1.filter(df_2019_1["trip_distance"] > 10) 
df_2019_filtered.show(5)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2019-01-01 00:19:55|  2019-01-01 00:57:56|            1.0|         12.3|       1.0|                 N|         138|          50|           1|       38.0|  0.5|    0.5|       4.

In [18]:
df_2019_filtered.select(F.col('passenger_count') > 3).limit(10)

(passenger_count > 3)
False
False
False
False
True
False
False
True
False
False


In [19]:
df_2019_filtered.where(F.col('passenger_count') > 3).limit(10)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
1,2019-01-01 00:04:06,2019-01-01 00:38:55,4.0,18.1,2.0,N,132,224,1,52.0,0.0,0.5,5.0,0.0,0.3,57.8,,
2,2019-01-01 00:11:27,2019-01-01 00:46:29,5.0,21.42,2.0,N,132,87,1,52.0,0.0,0.5,11.71,5.76,0.3,70.27,,
2,2019-01-01 00:51:30,2019-01-01 01:26:34,6.0,16.71,1.0,N,262,16,1,47.0,0.5,0.5,0.0,0.0,0.3,48.3,,
1,2019-01-01 00:36:29,2019-01-01 01:22:12,4.0,12.0,1.0,N,264,264,2,40.5,0.0,0.5,0.0,0.0,0.3,41.3,,
1,2019-01-01 00:20:16,2019-01-01 00:57:35,4.0,16.6,1.0,N,132,133,1,48.0,0.5,0.5,7.0,0.0,0.3,56.3,,
2,2019-01-01 00:50:03,2019-01-01 01:19:35,5.0,20.11,1.0,N,132,51,2,53.5,0.5,0.5,0.0,5.76,0.3,60.56,,
2,2019-01-01 00:28:07,2019-01-01 00:53:21,5.0,10.65,1.0,N,79,243,2,31.5,0.5,0.5,0.0,0.0,0.3,32.8,,
2,2019-01-01 00:45:48,2019-01-01 01:32:26,6.0,14.25,1.0,N,186,123,1,46.5,0.5,0.5,9.56,0.0,0.3,57.36,,
2,2019-01-01 00:49:11,2019-01-01 01:39:31,4.0,13.4,1.0,N,229,165,1,46.0,0.5,0.5,9.46,0.0,0.3,56.76,,
2,2019-01-01 00:40:48,2019-01-01 01:21:23,5.0,18.54,1.0,N,25,200,1,52.5,0.5,0.5,0.0,0.0,0.3,53.8,,


In [21]:
# 合并2019年和2021年的数据
df_all = df_2019_1.union(df_2021_1) 
df_all.show(10)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2019-01-01 00:46:40|  2019-01-01 00:53:20|            1.0|          1.5|       1.0|                 N|         151|         239|           1|        7.0|  0.5|    0.5|      1.6

## Add new column ##

In [27]:
df_renamed = df_all.withColumnRenamed("RatecodeID", "Rate_codeID") 
df_renamed.show(5)

+--------+--------------------+---------------------+---------------+-------------+-----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|Rate_codeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+-----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2019-01-01 00:46:40|  2019-01-01 00:53:20|            1.0|          1.5|        1.0|                 N|         151|         239|           1|        7.0|  0.5|    0.5|     

In [28]:
df_dropped = df_all.drop("passenger_count_plus_10")
df_dropped.show(5)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2019-01-01 00:46:40|  2019-01-01 00:53:20|            1.0|          1.5|       1.0|                 N|         151|         239|           1|        7.0|  0.5|    0.5|      1.6

In [29]:
df_all.groupBy("VendorID").agg({"passenger_count": "avg", "extra": "max"}).show()



+--------+----------+--------------------+
|VendorID|max(extra)|avg(passenger_count)|
+--------+----------+--------------------+
|       5|       0.0|                NULL|
|       1|    535.38|   1.192631892580005|
|       2|      8.25|  1.7661998605113918|
|       4|      18.5|  1.0244848547960896|
|       6|      6.69|                NULL|
+--------+----------+--------------------+



                                                                                

## Sampling Data

In [30]:
SAMPLE_SIZE = 0.01

In [31]:
df = df_all.sample(SAMPLE_SIZE, seed = 20020223).toPandas() 
df



CodeCache: size=131072Kb used=35360Kb max_used=35395Kb free=95711Kb
 bounds [0x0000000105fe0000, 0x0000000108320000, 0x000000010dfe0000]
 total_blobs=12835 nmethods=11818 adapters=928
 compilation: disabled (not enough contiguous free space left)


                                                                                

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2018-12-21 13:48:30,2018-12-21 13:52:40,3.0,0.00,1.0,N,236,236,1,4.50,0.50,0.5,0.00,0.0,0.3,5.80,,
1,2,2019-01-01 00:17:31,2019-01-01 00:24:23,2.0,0.95,1.0,N,246,68,1,6.50,0.50,0.5,0.00,0.0,0.3,7.80,,
2,2,2019-01-01 00:01:07,2019-01-01 00:10:50,1.0,1.93,1.0,N,137,229,1,8.50,0.50,0.5,1.96,0.0,0.3,11.76,,
3,1,2019-01-01 00:58:45,2019-01-01 01:12:23,2.0,2.60,1.0,N,151,142,1,12.00,0.50,0.5,2.65,0.0,0.3,15.95,,
4,1,2019-01-01 00:21:03,2019-01-01 00:26:27,1.0,0.20,1.0,N,90,68,1,5.00,0.50,0.5,1.00,0.0,0.3,7.30,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90595,2,2021-01-31 14:10:00,2021-01-31 14:25:00,,2.45,,,18,254,0,21.95,2.75,0.5,0.00,0.0,0.3,25.50,,
90596,2,2021-01-31 14:24:08,2021-01-31 14:40:13,,6.83,,,81,213,0,28.52,2.75,0.5,0.00,0.0,0.3,32.07,,
90597,1,2021-01-31 15:45:52,2021-01-31 16:00:53,,0.00,,,90,236,0,11.70,0.00,0.5,2.25,0.0,0.3,19.25,,
90598,2,2021-01-31 18:05:00,2021-01-31 18:38:00,,5.89,,,181,91,0,28.95,2.75,0.5,0.00,0.0,0.3,32.50,,


## Pre-Processing

In [36]:
df_2019_1.show(5)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2019-01-01 00:46:40|  2019-01-01 00:53:20|            1.0|          1.5|       1.0|                 N|         151|         239|           1|        7.0|  0.5|    0.5|      1.6

In [49]:
df_2019_1.printSchema() 
print(f"数据总量: {df_2019_1.count()}") 
df_2019_1.describe().show()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = false)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: integer (nullable = true)



                                                                                

数据总量: 0




+-------+--------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|summary|VendorID|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+-------+--------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|  count|       0|              0|            0|         0|                 0|           0|           0|           0|          0|    0|      0|         0|           0|                    0|           0|                   0|          0|
|   mean|    NULL|           NULL|         NULL|      NU

                                                                                

## handling missing data 

In [37]:
df_2019_1 = df_2019_1.dropna()
df_2019_1 = df_2019_1.fillna({'passenger_count': 1, 'trip_distance': 0.0})

## Datatype Conversion

In [40]:
df_2019_1 = df_2019_1.withColumn("passenger_count", col("passenger_count").cast("integer"))
df_2019_1 = df_2019_1.withColumn("tpep_pickup_datetime", col("tpep_pickup_datetime")) 
df_2019_1 = df_2019_1.withColumn("tpep_dropoff_datetime", col("tpep_dropoff_datetime"))  

## Handling duplicates

In [41]:
df_2019_1 = df_2019_1.dropDuplicates()

## Anomaly 

In [47]:
trip_distance_min = 0
trip_distance_max = 100
df_2019_1 = df_2019_1.filter((df_2019_1["trip_distance"] > trip_distance_min) & (df_2019_1["trip_distance"] > trip_distance_max))

## Standardization 

In [48]:
from pyspark.ml.feature import StandardScaler, VectorAssembler
assembler = VectorAssembler(inputCols=["trip_distance", "trip_duration"], outputCol= "features")
df_features = assembler.transform(df_2019_1)
scaler = StandardScaler(inputCol="features", outputCol="scaled_features") 
scaler_model = scaler.fit(df_features)
df_scaled = scaler_model.transform(df_features)

IllegalArgumentException: trip_duration does not exist. Available: VendorID, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, RatecodeID, store_and_fwd_flag, PULocationID, DOLocationID, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, congestion_surcharge, airport_fee