In [1]:
from google.cloud import storage
import pandas as pd
import os
from io import BytesIO

# Define a data cleaning function
def data_cleaning(df):
    """Clean one batch of trip records and return the cleaned frame.

    Steps, in order:

    1. Drop columns that are mostly missing or carry no distinguishing
       information; columns that are not present are ignored.
    2. Keep only rows where ``driver_pay`` is in (0, 600),
       ``base_passenger_fare`` is in (0, 800) and ``trip_miles`` is in
       (0, 150), all bounds exclusive. A missing value in any of these
       columns fails the comparison, so such rows are removed too.
    3. Replace missing values with 0 in *numeric* columns only.
    4. Cast the two identifier columns to ``str`` and ``trip_miles`` to
       ``float``.

    Missing values in non-numeric columns (timestamps, flags, identifiers)
    are deliberately left missing: writing the integer 0 into them would
    turn a datetime column into a mixed-type object column and would turn a
    missing identifier into the misleading string "0".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``driver_pay``, ``base_passenger_fare``, ``trip_miles``,
        ``hvfhs_license_num`` and ``dispatching_base_num``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    # Drop columns with mostly missing data and ununique data
    unused_columns = [
        'originating_base_num',
        'on_scene_datetime',
        'access_a_ride_flag',
        'wav_match_flag',
        'airport_fee',
    ]
    df = df.drop(columns=unused_columns, errors='ignore')

    # One combined mask keeps exactly the rows the three separate outlier
    # filters would keep (driver pay, passenger fare, trip miles).
    in_range = (
        df['driver_pay'].gt(0) & df['driver_pay'].lt(600)
        & df['base_passenger_fare'].gt(0) & df['base_passenger_fare'].lt(800)
        & df['trip_miles'].gt(0) & df['trip_miles'].lt(150)
    )
    df = df.loc[in_range]

    # Fill NA with zero, restricted to numeric columns so that datetime and
    # string columns keep their dtype.
    numeric_columns = df.select_dtypes(include='number').columns
    df = df.fillna(dict.fromkeys(numeric_columns, 0))

    # Convert identifier columns to strings; .where() restores the missing
    # entries that astype(str) would otherwise render as 'None' / 'nan'.
    for column in ('hvfhs_license_num', 'dispatching_base_num'):
        df[column] = df[column].astype(str).where(df[column].notna())
    df['trip_miles'] = df['trip_miles'].astype(float)

    return df


# Google Cloud Storage bucket name.
# NOTE(review): "XXX" looks like a placeholder -- substitute the real bucket name.
source_bucket_name = "XXX"

# Initialize the Google Cloud Storage client
storage_client = storage.Client()
# The bucket handle does not change between files, so build it once
source_bucket = storage_client.bucket(source_bucket_name)
# Define the folder pattern for the source files
folder_pattern = "landing/"

# List blobs in the source folder
blobs = storage_client.list_blobs(source_bucket_name, prefix=folder_pattern)

# Filter for .parquet files only
filtered_blobs = [blob for blob in blobs if blob.name.endswith('.parquet')]

# Iterate through the list of files: download -> clean -> re-upload
for blob in filtered_blobs:
    print(f"Processing file: {blob.name} with size {blob.size} bytes")

    # Download the Parquet file as bytes and load into a DataFrame
    # (the whole object is held in memory while it is parsed)
    df = pd.read_parquet(BytesIO(blob.download_as_bytes()), engine='pyarrow')

    # Clean the data
    df = data_cleaning(df)

    # Define the path for the cleaned file.
    # basename() keeps the ".parquet" suffix, so objects are named
    # "<name>.parquet_cleaned.parquet"; a later cell reads that exact name,
    # so the naming scheme is intentionally left as is.
    filename = os.path.basename(blob.name)
    cleaned_file_path = f"Cleaned_November_13/{filename}_cleaned.parquet"

    # Create a BytesIO buffer to hold the cleaned DataFrame as parquet data
    parquet_buffer = BytesIO()
    df.to_parquet(parquet_buffer, index=False, engine='pyarrow')
    parquet_buffer.seek(0)  # Reset the buffer position so the upload starts at byte 0

    # Upload the cleaned data to the specified path in GCS
    cleaned_blob = source_bucket.blob(cleaned_file_path)
    cleaned_blob.upload_from_file(parquet_buffer, content_type='application/octet-stream')

    print(f"Cleaned file saved to: {cleaned_file_path}")


Processing file: landing/fhvhv_tripdata_2019-02.parquet with size 513054623 bytes
Cleaned file saved to: Cleaned_November_13/fhvhv_tripdata_2019-02.parquet_cleaned.parquet
Processing file: landing/fhvhv_tripdata_2019-03.parquet with size 610854382 bytes
Cleaned file saved to: Cleaned_November_13/fhvhv_tripdata_2019-03.parquet_cleaned.parquet
Processing file: landing/fhvhv_tripdata_2019-04.parquet with size 559656651 bytes
Cleaned file saved to: Cleaned_November_13/fhvhv_tripdata_2019-04.parquet_cleaned.parquet
Processing file: landing/fhvhv_tripdata_2019-05.parquet with size 570821186 bytes
Cleaned file saved to: Cleaned_November_13/fhvhv_tripdata_2019-05.parquet_cleaned.parquet
Processing file: landing/fhvhv_tripdata_2019-06.parquet with size 536191926 bytes
Cleaned file saved to: Cleaned_November_13/fhvhv_tripdata_2019-06.parquet_cleaned.parquet
Processing file: landing/fhvhv_tripdata_2019-07.parquet with size 516008365 bytes
Cleaned file saved to: Cleaned_November_13/fhvhv_tripdata_

In [5]:
!pip install gcsfs

import pandas as pd

# Read the Parquet file directly from GCS into a DataFrame
df = pd.read_parquet('XXX/fhvhv_tripdata_2019-02.parquet_cleaned.parquet', engine='pyarrow')

# Now, you can perform your analysis on the DataFrame 'df'
df.head()


[0m

Unnamed: 0,hvfhs_license_num,dispatching_base_num,request_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,...,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
0,HV0003,B02867,2019-02-01 00:01:26,2019-02-01 00:05:18,2019-02-01 00:14:57,245,251,2.45,579,9.35,...,0.83,0.0,,0.0,7.48,Y,N,N,N,
1,HV0003,B02879,2019-02-01 00:26:08,2019-02-01 00:41:29,2019-02-01 00:49:39,216,197,1.71,490,7.91,...,0.7,0.0,,2.0,7.93,N,N,N,N,
2,HV0005,B02510,2019-02-01 00:48:58,2019-02-01 00:51:34,2019-02-01 01:28:29,261,234,5.01,2159,44.96,...,3.99,0.0,,0.0,35.97,N,Y,N,N,
3,HV0005,B02510,2019-02-01 00:02:15,2019-02-01 00:03:51,2019-02-01 00:07:16,87,87,0.34,179,7.19,...,0.64,0.0,,3.0,5.39,N,Y,N,N,
4,HV0005,B02510,2019-02-01 00:06:17,2019-02-01 00:09:44,2019-02-01 00:39:56,87,198,6.84,1799,24.25,...,2.16,0.0,,4.0,17.07,N,Y,N,N,


In [6]:
# Tally the missing entries in every column (isnull is an alias of isna)
na_counts = df.isnull().sum(axis=0)

# Print the per-column totals
print(na_counts)


hvfhs_license_num              0
dispatching_base_num         403
request_datetime            2897
pickup_datetime                0
dropoff_datetime               0
PULocationID                   0
DOLocationID                   0
trip_miles                     0
trip_time                      0
base_passenger_fare            0
tolls                          0
bcf                            0
sales_tax                      0
congestion_surcharge      511730
airport_fee             17889647
tips                           0
driver_pay                     0
shared_request_flag            0
shared_match_flag              0
access_a_ride_flag             0
wav_request_flag               0
wav_match_flag          17889647
dtype: int64


In [8]:
# Render floats with two decimals in every frame displayed from here on
pd.set_option('display.float_format', '{:.2f}'.format)
# Summary statistics; as the last expression this is shown as the cell output
df.describe()

Unnamed: 0,request_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,tips,driver_pay
count,17886750,17889647,17889647,17889647.0,17889647.0,17889647.0,17889647.0,17889647.0,17889647.0,17889647.0,17889647.0,17377917.0,17889647.0,17889647.0
mean,2019-02-15 02:14:11.231677,2019-02-15 02:16:10.249091,2019-02-15 02:34:51.736528,138.77,141.64,4.76,1115.5,16.51,0.84,0.43,1.38,1.04,0.51,15.49
min,2019-01-31 23:19:44,2019-02-01 00:00:00,2019-02-01 00:02:09,1.0,1.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.01
25%,2019-02-08 08:12:08,2019-02-08 08:15:00,2019-02-08 08:35:22,75.0,76.0,1.56,575.0,7.13,0.0,0.18,0.6,0.0,0.0,6.77
50%,2019-02-15 02:03:55,2019-02-15 02:01:07,2019-02-15 02:17:14,140.0,141.0,2.92,922.0,11.41,0.0,0.28,0.97,0.0,0.0,10.89
75%,2019-02-22 11:40:38,2019-02-22 11:43:36,2019-02-22 12:02:43,211.0,217.0,5.83,1448.0,19.96,0.0,0.5,1.7,2.75,0.0,18.34
max,2019-02-28 23:58:52,2019-02-28 23:59:59,2019-03-01 02:38:12,265.0,265.0,149.94,67272.0,796.6,171.08,20.23,71.21,2.75,400.0,580.15
std,,,,75.1,77.51,5.48,768.95,16.5,3.32,0.46,1.32,1.3,1.87,14.55
