### Importing pandas and creating DataFrame

The full green taxi dataset (2014–2024) is only about 1.5 GB, so pandas can be used to format it in the same way as the yellow taxi data.

In [1]:
import numpy as np
import pandas as pd

In [5]:
import sys
# Install pandas with pip, using the same interpreter that runs this kernel (sys.executable).
# NOTE(review): this cell is placed after the cell that already imports pandas; on a fresh
# kernel the install would have to run first — confirm the intended cell order.
!{sys.executable} -m pip install pandas



Loading all years of green taxi data and checking column names and dtypes against the sample file

In [2]:
# Load the green taxi parquet data from the directory into a single DataFrame.
# NOTE(review): the last cell of this notebook writes its output under this same directory,
# so a re-run would presumably pick up the already formatted file as well — confirm.
df = pd.read_parquet('data/taxi/green_taxi/')

In [133]:
# Load the yellow taxi sample file; it serves as the reference for column names and dtypes.
df_sample = pd.read_parquet('data/taxi/sample/sample_yellow.parquet')

In [4]:
# Display the loaded frame as the cell output.
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,congestion_surcharge
0,2,2014-01-01 00:17:26,2014-01-01 00:37:11,True,1,17,225,1,2.28,13.5,0.5,0.5,0.0,0.0,,14.5,2,
1,1,2014-01-01 00:29:12,2014-01-01 00:37:43,True,1,127,241,1,2.1,9.0,0.0,0.5,0.0,0.0,,9.5,2,
2,2,2014-01-01 00:31:35,2014-01-01 00:44:09,True,1,166,243,1,4.72,15.5,0.5,0.5,4.0,0.0,,20.5,1,
3,2,2014-01-01 00:07:01,2014-01-01 00:21:54,True,1,7,157,1,2.88,13.0,0.5,0.5,2.88,0.0,,16.879999,1,
4,2,2014-01-01 00:26:43,2014-01-01 00:37:17,True,1,83,197,2,3.8,13.0,0.5,0.5,0.0,0.0,,14.0,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83484683,2,2024-12-31 22:24:00,2024-12-31 22:52:00,False,,7,148,,8.96,41.27,0.0,0.5,0.0,0.0,1.0,45.52,,
83484684,2,2024-12-31 23:42:09,2025-01-01 00:07:20,False,,256,107,,6.27,29.299999,0.0,0.5,3.24,6.94,1.0,43.73,,
83484685,2,2024-12-31 23:23:00,2024-12-31 23:37:00,False,,42,140,,4.82,22.74,0.0,0.5,5.4,0.0,1.0,32.389999,,
83484686,2,2024-12-31 23:18:00,2024-12-31 23:27:00,False,,74,262,,2.14,14.18,0.0,0.5,1.84,0.0,1.0,20.27,,


In [3]:
# Print the index range and the name and dtype of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83484688 entries, 0 to 83484687
Data columns (total 18 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               Int32         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   store_and_fwd_flag     bool          
 4   RatecodeID             Int32         
 5   PULocationID           Int32         
 6   DOLocationID           Int32         
 7   passenger_count        Int32         
 8   trip_distance          Float32       
 9   fare_amount            Float32       
 10  extra                  Float32       
 11  mta_tax                Float32       
 12  tip_amount             Float32       
 13  tolls_amount           Float32       
 14  improvement_surcharge  Float32       
 15  total_amount           Float32       
 16  payment_type           Int32         
 17  congestion_surcharge   Float32       
dtypes: Float32(9), Int32

This function reports the differences between df and df_sample (the same check as for yellow_taxi). Dtype names are compared case-insensitively (e.g. `Int32` and `int32` count as equal); the dtypes themselves can be unified in the next step.

In [188]:
def porownaj_schemat(df: pd.DataFrame, df_sample: pd.DataFrame):
    """Compare the column names and dtypes of ``df`` with those of ``df_sample``.

    Dtype names are compared case-insensitively, so ``Int32`` and ``int32``
    are treated as the same type.

    Parameters
    ----------
    df : pd.DataFrame
        Frame being checked.
    df_sample : pd.DataFrame
        Reference frame.

    Returns
    -------
    tuple
        ``(identical, only_in_df, only_in_sample, dtype_diffs)`` where
        ``identical`` is True only when the other three items are empty,
        ``only_in_df`` is the sorted list of columns present in ``df`` but
        not in ``df_sample``, ``only_in_sample`` is the sorted list of
        columns present in ``df_sample`` but not in ``df``, and
        ``dtype_diffs`` holds one message per shared column whose dtype
        names differ.
    """
    nazwy_df, nazwy_sample = set(df.columns), set(df_sample.columns)

    tylko_w_df = sorted(nazwy_df.difference(nazwy_sample))
    tylko_w_sample = sorted(nazwy_sample.difference(nazwy_df))

    # Pair up the dtype names of every shared column, in alphabetical order.
    pary_typow = (
        (col, str(df[col].dtype), str(df_sample[col].dtype))
        for col in sorted(nazwy_df & nazwy_sample)
    )
    niezgodne_typy = [
        f"Kolumna '{col}': {typ_df} (df) → {typ_sample} (df_sample)"
        for col, typ_df, typ_sample in pary_typow
        if typ_df.lower() != typ_sample.lower()
    ]

    # The schemas match only when nothing at all was reported.
    schemat_zgodny = not (tylko_w_df or tylko_w_sample or niezgodne_typy)

    return schemat_zgodny, tylko_w_df, tylko_w_sample, niezgodne_typy

# Compare the green taxi frame with the yellow sample and print each part of the result.
identyczny, brak_w_sample, brak_w_df, rozne_typy = porownaj_schemat(df, df_sample)

print("Schemat identyczny:", identyczny)
# An empty list is reported as the word "brak" instead of "[]".
print("Brakujące w df_sample:", brak_w_sample if brak_w_sample else "brak")
print("Brakujące w df:", brak_w_df if brak_w_df else "brak")

if not rozne_typy:
    print("Różnice w typach danych: brak")
else:
    print("Różnice w typach danych:")
    for linia in rozne_typy:
        print(linia)

Schemat identyczny: False
Brakujące w df_sample: brak
Brakujące w df: ['airport_fee']
Różnice w typach danych: brak


Renaming and formatting data

In [187]:
# Rename the green taxi timestamp columns (lpep_*) to the tpep_* names.
df = df.rename(columns={
    'lpep_dropoff_datetime': 'tpep_dropoff_datetime',
    'lpep_pickup_datetime': 'tpep_pickup_datetime'
})
# Remove the two columns that are not kept in the formatted output.
df = df.drop(['ehail_fee', 'trip_type'], axis=1)

# Cast every remaining column to its target dtype.
df = df.astype({
    "VendorID": 'Int32',
    "tpep_pickup_datetime": 'datetime64[us]',
    # Fixed: this key used to repeat "tpep_pickup_datetime". A dict literal keeps only the
    # last value for a repeated key, so the dropoff column was never cast.
    "tpep_dropoff_datetime": 'datetime64[us]',
    "passenger_count": 'Int32',
    "trip_distance": 'Float32',
    "RatecodeID": 'Int32',
    # NOTE(review): if the raw flag is stored as 'Y'/'N' strings, casting to 'bool' presumably
    # turns every non-empty string into True — confirm the raw dtype and the intended mapping.
    "store_and_fwd_flag": 'bool',
    "PULocationID": 'Int32',
    "DOLocationID": 'Int32',
    "payment_type": 'Int32',
    "fare_amount": 'Float32',
    "extra": 'Float32',
    "mta_tax": 'Float32',
    "tip_amount": 'Float32',
    "tolls_amount": 'Float32',
    "improvement_surcharge": 'Float32',
    "total_amount": 'Float32',
    'congestion_surcharge': 'Float32'
})

Saving to parquet

In [189]:
# Write the formatted frame to a single parquet file.
# NOTE(review): `df` was read from the whole 'data/taxi/green_taxi/' directory, yet it is
# written to a file named for 2014 inside that same directory — confirm the target path.
df.to_parquet("data/taxi/green_taxi/2014/green_tripdata_2014.parquet")

#### For analytics, please see the '05_analytics' notebook