In [None]:
from datetime import datetime
import numpy as np
import pandas as pd
import geopandas as gpd
from geopy.geocoders import Nominatim
from shapely.geometry import Point
from shapely import wkb
from shapely import errors
import itertools
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

In [None]:
"""
jan_app_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-01.parquet').sample(frac=0.035, random_state=1)
feb_app_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-02.parquet').sample(frac=0.035, random_state=1)
mar_app_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-03.parquet').sample(frac=0.035, random_state=1)
apr_app_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-04.parquet').sample(frac=0.035, random_state=1)
may_app_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-05.parquet').sample(frac=0.035, random_state=1)
jun_app_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-06.parquet').sample(frac=0.035, random_state=1)
jul_app_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-07.parquet').sample(frac=0.035, random_state=1)
aug_app_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-08.parquet').sample(frac=0.035, random_state=1)
sep_app_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-09.parquet').sample(frac=0.035, random_state=1)
oct_app_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-10.parquet').sample(frac=0.035, random_state=1)
nov_app_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-11.parquet').sample(frac=0.035, random_state=1)
dec_app_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-12.parquet').sample(frac=0.035, random_state=1)
"""

In [None]:
# Load the yellow-cab trips: read only the columns listed below, then keep a
# reproducible 10% sample of the rows

from datetime import datetime
import numpy as np
import pandas as pd 
cab_columns = [
    "tpep_pickup_datetime", "tpep_dropoff_datetime", "trip_distance", "PULocationID", "DOLocationID", "fare_amount",
    "tolls_amount", "Airport_fee", "congestion_surcharge"]
cab_full = pd.read_parquet("/Users/michaelbrady/Downloads/ny_taxi_2024_data.parquet", columns=cab_columns)
cab = cab_full.sample(frac=0.1, random_state=1)
cab

In [None]:
# target label for cab vs. ride share: every cab row is class 0

cab = cab.assign(**{"class": 0})
cab

In [None]:
# Load one month (2024-01) of ride-share trips and keep a reproducible 50% sample.
# The columns are requested in the same order as the cab columns so the two
# frames line up when they are combined.

fhv = pd.read_parquet("/Users/michaelbrady/Downloads/fhvhv_tripdata_2024-01.parquet", columns=[
    'pickup_datetime', 'dropoff_datetime', 'trip_miles', 'PULocationID', 'DOLocationID', 
    'base_passenger_fare', 'tolls', "airport_fee", 'congestion_surcharge']
    ).sample(frac=0.50, random_state=1)
# target label: every ride-share row is class 1 (added last, matching the cab frame)
fhv["class"] = 1
fhv

In [None]:
fhv["pickup_datetime"].describe()

In [None]:
# match columns and concat

# Rename ride-share columns to their cab counterparts by name rather than by
# position, so a change in column order cannot silently mislabel a feature.
fhv_to_cab_columns = {
    "pickup_datetime": "tpep_pickup_datetime",
    "dropoff_datetime": "tpep_dropoff_datetime",
    "trip_miles": "trip_distance",
    "base_passenger_fare": "fare_amount",
    "tolls": "tolls_amount",
    "airport_fee": "Airport_fee",
}
# Selecting cab's columns raises KeyError if any expected column is missing.
fhv = fhv.rename(columns=fhv_to_cab_columns)[list(cab.columns)]
df = pd.concat([cab, fhv], ignore_index=True)
df

In [None]:
#df["passenger_count"] = df["passenger_count"].fillna(1.0)


In [None]:

#df["passenger_count"] = df["passenger_count"].replace(0.0, 1.0)

In [None]:
#df["passenger_count"].value_counts()

In [None]:
df.head()

In [None]:
df["congestion_surcharge"].value_counts()

In [None]:
# Keep only rows whose congestion surcharge is one of the two accepted amounts.
# NOTE(review): ride-share trips may carry other surcharge amounts; check the
# value_counts output so this filter does not drop one class disproportionately.
accepted_surcharges = [2.5, 0.0]
surcharge_ok = df['congestion_surcharge'].isin(accepted_surcharges)
df = df[surcharge_ok]

In [None]:
#df["mta_tax"].value_counts()

In [None]:
# quick, imprecise handling of outlier values

#df = df[df["mta_tax"].isin([0.50, 0.00])]
# One combined mask: trips under 100 distance units, both location IDs below 264,
# and a fare strictly between 0 and 300.
plausible_trip = (
    (df["trip_distance"] < 100)
    & (df["PULocationID"] < 264) & (df["DOLocationID"] < 264)
    & (df["fare_amount"] > 0) & (df["fare_amount"] < 300)
)
# .copy() hands later cells an independent frame to add feature columns to,
# rather than a filtered slice that can trigger SettingWithCopyWarning.
df = df.loc[plausible_trip].copy()
df

In [None]:
# NIK: USE THESE FEATURES

# converting raw datetime to features usable in most ML models

pickup = df['tpep_pickup_datetime']
df['time_diff_seconds'] = (df['tpep_dropoff_datetime'] - pickup).dt.total_seconds()
df['second_of_day'] = pickup.dt.hour * 3600 + pickup.dt.minute * 60 + pickup.dt.second
df['day_of_year'] = pickup.dt.day_of_year

# weekend flag (True/False): pandas weekday 5 and 6 are Saturday and Sunday
df['is_weekend'] = pickup.dt.weekday >= 5

import holidays

# US Holidays
us_holidays = holidays.US()

# holiday flag stored as 0/1
# Look each distinct calendar date up once and map the answer back onto the rows,
# instead of querying the holiday calendar once per trip.
pickup_dates = pickup.dt.date
holiday_flag_by_date = {day: (1 if day in us_holidays else 0) for day in pickup_dates.unique()}
df['is_holiday'] = pickup_dates.map(holiday_flag_by_date)

df

In [None]:
# 0/1 rush-hour flags: weekday pickups (Mon-Fri) whose hour falls in the
# inclusive ranges 7-9 (morning) and 16-18 (evening)
pickup_ts = df["tpep_pickup_datetime"]
on_weekday = pickup_ts.dt.weekday < 5
pickup_hour = pickup_ts.dt.hour
df["morning_rush_hour"] = (on_weekday & pickup_hour.between(7, 9)).astype(int)
df["evening_rush_hour"] = (on_weekday & pickup_hour.between(16, 18)).astype(int)

In [None]:
import seaborn as sns

# check for correlations / collinearity

# numeric_only=True limits the matrix to numeric/boolean columns; the frame still
# carries the raw pickup/dropoff timestamps, which recent pandas versions refuse
# to coerce to float inside corr().
corr_matrix = df.corr(numeric_only=True)


In [None]:
# annotated heatmap of the correlation matrix, drawn on an explicit 8x6 axes
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=heatmap_ax)
plt.show()

In [None]:
taxi_zone_df = pd.read_csv('https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv')

taxi_zone_df.head(10)

In [None]:
taxi_zone_df.tail(10)

In [None]:
taxi_zone_df['service_zone'].value_counts()

In [None]:
# load in geospatial data

zone_long_lat_data = pd.read_parquet('https://data.source.coop/cholmes/nyc-taxi-zones/taxi_zones_4326.parquet')

In [None]:
zone_long_lat_data.head()

In [None]:
zone_long_lat_data["borough"].value_counts()

In [None]:
# borrowed Nik's beautiful code

def safe_wkb_loads(wkb_string):
    """Decode one WKB-encoded geometry, falling back to ``Point(0, 0)`` on failure.

    Args:
        wkb_string: WKB payload in a form accepted by ``shapely.wkb.loads``.

    Returns:
        The decoded shapely geometry, or ``Point(0, 0)`` when shapely reports
        that the payload cannot be read.
    """
    try:
        return wkb.loads(wkb_string)
    except errors.ShapelyError:
        # ShapelyError is the shared base class of shapely's reading errors, so
        # this covers WKB failures; the WKT-specific error class does not on
        # shapely 1.x.
        return Point(0,0)

# decode the WKB geometry column into shapely objects (in place on the zone table)
decoded_geometry = zone_long_lat_data['geometry'].apply(safe_wkb_loads)
zone_long_lat_data['geometry'] = decoded_geometry

# wrap as a GeoDataFrame declared as EPSG:4326, then reproject to EPSG:3857
geo_zone = gpd.GeoDataFrame(
    zone_long_lat_data,
    geometry=zone_long_lat_data['geometry'],
    crs="EPSG:4326",
)
geo_zone_proj = geo_zone.to_crs("EPSG:3857")

# centroid of each zone polygon in the projected coordinates
geo_zone_proj['centroid'] = geo_zone_proj['geometry'].centroid

In [None]:
"""
# NIK: USE THESE FEATURES
# merging zone data w/ df for PU info

zone_data = zone_long_lat_data[["LocationID", "borough"]]
zone_data = zone_data.copy()
zone_data.loc[:, "PULocationID"] = zone_data["LocationID"]
df_w_zones = df.merge(zone_data, on="PULocationID", how="left")
# f_w_zones = df_w_zones[df_w_zones["borough"] == "Manhattan"]
df_encoded = pd.get_dummies(zone_data['borough'], prefix="PU")
df1 = pd.concat([df_w_zones, df_encoded], axis=1)
df1 = df1.drop(columns=["LocationID", "borough"])
df1
"""

In [None]:
"""
# merging zone data w/ df for DO info

zone_data = zone_long_lat_data[["LocationID", "borough"]]
zone_data = zone_data.copy()  # Ensure you're working with a separate copy
zone_data.loc[:, "DOLocationID"] = zone_data["LocationID"]
df1 = df.merge(zone_data, on="DOLocationID", how="left")
df_encoded_2 = pd.get_dummies(zone_data['borough'], prefix="DO")
df1 = pd.concat([df1, df_encoded_2], axis=1)
df1 = df1.drop(columns=["LocationID", "borough"])
df1
"""

In [None]:
# NIK: USE THESE FEATURES
# merging zone data w/ df for PU info + borough one-hot

# Keep one row per zone id: a left merge repeats a trip once for every matching
# zone row, so a repeated LocationID in the zone table would duplicate trips.
pu_data = (
    zone_long_lat_data[["LocationID", "borough"]]
    .drop_duplicates(subset="LocationID")
    .rename(columns={"LocationID": "PULocationID"})
)
pu_dummies = pd.get_dummies(pu_data["borough"], prefix="PU")
pu_data = pd.concat([pu_data, pu_dummies], axis=1).drop(columns=["borough"])
df = df.merge(pu_data, on="PULocationID", how="left")
df = df.drop(columns="PU_EWR") #drop for one-hot
df

In [None]:
# NIK: USE THESE FEATURES
# merging zone data w/ df for DO info + borough one-hot

# Keep one row per zone id: a left merge repeats a trip once for every matching
# zone row, so a repeated LocationID in the zone table would duplicate trips.
do_data = (
    zone_long_lat_data[["LocationID", "borough"]]
    .drop_duplicates(subset="LocationID")
    .rename(columns={"LocationID": "DOLocationID"})
)
do_dummies = pd.get_dummies(do_data["borough"], prefix="DO")
do_data = pd.concat([do_data, do_dummies], axis=1).drop(columns=["borough"])
df = df.merge(do_data, on="DOLocationID", how="left")
df = df.drop(columns="DO_EWR") #drop for one-hot
df

In [None]:
df1 = df

In [None]:
gdf = geo_zone_proj
gdf

In [None]:
# pulling useful data out of "geometry" column

gdf["centroid_x"] = gdf.geometry.centroid.x
gdf["centroid_y"] = gdf.geometry.centroid.y
gdf["area"] = gdf.geometry.area
gdf["perimeter"] = gdf.geometry.length


In [None]:
# limiting geometry features to centroids for now

gdf = gdf.loc[:, ["centroid_x", "centroid_y", "LocationID"]]
gdf

In [None]:
# # NIK: USE THESE FEATURES
# merging geospatial w/ df
# note: these are not lat/long; they are projected centroid coordinates, which
# are equally usable (if not better) as ML features.

# Each lookup is reduced to one row per LocationID before merging: a left merge
# repeats a trip for every matching zone row, so a repeated id would duplicate
# trips. NOTE(review): if ids do repeat, the first centroid listed is the one kept.
gdf["PULocationID"] = gdf["LocationID"]
pu_centroids = (
    gdf.drop_duplicates(subset="LocationID")
    .rename(columns={"centroid_x": "PUx", "centroid_y": "PUy"})
)
df1 = df1.merge(pu_centroids, on="PULocationID", how="left")

# The second lookup also carries LocationID and PULocationID, so this merge
# produces the *_x / *_y suffixed id columns that are dropped afterwards.
gdf["DOLocationID"] = gdf["LocationID"]
do_centroids = (
    gdf.drop_duplicates(subset="LocationID")
    .rename(columns={"centroid_x": "DOx", "centroid_y": "DOy"})
)
df1 = df1.merge(do_centroids, on="DOLocationID", how="left")

df1


In [None]:

# remove the location-id columns (including the suffixed copies created by the
# two centroid merges); only the coordinate features are kept
leftover_id_columns = ["LocationID_x", "LocationID_y", "PULocationID_x", "PULocationID_y", "DOLocationID"]
df1 = df1.drop(columns=leftover_id_columns)

df1


In [None]:
# rename columns for easier reference

# The names are assigned by position, so they must stay in the frame's column order.
# A flat list yields a plain Index; wrapping the names in a second list makes
# pandas build a one-level MultiIndex instead.
df1.columns = [
    "PUtime", "DOtime", "distance", "fare", "tolls", "airport", "congestion", "class", 
    "duration(sec)", "second_of_day", "day_of_year", "weekend", "holiday", "morning_rush", "evening_rush",
    "PU_Bronx", "PU_Brooklyn", "PU_Manhattan", "PU_Queens", "PU_Staten Island", 
    "DO_Bronx", "DO_Brooklyn", "DO_Manhattan", "DO_Queens", "DO_Staten Island", 
    "PUx", "PUy", "DOx", "DOy",
]
df1.head()

In [None]:
# NIK, take a look at this example to assess similarities and differences

# example of a dataframe for ML modeling (pre-scaling, incomplete features, etc.)

example_feature_columns = [
    "second_of_day", "day_of_year", "weekend", "holiday",
    "PUx", "PUy", "DOx", "DOy",
    "distance", "duration(sec)", "fare", "tolls", "airport", "congestion", "class",
    "PU_Bronx", "PU_Brooklyn", "PU_Manhattan", "PU_Queens", "PU_Staten Island",
    "DO_Bronx", "DO_Brooklyn", "DO_Manhattan", "DO_Queens", "DO_Staten Island",
]
example_partial_unscaled_df_for_ML = df1[example_feature_columns]
example_partial_unscaled_df_for_ML

END OF REAL WORK - STOP READING

In [None]:
# Straight-line distance between every ordered pair of zone centroids.
# Each pair is keyed by the zones' LocationID values, not the DataFrame row
# labels, so the result can be merged on PULocationID / DOLocationID.
# NOTE(review): distances are measured in the projected (EPSG:3857) coordinates
# and converted as if they were ground metres; confirm that is accurate enough.
METERS_TO_MILES = 0.000621371
zone_centroids = list(zip(geo_zone_proj['LocationID'], geo_zone_proj['centroid']))

distances = [
    {
        'PULocationID': pu_id,
        'DOLocationID': do_id,
        'distance_miles': pu_centroid.distance(do_centroid) * METERS_TO_MILES,
    }
    for (pu_id, pu_centroid), (do_id, do_centroid) in itertools.product(zone_centroids, repeat=2)
]

distance_result_df = pd.DataFrame(distances)

distance_result_df.tail(30)

In [None]:
df = df.merge(distance_result_df, on=['PULocationID', 'DOLocationID'], how='left')
df

In [None]:
df.loc[500000:500020]

In [None]:
# relative gap between the recorded trip distance and the centroid distance,
# as a percentage of the larger of the two
distance_gap = (df['trip_distance'] - df['distance_miles']).abs()
larger_distance = df[['trip_distance', 'distance_miles']].max(axis=1)
df['percentage_difference'] = (distance_gap / larger_distance) * 100

# trips where the two distances disagree by more than 50%
major_gap = df['percentage_difference'] > 50
df_major_diff = df[major_gap]

df_major_diff

In [None]:
# Implied speeds (miles per hour) from the recorded distance and from the
# centroid-to-centroid distance. .assign() builds a new frame, so no values are
# written into a slice of df (which can trigger SettingWithCopyWarning).
df_distance = df[["trip_distance", "distance_miles"]].assign(
    trip_hours=df["time_diff_seconds"] / 60 / 60
)
df_distance = df_distance.assign(
    mph_data=df_distance["trip_distance"] / df_distance["trip_hours"],
    mph_centroids=df_distance["distance_miles"] / df_distance["trip_hours"],
)


In [None]:
# Centroid distances won't work. Here's why:

df_distance.loc[500000:500020]

In [None]:

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the data
# NOTE(review): gdf holds the centroid coordinates plus the LocationID-derived
# id columns at this point; confirm the ids are meant to be PCA inputs.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(gdf)

# Apply PCA
pca = PCA(n_components=4) 
location_pca = pca.fit_transform(X_scaled)

# Convert to DataFrame, one column per principal component
location_pca = pd.DataFrame(location_pca, columns=["PC1", "PC2", "PC3", "PC4"])

location_pca


In [None]:
pca.explained_variance_ratio_


In [None]:
centroid_df = gdf.loc[:, ["centroid_x", "centroid_y"]]

In [None]:
scaler = StandardScaler()
centroid_scaled = scaler.fit_transform(centroid_df)

pca2 = PCA(n_components=1) 
centroid_pca = pca2.fit_transform(centroid_scaled)

centroid_pca = pd.DataFrame(centroid_pca, columns=["PC1"])

centroid_pca

In [None]:
pca2.explained_variance_ratio_

In [None]:
'''
pass_impute = df1.drop(columns=["start_time", "end_time", "rating"])
pass_impute
'''

In [None]:
# Boolean mask of rows whose pass_count is missing or exactly 0.0.
# NOTE(review): in the visible notebook `pass_impute` is only assigned inside the
# quoted-out string of the preceding cell, so this raises NameError on a fresh
# kernel unless it is defined elsewhere — confirm.
rows_to_fill = pass_impute["pass_count"].isnull()|(pass_impute["pass_count"] == 0.0)
rows_to_fill

In [None]:
train_data = pass_impute[~rows_to_fill]
test_data = pass_impute[rows_to_fill]
train_data

In [None]:
train_data.isnull().value_counts()

In [None]:
X = train_data.drop(columns=["pass_count", "vendor"])
y = train_data["pass_count"]
X

In [None]:
from sklearn.metrics import mean_squared_error
# NOTE(review): `rf`, `X_test` and `y_test` are not created in any visible cell —
# confirm where the fitted model and the train/test split come from.
# Predictions are rounded to the nearest whole number before scoring.
y_pred = np.round(rf.predict(X_test))
mse = mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE
rmse = np.sqrt(mse)
print(f"MSE is {mse}")
print(f"RMSE is {rmse}")

In [None]:
y_train_pred = np.round(rf.predict(X_train))

In [None]:
t_mse = mean_squared_error(y_train, y_train_pred)
t_rmse = np.sqrt(t_mse)
print(f"Training MSE is {t_mse}")
print(f"Training RMSE is {t_rmse}")

In [None]:
# Columns passed to the model when predicting the rows selected for filling.
# NOTE(review): presumably these must match the columns `rf` was fitted on — verify.
features = ["trip_dist", "fare", "tip", "elapsed"]


# Predict for the rows in test_data, round to whole numbers, and write the
# predictions into pass_count for the rows flagged by rows_to_fill.
pass_preds = np.round(rf.predict(test_data[features]))
pass_impute.loc[rows_to_fill, "pass_count"] = pass_preds

pass_impute


In [None]:
plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.show()

In [None]:
pass_preds = pd.Series(pass_preds)

In [None]:
pass_impute["pass_count"].value_counts()

In [None]:
importances = rf.feature_importances_
feature_names = X_train.columns
sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True)


In [None]:
# Drop vendor column
# Try adding start_time as hour and/or weekend column

In [None]:
pass_preds.value_counts()