In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
from geopy.geocoders import Nominatim
from shapely.geometry import Point
from shapely import wkb
from shapely import errors
import itertools
import matplotlib.pyplot as plt
import datetime as dt
import holidays
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score


In [None]:
# Read the prepared modelling table from its parquet file into `df`.
df = pd.read_parquet(
    "../nik_folder/Resources/fixed_model_df.parquet",
)

In [None]:
# Lift the column-display cap so no column is elided, then preview the frame.
pd.set_option("display.max_columns", None)
df

In [None]:
# Keep every column visible when the frame is displayed.
pd.set_option("display.max_columns", None)

# Strip parentheses, quotes and commas out of the column labels, then trim
# any surrounding whitespace.
df.columns = (
    df.columns
    .str.replace(r"[()',]", "", regex=True)
    .str.strip()
)

# Encode booleans as 1/0 and rename "fore" -> "fare", "class" -> "service".
df = (
    df
    .replace({True: 1, False: 0})
    .rename(columns={"fore": "fare", "class": "service"})
)
df

In [None]:
# Number of rows per service code.
df.loc[:, "service"].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

In [None]:
# One frame per service code (0 = taxi, 1 = uber, 2 = lyft), keeping only
# rows whose fare is strictly positive.
df_taxi = df.loc[df["service"].eq(0) & df["fare"].gt(0)]
df_uber = df.loc[df["service"].eq(1) & df["fare"].gt(0)]
df_lyft = df.loc[df["service"].eq(2) & df["fare"].gt(0)]

# Predictor columns. `total` is deliberately left out to avoid target leakage.
# NOTE(review): "evening rush" uses a space while "morning_rush" uses an
# underscore -- confirm it matches the actual column label.
features = [
    "second_of_day", "day_of_year",
    "PUx", "PUy", "DOx", "DOy",
    "distance",
    "morning_rush", "evening rush",
    "prcp", "temp",
    "holiday", "weekend", "airport", "congestion",
    "PU_Bronx", "PU_Brooklyn", "PU_Manhattan", "PU_Queens", "PU_Staten Island",
    "DO_Bronx", "DO_Brooklyn", "DO_Manhattan", "DO_Queens", "DO_Staten Island",
]

In [None]:
"""
taxi_model = LinearRegression().fit(df_taxi[features], df_taxi["fare"])
uber_model = LinearRegression().fit(df_uber[features], df_uber["fare"])
lyft_model = LinearRegression().fit(df_lyft[features], df_lyft["fare"])
"""

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV

# final NaN check
df = df.dropna()

# df_taxi / df_uber / df_lyft were sliced from `df` in an earlier cell, so the
# dropna above never reaches them. Drop incomplete rows from the per-service
# frames as well, looking only at the columns the models actually use.
model_columns = features + ["fare"]
df_taxi = df_taxi.dropna(subset=model_columns)
df_uber = df_uber.dropna(subset=model_columns)
df_lyft = df_lyft.dropna(subset=model_columns)

# Taxi model inputs: fare is the target, `features` are the predictors.
y = df_taxi["fare"]
X = df_taxi[features]

# train test split (80% train / 20% test, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)


In [None]:

# Candidate hyper-parameter values sampled by the randomized search.
param_dist = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 6, 9],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 0.1, 0.2],
}

# Taxi model: 10 sampled configurations, 3-fold cross-validation, fixed seeds.
random_search = RandomizedSearchCV(
    xgb.XGBRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)
print("Best Parameters:", random_search.best_params_)

# Score the best estimator on the held-out split.
y_pred_xgb = random_search.best_estimator_.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_via_metric = r2_score(y_test, y_pred_xgb)

print(f"XGBoost MAE: {mae_xgb:.2f}")
print("R² via r2_score:", r2_via_metric)

In [None]:
# Summary statistics of the taxi fares, for context when reading the MAE.
df_taxi.loc[:, "fare"].describe()

In [None]:
# Uber model inputs: `features` are the predictors, fare is the target.
X = df_uber[features]
y = df_uber["fare"]

# 80/20 split with the same seed as the taxi cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

# Same search set-up as the taxi model, reusing `param_dist`.
random_search = RandomizedSearchCV(
    xgb.XGBRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)
print("Best Parameters:", random_search.best_params_)

# Score the best estimator on the held-out split.
y_pred_xgb = random_search.best_estimator_.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_via_metric = r2_score(y_test, y_pred_xgb)

print(f"XGBoost MAE: {mae_xgb}")
print("R² via r2_score:", r2_via_metric)



In [None]:
# Lyft model inputs: `features` are the predictors, fare is the target.
X = df_lyft[features]
y = df_lyft["fare"]

# 80/20 split with the same seed as the other service cells.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

# Same search set-up as the other services, reusing `param_dist`.
random_search = RandomizedSearchCV(
    xgb.XGBRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)
print("Best Parameters:", random_search.best_params_)

# Score the best estimator on the held-out split.
y_pred_xgb = random_search.best_estimator_.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_via_metric = r2_score(y_test, y_pred_xgb)

print(f"XGBoost MAE: {mae_xgb}")
print("R² via r2_score:", r2_via_metric)


In [None]:
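# NOTE(review): disabled export. No `model` variable is defined in the cells
# above; if re-enabled, dump a fitted estimator such as
# `random_search.best_estimator_` instead.
#import joblib

#joblib.dump(model, "log_regres_first.pkl")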


In [None]:
# fix class imbalance

#undersampler = RandomUnderSampler(sampling_strategy="auto", random_state=42)

#X_train, y_train = undersampler.fit_resample(X_train, y_train)