In [1]:
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.tree import DecisionTreeRegressor
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
warnings.filterwarnings('ignore')

In [None]:
# CSV download endpoint on the City of New York open-data portal.
url = "https://data.cityofnewyork.us/api/views/2yzn-sicd/rows.csv?accessType=DOWNLOAD"
# Read at most the first two million rows of the export into memory.
df = pd.read_csv(filepath_or_buffer=url, nrows=2_000_000)

In [None]:
# Parse the pickup/drop-off timestamps and derive the trip duration in minutes.
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
df['trip_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds()/60

# Keep only trips with a strictly positive duration. The explicit .copy()
# makes the result an independent frame, so the column assignment below
# writes to `df` itself rather than to a possible view of the unfiltered data.
df = df[df['trip_duration'] > 0].copy()

# Human-readable vendor label; IDs other than 1 and 2 pass through unchanged.
df['VendorID_text'] = df['VendorID'].replace({1: 'Vendor_A', 2: 'Vendor_B'})

# Restrict to the modelling columns (features plus the trip_duration target).
df = df[['VendorID_text', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
         'PULocationID', 'DOLocationID', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
         'tolls_amount', 'improvement_surcharge', 'total_amount', 'trip_duration']]

# Drop incomplete rows, then rebuild a contiguous 0..n-1 index: the filtering
# above leaves gaps in the labels, and positional indices (such as those
# produced by scikit-learn splitters) must line up with the row labels.
df = df.dropna().reset_index(drop=True)

In [None]:
# Histograms of every numeric column. DataFrame.hist creates its own figure
# (sized by `figsize`), so no plt.figure() call precedes it -- that would only
# leave an extra, empty figure in the output.
df.hist(bins=50, figsize=(15,10))
plt.tight_layout()
plt.show()

# Fare vs. distance on a random subsample. The sample size is capped at the
# frame length so small frames do not raise, and the draw is seeded so the
# figure is reproducible across runs.
scatter_sample = df.sample(n=min(10000, len(df)), random_state=42)
plt.figure(figsize=(10,8))
sns.scatterplot(x='trip_distance', y='fare_amount', data=scatter_sample)
plt.title("Fare amount vs. trip distance (random sample)")
plt.show()

# Correlations are only defined for numeric columns; selecting them explicitly
# avoids the error DataFrame.corr raises on string columns in pandas >= 2.0.
numeric_df = df.select_dtypes(include=[np.number])
plt.figure(figsize=(12,10))
sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f")
plt.title("Correlation matrix of numeric columns")
plt.show()

# Box plots of the numeric columns on a shared axis (columns are unscaled).
plt.figure(figsize=(12,8))
sns.boxplot(data=numeric_df)
plt.xticks(rotation=90)
plt.title("Distribution of numeric columns")
plt.show()

In [None]:
# Numeric model inputs. The prediction target 'trip_duration' is deliberately
# NOT listed here: including it would feed the answer to the model as a
# feature (target leakage) and make every evaluation score meaningless.
# NOTE(review): fare/tip/total amounts are presumably only known once a trip
# has ended -- confirm they are available at prediction time.
num_features = ['passenger_count','trip_distance','fare_amount','extra','mta_tax','tip_amount','tolls_amount','improvement_surcharge','total_amount']
# Columns treated as categories and one-hot encoded.
cat_features = ['VendorID_text','RatecodeID','store_and_fwd_flag','PULocationID','DOLocationID']

# Numeric branch: fill gaps with the column median, then standardise.
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy="median")),('std_scaler', StandardScaler())])
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are encoded as all zeros.
cat_pipeline = Pipeline([('imputer', SimpleImputer(strategy="most_frequent")),('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group through its branch; columns in neither list are
# dropped (the ColumnTransformer default).
full_pipeline = ColumnTransformer([("num", num_pipeline, num_features),("cat", cat_pipeline, cat_features)])
df_prepared = full_pipeline.fit_transform(df)

In [None]:
# Bucket the target into duration bands (minutes) so the train/test split can
# preserve the proportion of short and long trips.
df['trip_duration_bin'] = pd.cut(df['trip_duration'], bins=[0,5,10,20,40,80, np.inf], labels=[0,1,2,3,4,5])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(df, df['trip_duration_bin']))

# The splitter returns POSITIONAL row indices, so they must be applied with
# .iloc. Using .loc would look them up as index labels, which selects the
# wrong rows (or raises KeyError) whenever the index is not a clean 0..n-1
# range -- as is the case after rows have been filtered out.
strat_train_set = df.iloc[train_index]
strat_test_set = df.iloc[test_index]

# The band column was only needed for stratification.
df_train = strat_train_set.drop("trip_duration_bin", axis=1)
df_test = strat_test_set.drop("trip_duration_bin", axis=1)

# Fit the preprocessing on the training rows only, then apply the fitted
# transform to the test rows so no test-set statistics leak into training.
X_train = full_pipeline.fit_transform(df_train)
y_train = df_train['trip_duration'].values
X_test = full_pipeline.transform(df_test)
y_test = df_test['trip_duration'].values

In [None]:
# Candidate regressors, keyed by the label shown in the results table.
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42)
}

# Mean 5-fold cross-validated RMSE per model on the training set.
cv_scores = {}
for name, model in models.items():
    neg_rmse_per_fold = cross_val_score(
        estimator=model,
        X=X_train,
        y=y_train,
        cv=5,
        scoring="neg_root_mean_squared_error",
    )
    # The scorer reports negated RMSE; flip the sign to get plain RMSE.
    scores = -neg_rmse_per_fold
    cv_scores[name] = scores.mean()

results = pd.DataFrame({
    "Model": list(cv_scores.keys()),
    "CV_RMSE": list(cv_scores.values()),
})
print(results)

In [None]:
# Hyper-parameter grid for the random forest: 2 x 3 x 2 = 12 combinations.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

# Exhaustive 5-fold search scored by (negated) RMSE; train scores are kept
# alongside the validation scores in cv_results_.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

# Estimator refit on the whole training set with the winning parameters.
best_model = grid_search.best_estimator_
print(grid_search.best_params_)

In [None]:
# Evaluate the tuned model on the held-out test set.
final_predictions = best_model.predict(X_test)

# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
final_r2 = r2_score(y_test, final_predictions)
print("Final RMSE:", final_rmse)
print("Final R2:", final_r2)

# Predicted vs. true values; points on the red diagonal are perfect predictions.
plt.figure(figsize=(10,6))
plt.scatter(y_test, final_predictions, alpha=0.3)
plt.xlabel("True Trip Duration")
plt.ylabel("Predicted Trip Duration")
# NumPy reductions instead of the builtin min()/max(), which iterate the
# array element by element in Python.
diag_lo, diag_hi = y_test.min(), y_test.max()
plt.plot([diag_lo, diag_hi], [diag_lo, diag_hi], color='red')
plt.show()

In [None]:
# Seed TensorFlow so weight initialisation, dropout masks and shuffling are
# repeatable, in line with random_state=42 on the scikit-learn estimators.
tf.random.set_seed(42)

# Small fully connected regressor: two ReLU hidden layers with dropout and a
# single linear output unit for the predicted duration.
dl_model = Sequential()
dl_model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
dl_model.add(Dropout(0.2))
dl_model.add(Dense(64, activation='relu'))
dl_model.add(Dropout(0.2))
dl_model.add(Dense(1))
dl_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# NOTE(review): X_train may be a SciPy sparse matrix (the one-hot branch of
# the preprocessing produces sparse output) -- confirm that this Keras
# version accepts it together with validation_split.
dl_model.fit(X_train, y_train, epochs=10, batch_size=256, validation_split=0.2, verbose=0)

# predict() returns shape (n_samples, 1); flatten to a 1-D vector for the metrics.
dl_predictions = dl_model.predict(X_test).flatten()

# sqrt(MSE) instead of `squared=False`, which scikit-learn deprecated in 1.4
# and removed in 1.6.
dl_rmse = np.sqrt(mean_squared_error(y_test, dl_predictions))
dl_r2 = r2_score(y_test, dl_predictions)
print("Deep Learning RMSE:", dl_rmse)
print("Deep Learning R2:", dl_r2)

In [None]:
# Overfitting demonstration: a large, deep forest trained on only 10% of the
# training rows, scored on those same rows and on the held-out test set.
overfit_model = RandomForestRegressor(n_estimators=200, max_depth=50, random_state=42)
X_small, _, y_small, _ = train_test_split(X_train, y_train, train_size=0.1, random_state=42)
overfit_model.fit(X_small, y_small)

train_pred = overfit_model.predict(X_small)
test_pred = overfit_model.predict(X_test)

# sqrt(MSE) instead of `squared=False`, which scikit-learn deprecated in 1.4
# and removed in 1.6.
overfit_train_rmse = np.sqrt(mean_squared_error(y_small, train_pred))
overfit_test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))

# A large gap between the two numbers is the overfitting signal.
print("Overfit Model - Train RMSE:", overfit_train_rmse)
print("Overfit Model - Test RMSE:", overfit_test_rmse)

In [None]:
# Persist the fitted preprocessing pipeline and the tuned model so the
# Streamlit app -- which runs in its own process and cannot see notebook
# variables -- has something to load.
with open("model_artifacts.pkl", "wb") as f:
    pickle.dump({"full_pipeline": full_pipeline, "best_model": best_model}, f)

# Source of the standalone Streamlit app. It must import everything it uses
# and load the fitted objects itself; previously it referenced `pd`,
# `full_pipeline` and `best_model` without defining them and failed with
# NameError as soon as a file was uploaded.
deployment_code = '''
import pickle

import numpy as np
import pandas as pd
import streamlit as st

# Load the artifacts written by the training notebook. Only load pickle files
# you trust: unpickling can execute arbitrary code.
with open("model_artifacts.pkl", "rb") as f:
    artifacts = pickle.load(f)
full_pipeline = artifacts["full_pipeline"]
best_model = artifacts["best_model"]

st.title("Trip Duration Prediction")
uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
if uploaded_file is not None:
    # The CSV must contain the same feature columns the pipeline was fitted on.
    data = pd.read_csv(uploaded_file)
    data_prepared = full_pipeline.transform(data)
    prediction = best_model.predict(data_prepared)
    st.write("Predicted Trip Duration (in minutes):", prediction)
'''
with open("app.py", "w") as f:
    f.write(deployment_code)