In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from scipy import stats
import joblib

In [4]:
# 1. Data Loading and Initial Exploration
# ---------------------------------------

# Load the data
df = pd.read_csv('uber_data.csv')

# Display basic information about the dataset
print(df.info())
print(df.head())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19533 entries, 0 to 19532
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   trip_id                19533 non-null  int64  
 1   tpep_pickup_datetime   19533 non-null  object 
 2   tpep_dropoff_datetime  19533 non-null  object 
 3   pick_hour              19533 non-null  int64  
 4   pick_day               19533 non-null  int64  
 5   pick_month             19533 non-null  int64  
 6   pick_year              19533 non-null  int64  
 7   pick_weekday           19533 non-null  int64  
 8   passenger_count        19533 non-null  int64  
 9   trip_distance          19533 non-null  float64
 10  rate_code_name         19533 non-null  object 
 11  pickup_latitude        19533 non-null  float64
 12  pickup_longitude       19533 non-null  float64
 13  dropoff_latitude       19533 non-null  float64
 14  dropoff_longitude      19533 non-null  float64
 15  pa

In [6]:
df

Unnamed: 0,trip_id,tpep_pickup_datetime,tpep_dropoff_datetime,pick_hour,pick_day,pick_month,pick_year,pick_weekday,passenger_count,trip_distance,...,dropoff_latitude,dropoff_longitude,payment_type_name,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,16220,2016-03-10T12:46:59.0000000,2016-03-10T13:12:12.0000000,12,10,3,2016,5,1,1.41,...,40.784561,-73.949654,Cash,16.0,0.0,0.5,0.00,0.00,0.3,16.80
1,14662,2016-03-10T13:03:27.0000000,2016-03-10T13:13:49.0000000,13,10,3,2016,5,1,13.03,...,40.714912,-74.006432,Credit card,8.0,0.0,0.5,2.20,0.00,0.3,11.00
2,17066,2016-03-01T00:18:49.0000000,2016-03-01T00:27:44.0000000,0,1,3,2016,3,6,7.79,...,40.781975,-73.979523,Cash,9.0,0.5,0.5,0.00,0.00,0.3,10.30
3,19239,2016-03-01T00:53:30.0000000,2016-03-01T01:01:22.0000000,0,1,3,2016,3,1,16.54,...,40.764671,-73.993423,Credit card,9.5,0.5,0.5,2.15,0.00,0.3,12.95
4,16211,2016-03-10T12:39:18.0000000,2016-03-10T13:20:06.0000000,12,10,3,2016,5,1,9.57,...,40.673767,-73.939804,Credit card,40.0,0.0,0.5,9.27,5.54,0.3,55.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19528,13092,2016-03-10T08:42:48.0000000,2016-03-10T09:23:30.0000000,8,10,3,2016,5,1,13.54,...,40.766903,-73.898041,Cash,33.5,0.0,0.5,0.00,5.54,0.3,39.84
19529,4106,2016-03-10T11:26:46.0000000,2016-03-10T11:35:09.0000000,11,10,3,2016,5,1,3.06,...,40.739689,-73.976440,Cash,7.0,0.0,0.5,0.00,0.00,0.3,7.80
19530,2917,2016-03-10T09:49:01.0000000,2016-03-10T09:49:34.0000000,9,10,3,2016,5,5,0.87,...,40.758339,-73.989159,Cash,2.5,0.0,0.5,0.00,0.00,0.3,3.30
19531,2917,2016-03-10T09:49:01.0000000,2016-03-10T09:49:34.0000000,9,10,3,2016,5,5,0.87,...,40.758339,-73.989159,Cash,2.5,0.0,0.5,0.00,0.00,0.3,3.30


In [8]:
# 2. Data Preprocessing and Feature Engineering
# ---------------------------------------------

# Convert datetime columns to datetime type
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

In [10]:
df

Unnamed: 0,trip_id,tpep_pickup_datetime,tpep_dropoff_datetime,pick_hour,pick_day,pick_month,pick_year,pick_weekday,passenger_count,trip_distance,...,dropoff_latitude,dropoff_longitude,payment_type_name,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,16220,2016-03-10 12:46:59,2016-03-10 13:12:12,12,10,3,2016,5,1,1.41,...,40.784561,-73.949654,Cash,16.0,0.0,0.5,0.00,0.00,0.3,16.80
1,14662,2016-03-10 13:03:27,2016-03-10 13:13:49,13,10,3,2016,5,1,13.03,...,40.714912,-74.006432,Credit card,8.0,0.0,0.5,2.20,0.00,0.3,11.00
2,17066,2016-03-01 00:18:49,2016-03-01 00:27:44,0,1,3,2016,3,6,7.79,...,40.781975,-73.979523,Cash,9.0,0.5,0.5,0.00,0.00,0.3,10.30
3,19239,2016-03-01 00:53:30,2016-03-01 01:01:22,0,1,3,2016,3,1,16.54,...,40.764671,-73.993423,Credit card,9.5,0.5,0.5,2.15,0.00,0.3,12.95
4,16211,2016-03-10 12:39:18,2016-03-10 13:20:06,12,10,3,2016,5,1,9.57,...,40.673767,-73.939804,Credit card,40.0,0.0,0.5,9.27,5.54,0.3,55.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19528,13092,2016-03-10 08:42:48,2016-03-10 09:23:30,8,10,3,2016,5,1,13.54,...,40.766903,-73.898041,Cash,33.5,0.0,0.5,0.00,5.54,0.3,39.84
19529,4106,2016-03-10 11:26:46,2016-03-10 11:35:09,11,10,3,2016,5,1,3.06,...,40.739689,-73.976440,Cash,7.0,0.0,0.5,0.00,0.00,0.3,7.80
19530,2917,2016-03-10 09:49:01,2016-03-10 09:49:34,9,10,3,2016,5,5,0.87,...,40.758339,-73.989159,Cash,2.5,0.0,0.5,0.00,0.00,0.3,3.30
19531,2917,2016-03-10 09:49:01,2016-03-10 09:49:34,9,10,3,2016,5,5,0.87,...,40.758339,-73.989159,Cash,2.5,0.0,0.5,0.00,0.00,0.3,3.30


In [12]:
# Calculate trip duration in minutes
df['trip_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

In [14]:
df

Unnamed: 0,trip_id,tpep_pickup_datetime,tpep_dropoff_datetime,pick_hour,pick_day,pick_month,pick_year,pick_weekday,passenger_count,trip_distance,...,dropoff_longitude,payment_type_name,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,trip_duration
0,16220,2016-03-10 12:46:59,2016-03-10 13:12:12,12,10,3,2016,5,1,1.41,...,-73.949654,Cash,16.0,0.0,0.5,0.00,0.00,0.3,16.80,25.216667
1,14662,2016-03-10 13:03:27,2016-03-10 13:13:49,13,10,3,2016,5,1,13.03,...,-74.006432,Credit card,8.0,0.0,0.5,2.20,0.00,0.3,11.00,10.366667
2,17066,2016-03-01 00:18:49,2016-03-01 00:27:44,0,1,3,2016,3,6,7.79,...,-73.979523,Cash,9.0,0.5,0.5,0.00,0.00,0.3,10.30,8.916667
3,19239,2016-03-01 00:53:30,2016-03-01 01:01:22,0,1,3,2016,3,1,16.54,...,-73.993423,Credit card,9.5,0.5,0.5,2.15,0.00,0.3,12.95,7.866667
4,16211,2016-03-10 12:39:18,2016-03-10 13:20:06,12,10,3,2016,5,1,9.57,...,-73.939804,Credit card,40.0,0.0,0.5,9.27,5.54,0.3,55.61,40.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19528,13092,2016-03-10 08:42:48,2016-03-10 09:23:30,8,10,3,2016,5,1,13.54,...,-73.898041,Cash,33.5,0.0,0.5,0.00,5.54,0.3,39.84,40.700000
19529,4106,2016-03-10 11:26:46,2016-03-10 11:35:09,11,10,3,2016,5,1,3.06,...,-73.976440,Cash,7.0,0.0,0.5,0.00,0.00,0.3,7.80,8.383333
19530,2917,2016-03-10 09:49:01,2016-03-10 09:49:34,9,10,3,2016,5,5,0.87,...,-73.989159,Cash,2.5,0.0,0.5,0.00,0.00,0.3,3.30,0.550000
19531,2917,2016-03-10 09:49:01,2016-03-10 09:49:34,9,10,3,2016,5,5,0.87,...,-73.989159,Cash,2.5,0.0,0.5,0.00,0.00,0.3,3.30,0.550000


In [16]:
# Create time-based features
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
df['pickup_day'] = df['tpep_pickup_datetime'].dt.day
df['pickup_month'] = df['tpep_pickup_datetime'].dt.month
df['pickup_year'] = df['tpep_pickup_datetime'].dt.year
df['pickup_weekday'] = df['tpep_pickup_datetime'].dt.weekday

In [18]:
df

Unnamed: 0,trip_id,tpep_pickup_datetime,tpep_dropoff_datetime,pick_hour,pick_day,pick_month,pick_year,pick_weekday,passenger_count,trip_distance,...,tip_amount,tolls_amount,improvement_surcharge,total_amount,trip_duration,pickup_hour,pickup_day,pickup_month,pickup_year,pickup_weekday
0,16220,2016-03-10 12:46:59,2016-03-10 13:12:12,12,10,3,2016,5,1,1.41,...,0.00,0.00,0.3,16.80,25.216667,12,10,3,2016,3
1,14662,2016-03-10 13:03:27,2016-03-10 13:13:49,13,10,3,2016,5,1,13.03,...,2.20,0.00,0.3,11.00,10.366667,13,10,3,2016,3
2,17066,2016-03-01 00:18:49,2016-03-01 00:27:44,0,1,3,2016,3,6,7.79,...,0.00,0.00,0.3,10.30,8.916667,0,1,3,2016,1
3,19239,2016-03-01 00:53:30,2016-03-01 01:01:22,0,1,3,2016,3,1,16.54,...,2.15,0.00,0.3,12.95,7.866667,0,1,3,2016,1
4,16211,2016-03-10 12:39:18,2016-03-10 13:20:06,12,10,3,2016,5,1,9.57,...,9.27,5.54,0.3,55.61,40.800000,12,10,3,2016,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19528,13092,2016-03-10 08:42:48,2016-03-10 09:23:30,8,10,3,2016,5,1,13.54,...,0.00,5.54,0.3,39.84,40.700000,8,10,3,2016,3
19529,4106,2016-03-10 11:26:46,2016-03-10 11:35:09,11,10,3,2016,5,1,3.06,...,0.00,0.00,0.3,7.80,8.383333,11,10,3,2016,3
19530,2917,2016-03-10 09:49:01,2016-03-10 09:49:34,9,10,3,2016,5,5,0.87,...,0.00,0.00,0.3,3.30,0.550000,9,10,3,2016,3
19531,2917,2016-03-10 09:49:01,2016-03-10 09:49:34,9,10,3,2016,5,5,0.87,...,0.00,0.00,0.3,3.30,0.550000,9,10,3,2016,3


In [20]:
# New feature: Rush hour flag
df['is_rush_hour'] = ((df['pickup_hour'] >= 7) & (df['pickup_hour'] <= 10) | 
                      (df['pickup_hour'] >= 16) & (df['pickup_hour'] <= 19)).astype(int)

# New feature: Weekend flag
df['is_weekend'] = (df['pickup_weekday'] >= 5).astype(int)

# New feature: Night time flag
df['is_night'] = ((df['pickup_hour'] >= 22) | (df['pickup_hour'] <= 5)).astype(int)

# New feature: Trip speed (mph)
df['trip_speed'] = df['trip_distance'] / (df['trip_duration'] / 60)

In [22]:
df

Unnamed: 0,trip_id,tpep_pickup_datetime,tpep_dropoff_datetime,pick_hour,pick_day,pick_month,pick_year,pick_weekday,passenger_count,trip_distance,...,trip_duration,pickup_hour,pickup_day,pickup_month,pickup_year,pickup_weekday,is_rush_hour,is_weekend,is_night,trip_speed
0,16220,2016-03-10 12:46:59,2016-03-10 13:12:12,12,10,3,2016,5,1,1.41,...,25.216667,12,10,3,2016,3,0,0,0,3.354924
1,14662,2016-03-10 13:03:27,2016-03-10 13:13:49,13,10,3,2016,5,1,13.03,...,10.366667,13,10,3,2016,3,0,0,0,75.414791
2,17066,2016-03-01 00:18:49,2016-03-01 00:27:44,0,1,3,2016,3,6,7.79,...,8.916667,0,1,3,2016,1,0,0,1,52.418692
3,19239,2016-03-01 00:53:30,2016-03-01 01:01:22,0,1,3,2016,3,1,16.54,...,7.866667,0,1,3,2016,1,0,0,1,126.152542
4,16211,2016-03-10 12:39:18,2016-03-10 13:20:06,12,10,3,2016,5,1,9.57,...,40.800000,12,10,3,2016,3,0,0,0,14.073529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19528,13092,2016-03-10 08:42:48,2016-03-10 09:23:30,8,10,3,2016,5,1,13.54,...,40.700000,8,10,3,2016,3,1,0,0,19.960688
19529,4106,2016-03-10 11:26:46,2016-03-10 11:35:09,11,10,3,2016,5,1,3.06,...,8.383333,11,10,3,2016,3,0,0,0,21.900596
19530,2917,2016-03-10 09:49:01,2016-03-10 09:49:34,9,10,3,2016,5,5,0.87,...,0.550000,9,10,3,2016,3,1,0,0,94.909091
19531,2917,2016-03-10 09:49:01,2016-03-10 09:49:34,9,10,3,2016,5,5,0.87,...,0.550000,9,10,3,2016,3,1,0,0,94.909091


In [24]:
# 3. Handling Outliers
# --------------------

# Function to remove outliers using IQR method
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Remove outliers from key numerical columns
for column in ['fare_amount', 'trip_distance', 'trip_duration', 'trip_speed']:
    df = remove_outliers(df, column)

print(f"Shape after removing outliers: {df.shape}")

Shape after removing outliers: (16143, 33)


In [26]:
# 4. Feature Selection
# --------------------

features = ['pickup_hour', 'pickup_day', 'pickup_month', 'pickup_year', 'pickup_weekday',
            'passenger_count', 'trip_distance', 'pickup_latitude', 'pickup_longitude',
            'dropoff_latitude', 'dropoff_longitude', 'trip_duration', 'is_rush_hour',
            'is_weekend', 'is_night', 'trip_speed']

X = df[features]
y = df['fare_amount']

In [28]:
# 5. Data Splitting
# -----------------

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# 6. Model Building and Evaluation
# --------------------------------

# Define preprocessing steps
numeric_features = ['pickup_hour', 'pickup_day', 'pickup_month', 'pickup_year',
                    'passenger_count', 'trip_distance', 'pickup_latitude', 'pickup_longitude',
                    'dropoff_latitude', 'dropoff_longitude', 'trip_duration', 'trip_speed']
categorical_features = ['pickup_weekday', 'is_rush_hour', 'is_weekend', 'is_night']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ])

# Define models
rf_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

xgb_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(random_state=42))
])

In [32]:
# Function to evaluate model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    return mse, rmse, r2

# Evaluate Random Forest
rf_mse, rf_rmse, rf_r2 = evaluate_model(rf_model, X_train, X_test, y_train, y_test)
print("Random Forest Results:")
print(f'Mean Squared Error: {rf_mse}')
print(f'Root Mean Squared Error: {rf_rmse}')
print(f'R-squared Score: {rf_r2}')

Random Forest Results:
Mean Squared Error: 4.497480667402959
Root Mean Squared Error: 2.1207264480368417
R-squared Score: 0.8209341872197187


In [34]:
# Evaluate XGBoost
xgb_mse, xgb_rmse, xgb_r2 = evaluate_model(xgb_model, X_train, X_test, y_train, y_test)
print("\nXGBoost Results:")
print(f'Mean Squared Error: {xgb_mse}')
print(f'Root Mean Squared Error: {xgb_rmse}')
print(f'R-squared Score: {xgb_r2}')


XGBoost Results:
Mean Squared Error: 4.358088807383833
Root Mean Squared Error: 2.0876036039880352
R-squared Score: 0.8264840313558344


In [36]:
# 7. Hyperparameter Tuning for XGBoost
# ------------------------------------

param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [3, 5, 7],
    'regressor__learning_rate': [0.01, 0.1]
}

grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("\nBest parameters:", grid_search.best_params_)
print("Best cross-validation score:", -grid_search.best_score_)

5 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Khalid Abdelaty\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Khalid Abdelaty\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Khalid Abdelaty\AppData\Roaming\Python\Python312\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_p


Best parameters: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 5, 'regressor__n_estimators': 100}
Best cross-validation score: 3.8611509041881016


In [38]:
# Evaluate best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\nBest XGBoost Model Results:")
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared Score: {r2}')


Best XGBoost Model Results:
Mean Squared Error: 4.008365064412631
Root Mean Squared Error: 2.0020901738964283
R-squared Score: 0.8404081748741348


In [40]:
# 8. Cross-Validation
# -------------------

cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)

print("\nCross-Validation Results:")
print(f"RMSE scores: {cv_rmse}")
print(f"Mean RMSE: {cv_rmse.mean()}")
print(f"Standard deviation of RMSE: {cv_rmse.std()}")


Cross-Validation Results:
RMSE scores: [2.07157373 2.20575128 2.02140114 1.80912825 1.88261581]
Mean RMSE: 1.9980940438270545
Standard deviation of RMSE: 0.14007662823067674


In [44]:
# 10. Model Persistence
# ---------------------

joblib.dump(best_model, 'uber_fare_prediction_model_improved.joblib')
print("Improved model saved successfully.")


Improved model saved successfully.


In [46]:
# 11. Model Loading and Prediction
# --------------------------------

loaded_model = joblib.load('uber_fare_prediction_model_improved.joblib')

# Example prediction
sample_trip = pd.DataFrame({
    'pickup_hour': [14],
    'pickup_day': [15],
    'pickup_month': [3],
    'pickup_year': [2016],
    'pickup_weekday': [2],
    'passenger_count': [2],
    'trip_distance': [5.2],
    'pickup_latitude': [40.7614],
    'pickup_longitude': [-73.9798],
    'dropoff_latitude': [40.7624],
    'dropoff_longitude': [-73.9834],
    'trip_duration': [15.5],
    'is_rush_hour': [0],
    'is_weekend': [0],
    'is_night': [0],
    'trip_speed': [20.8]  # 5.2 miles / (15.5 minutes / 60) ≈ 20.8 mph
})

predicted_fare = loaded_model.predict(sample_trip)
print(f"Predicted fare for the sample trip: ${predicted_fare[0]:.2f}")


ValueError: Found unknown categories [2] in column 0 during transform