In [1]:
# Block 1: Necessary imports for data processing and modeling
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
from geopy.distance import geodesic
import numpy as np
from scipy.stats import skew, kurtosis


In [2]:
# Block 2: Data Loading and Cleaning
# Reads the raw trips, derives calendar features from the pickup timestamp,
# filters out implausible rows, drops rows with missing values and finally
# log-transforms the target. Produces `uber_data` (raw + calendar features)
# and `uber_data_clean` (filtered, with log1p-transformed fare_amount).
file_path = 'uber.csv'  # File path for your dataset
uber_data = pd.read_csv(file_path)

# Convert pickup_datetime to datetime and create time-related features
uber_data['pickup_datetime'] = pd.to_datetime(uber_data['pickup_datetime'])
uber_data['hour'] = uber_data['pickup_datetime'].dt.hour
uber_data['day_of_week'] = uber_data['pickup_datetime'].dt.dayofweek
uber_data['month'] = uber_data['pickup_datetime'].dt.month
uber_data['year'] = uber_data['pickup_datetime'].dt.year
# Vectorized weekend flag (pandas numbers days Monday=0 ... Sunday=6, so
# values >= 5 are Saturday/Sunday). Replaces a per-row apply(lambda ...);
# the explicit int64 cast keeps the 0/1 integer column the lambda produced.
uber_data['is_weekend'] = (uber_data['day_of_week'] >= 5).astype('int64')

# Step 1: Removing Unrealistic Values
# Each mask is computed on the raw frame; AND-ing them selects exactly the
# rows that applying the filters one after another would keep.
# Keep non-negative fares only (zero fares are retained).
has_valid_fare = uber_data['fare_amount'] >= 0

# Keep trips with 1 to 6 passengers (both bounds inclusive).
has_valid_passengers = uber_data['passenger_count'].between(1, 6)

# Keep trips whose pickup and dropoff both fall inside the New York City
# bounding box (Series.between is inclusive on both ends).
within_nyc = (
    uber_data['pickup_latitude'].between(40.4774, 40.9176)
    & uber_data['pickup_longitude'].between(-74.2591, -73.7004)
    & uber_data['dropoff_latitude'].between(40.4774, 40.9176)
    & uber_data['dropoff_longitude'].between(-74.2591, -73.7004)
)

# Step 2: Handling Missing Values
# Drop rows with missing values (since we only have very few missing values).
# The trailing .copy() makes uber_data_clean an independent frame so the
# column assignments below never trigger SettingWithCopyWarning.
uber_data_clean = (
    uber_data[has_valid_fare & has_valid_passengers & within_nyc]
    .dropna()
    .copy()
)

# Step 3: Log Transformation to Reduce Skewness of the target 'fare_amount'
# ('distance' does not exist yet at this point; it is created and transformed
# further down the notebook).
uber_data_clean['fare_amount'] = np.log1p(uber_data_clean['fare_amount'])  # log1p to handle zero values safely

# Step 4: Recalculate Distance Based on Cleaned Data
def calculate_distance(row):
    """Return the geodesic distance, in miles, between a trip's two endpoints.

    `row` is a mapping-like record (e.g. a DataFrame row) exposing
    'pickup_latitude', 'pickup_longitude', 'dropoff_latitude' and
    'dropoff_longitude'. Each endpoint is handed to geopy as a
    (latitude, longitude) pair.
    """
    endpoints = [
        (row[f'{leg}_latitude'], row[f'{leg}_longitude'])
        for leg in ('pickup', 'dropoff')
    ]
    return geodesic(*endpoints).miles

# Row-wise geodesic distance (miles) for every cleaned trip
trip_miles = uber_data_clean.apply(calculate_distance, axis=1)
uber_data_clean['distance'] = trip_miles

# Step 5: One-Hot Encoding for Categorical Variables
# The calendar features are expanded into indicator columns; drop_first=True
# omits the first level of each feature.
calendar_columns = ['day_of_week', 'hour', 'month', 'year']
uber_data_clean = pd.get_dummies(
    uber_data_clean,
    columns=calendar_columns,
    drop_first=True,
)


In [3]:
# Block 2.1: Descriptive Statistics After Data Cleaning
# Plain-text report on the cleaned frame: summary statistics, missing-value
# counts, value counts of object columns, correlations, skewness and kurtosis.

# Summary statistics of the numeric columns
print("Numerical Descriptive Statistics (Cleaned Data):")
numeric_summary = uber_data_clean.describe()
print(numeric_summary)

# Summary of object-dtype columns ('O' selects the object dtype)
print("\nCategorical Feature Overview (Cleaned Data):")
object_summary = uber_data_clean.describe(include=['O'])
print(object_summary)

# Number of missing entries per column
print("\nMissing Values in Each Column (Cleaned Data):")
missing_per_column = uber_data_clean.isna().sum()
print(missing_per_column)

# Frequency table for every object-dtype column
categorical_columns = uber_data_clean.select_dtypes(include=['object']).columns
for column in categorical_columns:
    level_counts = uber_data_clean[column].value_counts()
    print(f"\nUnique Values in '{column}' Column (Cleaned Data):")
    print(level_counts)

# Pairwise correlations, restricted to float64/int64 columns
print("\nCorrelation Matrix (Numerical Features Only, Cleaned Data):")
numerical_data = uber_data_clean.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numerical_data.corr()
print(correlation_matrix)

# Shape of each numeric distribution: skewness and kurtosis on non-missing values
for column in numerical_data.columns:
    observed = numerical_data[column].dropna()
    column_skewness = skew(observed)
    column_kurtosis = kurtosis(observed)
    print(
        f"\nSkewness and Kurtosis for '{column}' (Cleaned Data):",
        f"Skewness: {column_skewness}",
        f"Kurtosis: {column_kurtosis}",
        sep="\n",
    )


Numerical Descriptive Statistics (Cleaned Data):
         Unnamed: 0    fare_amount  pickup_longitude  pickup_latitude  \
count  1.948240e+05  194824.000000     194824.000000    194824.000000   
mean   2.771291e+07       2.332485        -73.975513        40.750779   
std    1.600878e+07       0.544381          0.034482         0.026933   
min    1.000000e+00       0.000000        -74.243432        40.498988   
25%    1.383094e+07       1.945910        -73.992277        40.736465   
50%    2.775954e+07       2.251292        -73.982119        40.753290   
75%    4.154486e+07       2.602690        -73.968412        40.767512   
max    5.542357e+07       6.214608        -73.702735        40.917048   

       dropoff_longitude  dropoff_latitude  passenger_count     is_weekend  \
count      194824.000000     194824.000000    194824.000000  194824.000000   
mean          -73.974592         40.751041         1.689853       0.283697   
std             0.034083          0.030687         1.305750

In [4]:
# Block 2.2: Additional Data Cleaning
# NOTE: run this cell once per fresh kernel. Re-running it would fail on the
# drop (the columns are already gone) and would log-transform 'distance' twice.

# Remove the 'key' and 'Unnamed: 0' columns, which are not used as model features
non_feature_columns = ['key', 'Unnamed: 0']
uber_data_clean = uber_data_clean.drop(columns=non_feature_columns)

# Log-transform the skewed 'distance' feature (log1p is safe for zero distances)
log_distance = np.log1p(uber_data_clean['distance'])
uber_data_clean['distance'] = log_distance

# Report the skewness of 'distance' after the transformation
print("\nUpdated Skewness for 'distance':")
distance_skewness = skew(uber_data_clean['distance'].dropna())
print(distance_skewness)



Updated Skewness for 'distance':
0.9662331991613881


In [5]:
# Block 3: Train-Test Split
# Features are every column except the target and the raw pickup timestamp;
# the target is the (already log1p-transformed) fare_amount column.
target_column = 'fare_amount'
excluded_columns = [target_column, 'pickup_datetime']
X = uber_data_clean.drop(columns=excluded_columns)
y = uber_data_clean[target_column]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Block 4: Hyperparameter Tuning with RandomizedSearchCV
# Candidate values the randomized search samples from
param_dist = dict(
    n_estimators=[200, 300, 400, 500],  # number of boosting rounds
    max_depth=[4, 6, 8, 10],            # maximum tree depth
    learning_rate=[0.01, 0.05, 0.1],    # shrinkage applied per round
    subsample=[0.6, 0.8, 1.0],          # fraction of rows sampled per round
    reg_alpha=[0, 0.01, 0.1],           # L1 regularization strength
    reg_lambda=[1, 5, 10],              # L2 regularization strength
)

# Base estimator: squared-error XGBoost regressor with a fixed seed
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# 20 sampled configurations, each scored by 5-fold CV on negative MSE,
# using all available cores
random_search = RandomizedSearchCV(
    estimator=xgb_regressor,
    param_distributions=param_dist,
    n_iter=20,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=5,
    verbose=2,
    random_state=42,
)

# Run the search on the training split only
random_search.fit(X_train, y_train)

# Keep the winning configuration for the later training cell
best_params = random_search.best_params_
print(f"Best Hyperparameters: {best_params}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
# Block 5: Training the Best Model with Early Stopping using DMatrix
# Early stopping has to be monitored on data that is NOT the final test set.
# If the test set decides when boosting stops, the number of rounds is tuned
# on test data and the test metrics reported afterwards are optimistically
# biased. A validation slice is therefore held out of the training data.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Convert the data into DMatrix format
dtrain = xgb.DMatrix(X_fit, label=y_fit)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)  # reserved for the final evaluation only

# Define the parameters using the best params from RandomizedSearch
params = {
    'objective': 'reg:squarederror',
    'learning_rate': best_params['learning_rate'],
    'max_depth': best_params['max_depth'],
    'subsample': best_params['subsample'],
    'reg_alpha': best_params['reg_alpha'],
    'reg_lambda': best_params['reg_lambda'],
    'seed': 42,  # same seed as the sklearn estimator used during the search
}

# Training with early stopping.
# xgb.train monitors the LAST entry of `evals` for early stopping, so the
# validation set must come last; the training set is listed for logging only.
evals = [(dtrain, 'train'), (dval, 'eval')]  # Evaluation sets
bst_model = xgb.train(
    params,
    dtrain,
    num_boost_round=best_params['n_estimators'],  # upper bound on boosting rounds
    evals=evals,
    early_stopping_rounds=15,  # Stop after 15 rounds without improvement
    verbose_eval=True
)


In [None]:
# Block 6: Cross-Validation Evaluation
# Score the tuned estimator with 5-fold CV over the full feature matrix.
# The scorer returns negative MSE per fold, so negate before taking the root.
cv_scores = cross_val_score(
    random_search.best_estimator_,
    X,
    y,
    cv=5,
    scoring='neg_mean_squared_error',
)
cv_rmse_scores = np.sqrt(np.negative(cv_scores))
average_cv_rmse = np.mean(cv_rmse_scores)
print(f"Cross-Validation RMSE Scores: {cv_rmse_scores}")
print(f"Average Cross-Validation RMSE: {average_cv_rmse}")


In [None]:
# Block 7: Model Evaluation on the Test Set
# Make predictions on the test set
y_pred = bst_model.predict(dtest)

# Evaluate performance.
# NOTE: y_test holds log1p(fare_amount), so these three metrics are on the
# log scale, not in fare units.
mae = mean_absolute_error(y_test, y_pred)
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)  # R-squared value

# Output the performance
print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared (R²): {r2}")

# Same errors after undoing the log1p transform, i.e. on the original fare scale
fare_true = np.expm1(y_test)
fare_pred = np.expm1(y_pred)
mae_fare = mean_absolute_error(fare_true, fare_pred)
rmse_fare = np.sqrt(mean_squared_error(fare_true, fare_pred))
print(f"Mean Absolute Error (original fare scale): {mae_fare}")
print(f"Root Mean Squared Error (original fare scale): {rmse_fare}")

# Final Performance
print(f"Final Root Mean Squared Error: {rmse}")
