# Create Best Model Using Uber Dataset

#### Import Dependencies

In [142]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import IsolationForest, RandomForestRegressor,GradientBoostingRegressor, HistGradientBoostingRegressor, AdaBoostRegressor

# model
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
#model_selection
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
# metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import optuna
from tqdm import tqdm

#### Data Loading, Cleaning, Preprocessing

In [143]:
df = pd.read_csv("../data/uber.csv", parse_dates=['pickup_datetime'])
df.drop(['Unnamed: 0','key'], axis=1, inplace=True)
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1
1,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1
2,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1
3,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3
4,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5


In [144]:
print(f'Inference: The Dataset consists of {df.shape[1]} features & {df.shape[0]} samples.')

Inference: The Dataset consists of 7 features & 200000 samples.


In [145]:
nullValues = pd.DataFrame(df.isnull().sum().sort_values(), columns=['Total Null Values'])
nullValues['Percentage'] = round(nullValues['Total Null Values']/df.shape[0],3)*100
print(nullValues)

                   Total Null Values  Percentage
fare_amount                        0         0.0
pickup_datetime                    0         0.0
pickup_longitude                   0         0.0
pickup_latitude                    0         0.0
passenger_count                    0         0.0
dropoff_longitude                  1         0.0
dropoff_latitude                   1         0.0


In [146]:
df.dropna(inplace=True)

In [147]:
df.isnull().sum()# after clean dataset

fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [148]:
df.duplicated().sum()#checking duplicate values

0

In [149]:
df_delete = df[(df['pickup_latitude']>180) | (df['pickup_latitude']<-180)|
                    (df['dropoff_latitude']>180) | (df['dropoff_latitude']< -180)|
                    (df['pickup_longitude']>90) | (df['pickup_longitude']<-90) |
                    (df['dropoff_longitude']>90) | (df['dropoff_longitude']<-90)]
df_delete.shape

(13, 7)

In [150]:
index_to_delete = df_delete.index
index_to_delete

Index([  4949,  32549,  40908,  48506,  56617,  61793,  75851,  91422, 103745,
       139447, 144253, 161652, 199936],
      dtype='int64')

In [151]:
before = df.shape
df.drop(index_to_delete,inplace=True)
print(f'before :{before[0]},after :{df.shape[0]}')

before :199999,after :199986


In [152]:
df['year'] = df.pickup_datetime.dt.year
df['month'] = df.pickup_datetime.dt.month
df['weekday'] = df.pickup_datetime.dt.weekday
df['hour'] = df.pickup_datetime.dt.hour

In [153]:
df['Monthly_Quarter'] = df.month.map({1:'Q1',2:'Q1',3:'Q1',4:'Q2',5:'Q2',6:'Q2',7:'Q3',
                                      8:'Q3',9:'Q3',10:'Q4',11:'Q4',12:'Q4'})

df['Hourly_Segments'] = df.hour.map({0:'H1',1:'H1',2:'H1',3:'H1',4:'H2',5:'H2',6:'H2',7:'H2',8:'H3',
                                     9:'H3',10:'H3',11:'H3',12:'H4',13:'H4',14:'H4',15:'H4',16:'H5',
                                     17:'H5',18:'H5',19:'H5',20:'H6',21:'H6',22:'H6',23:'H6'})

In [154]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,weekday,hour,Monthly_Quarter,Hourly_Segments
0,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,2015,5,3,19,Q2,H5
1,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,2009,7,4,20,Q3,H6
2,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,2009,8,0,21,Q3,H6
3,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,2009,6,4,8,Q2,H3
4,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,2014,8,3,17,Q3,H5


In [155]:
df.drop(['pickup_datetime','month', 'hour',], axis=1, inplace=True)

##### Calculate Distance

In [156]:
from math import radians, sin, cos, sqrt, asin

def distance_transform(longitude1, latitude1, longitude2, latitude2):
    travel_dist = []
    
    for pos in range(len(longitude1)):
        long1,lati1,long2,lati2 = map(radians,[longitude1[pos],latitude1[pos],longitude2[pos],latitude2[pos]])
        dist_long = long2 - long1
        dist_lati = lati2 - lati1
        a = sin(dist_lati/2)**2 + cos(lati1) * cos(lati2) * sin(dist_long/2)**2
        c = 2 * asin(sqrt(a))*6371
        travel_dist.append(c)
       
    return travel_dist

In [157]:
df['distance_traveled']=distance_transform(df['pickup_longitude'].to_numpy(),
                                  df['pickup_latitude'].to_numpy(),
                                  df['dropoff_longitude'].to_numpy(),
                                  df['dropoff_latitude'].to_numpy())
## This Distance is in kilometers
df['distance_traveled'] = df['distance_traveled'].round(3)

In [158]:
df[df['fare_amount']<= 0].shape

(22, 11)

In [159]:
df_fare_delete = df[df['fare_amount']<=0]
df_fare_delete.shape

(22, 11)

In [160]:
old_shape = df.shape
index_to_delete_fare = df_fare_delete.index
df.drop(index_to_delete_fare,inplace=True)
print(f"Number of items removed : {old_shape[0] - df.shape[0]}")

Number of items removed : 22


In [161]:
df['passenger_count'].value_counts()

passenger_count
1      138404
2       29423
5       14004
3        8878
4        4275
6        4271
0         708
208         1
Name: count, dtype: int64

In [162]:
old_shape = df.shape
index_to_delete_passenger = df[df['passenger_count']==df['passenger_count'].max()].index
df.drop(index_to_delete_passenger,inplace=True)
print(f"After drop {old_shape[0] - df.shape[0]}")

After drop 1


In [163]:
# now the large passenger count is removed 208 column
df['passenger_count'].value_counts()

passenger_count
1    138404
2     29423
5     14004
3      8878
4      4275
6      4271
0       708
Name: count, dtype: int64

In [164]:
# here we will replace 0 with 1 count
df['passenger_count']=np.where(df['passenger_count']==0,1,df['passenger_count'])

In [165]:
df['passenger_count'].value_counts()

passenger_count
1    139112
2     29423
5     14004
3      8878
4      4275
6      4271
Name: count, dtype: int64

In [166]:
# removing zero distance travel
index_to_delete_distance = df[df['distance_traveled'] == 0].index
df.drop(index_to_delete_distance,inplace=True)
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,weekday,distance_traveled
count,194247.0,194247.0,194247.0,194247.0,194247.0,194247.0,194247.0,194247.0,194247.0
mean,11.349822,-73.82648,40.646902,-73.837556,40.651579,1.687413,2011.74742,3.048577,21.06044
std,9.722955,3.660507,2.93164,3.536295,2.900095,1.303961,1.859941,1.946791,384.091421
min,0.01,-89.933333,-74.015515,-75.458979,-74.01575,1.0,2009.0,0.0,0.001
25%,6.0,-73.992269,40.736348,-73.991589,40.73522,1.0,2010.0,1.0,1.283
50%,8.5,-73.982116,40.753253,-73.980539,40.753708,1.0,2012.0,3.0,2.185
75%,12.5,-73.9684,40.767508,-73.965423,40.768314,2.0,2013.0,5.0,3.961
max,499.0,40.808425,48.01876,40.831932,45.031598,6.0,2015.0,6.0,8782.899


In [167]:
len(df[df.distance_traveled < 0.1])

1081

In [168]:
# removing 100m distance traveled
before = df.shape
df.drop(df[df.distance_traveled < 0.1].index, inplace=True)
print(f"before :{before[0]},after :{df.shape[0]}")

before :194247,after :193166


In [169]:
df_delete_lat_long = df[(df['pickup_longitude']==0) | (df['pickup_latitude']==0) | (df['dropoff_longitude']==0) | (df['dropoff_latitude']==0) ]
df_delete_lat_long

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,weekday,Monthly_Quarter,Hourly_Segments,distance_traveled
346,15.50,0.000000,0.000000,-73.979805,40.786030,1,2015,3,Q1,H5,8666.398
1067,52.00,-73.781095,40.645015,0.000000,0.000000,1,2014,6,Q1,H6,8647.036
1526,2.50,-74.001849,40.715156,0.000000,0.000000,3,2014,0,Q2,H4,8666.772
2547,10.10,0.000000,0.000000,-73.953210,40.803528,2,2011,1,Q3,H6,8664.557
3045,15.00,0.000000,0.000000,-73.843777,40.739255,1,2013,1,Q1,H1,8654.177
...,...,...,...,...,...,...,...,...,...,...,...
196967,57.33,0.000000,0.000000,-73.789045,40.655135,2,2014,2,Q3,H3,8647.904
197468,6.90,0.000000,0.000000,-73.980827,40.747133,5,2011,2,Q1,H5,8665.686
197863,7.00,-73.962190,40.759158,0.000000,0.000000,1,2014,1,Q4,H6,8664.389
198567,23.50,-73.968115,40.801455,0.000000,0.000000,2,2013,0,Q4,H1,8665.747


In [170]:
old_shape = df.shape
index_to_delete_lat_long = df[(df['pickup_longitude']==0) | (df['pickup_latitude']==0) | (df['dropoff_longitude']==0) | (df['dropoff_latitude']==0) ].index
df.drop(index_to_delete_lat_long,inplace=True)
print(f"After drop {old_shape[0] - df.shape[0]}")

After drop 379


#### Encoding Categorical Features

In [171]:
Hour_encoder = LabelEncoder()
Month_encoder = LabelEncoder()

df.Monthly_Quarter = Month_encoder.fit_transform(df.Monthly_Quarter)
df.Hourly_Segments = Hour_encoder.fit_transform(df.Hourly_Segments)

Hour_Segments_decoded = Hour_encoder.inverse_transform(df.Hourly_Segments)
print("Hour Segments Decoded Categories:", Hour_Segments_decoded)

Month_Segments_decoded = Month_encoder.inverse_transform(df.Monthly_Quarter)
print("Decoded Categories:", Month_Segments_decoded)

Hour Segments Decoded Categories: ['H5' 'H6' 'H6' ... 'H1' 'H4' 'H2']
Decoded Categories: ['Q2' 'Q3' 'Q3' ... 'Q2' 'Q2' 'Q2']


In [172]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 192787 entries, 0 to 199999
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   fare_amount        192787 non-null  float64
 1   pickup_longitude   192787 non-null  float64
 2   pickup_latitude    192787 non-null  float64
 3   dropoff_longitude  192787 non-null  float64
 4   dropoff_latitude   192787 non-null  float64
 5   passenger_count    192787 non-null  int64  
 6   year               192787 non-null  int32  
 7   weekday            192787 non-null  int32  
 8   Monthly_Quarter    192787 non-null  int64  
 9   Hourly_Segments    192787 non-null  int64  
 10  distance_traveled  192787 non-null  float64
dtypes: float64(6), int32(2), int64(3)
memory usage: 16.2 MB


In [173]:
counter = 0
rows,columns = df.shape
df.drop_duplicates(inplace=True)
df.drop(['pickup_latitude','pickup_longitude',
         'dropoff_latitude','dropoff_longitude'],axis=1)
if df.shape==(rows,columns):
    print('Inference: The dataset doesn\'t have any duplicates')
else:
    print(f'Inference: Number of duplicates dropped fixed ---> {rows-df.shape[0]}')

Inference: The dataset doesn't have any duplicates


#### Feature Selection

In [174]:
features =  ['passenger_count', 'year',
             'weekday', 'Monthly_Quarter', 
             'Hourly_Segments', 'distance_traveled']

target = 'fare_amount'

In [175]:
# Splitting Data into X and Y
X, y = df[features], df[target]

In [176]:
selector = SelectKBest(score_func=f_regression, k=6)
X_new = selector.fit_transform(X, y)

selected_feature_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_feature_indices]

print("Selected features:\n")
print('\n'.join(map(str, selected_features)))

Selected features:

passenger_count
year
weekday
Monthly_Quarter
Hourly_Segments
distance_traveled


### Data Splitting into Train and test set, Data Scaling and Model Training

In [177]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,shuffle=False)

In [178]:
# manual splitting up to 2015 data
train, test = df[df.year < 2015], df[df.year == 2015]

X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

In [179]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (179407, 6)
y_train shape: (179407,)
X_test shape: (13380, 6)
y_test shape: (13380,)


In [180]:
# transforming the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((179407, 6), (179407,), (13380, 6), (13380,))

In [181]:
models_collection = {
    'Random_Forest_1': RandomForestRegressor(),
    'Decision_tree_1': DecisionTreeRegressor(),
    'Linear_Regression_1': LinearRegression(),
    'XGBoost_1': XGBRegressor(),
    'Lasso_1': Lasso(),
    'Gradient_Boost_1' : GradientBoostingRegressor(),
    'High_Gradient_Boost_1' : HistGradientBoostingRegressor(),
}

In [182]:
def eval_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mae, mse, r2

In [183]:
train_test_results = {}

for model_name, model in tqdm(models_collection.items(), desc='Training Models'):
    model.fit(X_train, y_train.ravel())

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    trained_data_mae, trained_data_mse, trained_data_r2 = eval_model(y_train, y_train_pred)
    test_data_mae, test_data_mse, test_data_r2 = eval_model(y_test, y_test_pred)

    train_test_results[model_name] = {
        'Train': {
            'Train_MAE': trained_data_mae,
            'Train_R2': trained_data_r2,
            'Train_mse': trained_data_mse
        },
        'Test': {
            'Test_MAE': test_data_mae,
            'Test_R2': test_data_r2,
            'Test_mse': test_data_mse
        }
    }

Training Models: 100%|██████████| 7/7 [01:23<00:00, 11.97s/it]


In [184]:
print('-'*65)
print('{:<25s} {:<8s} {:<8s} {:<8s} {:<8s}'.format('Model', 'Train_MSE', 'Test_MSE', 'Train_R2', 'Test_R2'))
print('-'*65)
for model_name, model_results in train_test_results.items():
    print('{:<25s} {:<9.3f} {:<9.3f} {:<9.3f} {:<9.3f}'.format(
        model_name,
        model_results['Train']['Train_mse'],
        model_results['Test']['Test_mse'],
        model_results['Train']['Train_R2'],
        model_results['Test']['Test_R2'],
    ))
print('-'*65)

-----------------------------------------------------------------
Model                     Train_MSE Test_MSE Train_R2 Test_R2 
-----------------------------------------------------------------
Random_Forest_1           2.353     17.872    0.973     0.855    
Decision_tree_1           0.062     42.260    0.999     0.656    
Linear_Regression_1       86.265    122.500   0.018     0.003    
XGBoost_1                 11.418    15.400    0.870     0.875    
Lasso_1                   87.542    124.973   0.004     -0.017   
Gradient_Boost_1          14.090    14.740    0.840     0.880    
High_Gradient_Boost_1     15.251    15.734    0.826     0.872    
-----------------------------------------------------------------


### 2. K-Fold Cross Validation, Data Scaling and Model Training

In [185]:
scalar_1 = StandardScaler()
# taking all features to x fold and y fold as target data
X_fold = scalar_1.fit_transform(df[features])
y_fold = df.fare_amount

In [186]:
Kfold_models = {
    'Random_Forest_2': RandomForestRegressor(),
    'Decision_tree_2': DecisionTreeRegressor(),
    'Linear_Regression_2': LinearRegression(),
    'XGBoost_2': XGBRegressor(),
    'Lasso_2': Lasso(),
    'Gradient_Boost_2' : GradientBoostingRegressor(),
    'High_Gradient_Boost_2' : HistGradientBoostingRegressor(),
}

In [187]:
kfold_results = {}

folds = 5
k_folds = KFold(n_splits=folds)

for model_name, model in tqdm(Kfold_models.items(), desc='Training Models'):
    scores = cross_val_score(model, X_fold, y_fold.ravel(), scoring='r2')
    
    kfold_results[model_name] = {
        'Fold_Scores': scores,
        'Average_Score': np.mean(scores),
}

Training Models: 100%|██████████| 7/7 [05:32<00:00, 47.53s/it] 


In [188]:
print('-'*70)
print('{:<25s} {:<35s} {:<12s}'.format('Model', 'R2 Scores', 'Average R2'))
print('-'*70)
for model_name, model_results in kfold_results.items():
    scores_str = ', '.join([f'{score:.3f}' for score in model_results['Fold_Scores']])
    print('{:<25s} {:<35s} {:<12.3f}'.format(model_name, scores_str, model_results['Average_Score']))
print('-'*70)


----------------------------------------------------------------------
Model                     R2 Scores                           Average R2  
----------------------------------------------------------------------
Random_Forest_2           0.810, 0.827, 0.824, 0.805, 0.819   0.817       
Decision_tree_2           0.639, 0.654, 0.639, 0.649, 0.629   0.642       
Linear_Regression_2       0.018, 0.018, 0.019, 0.018, 0.021   0.019       
XGBoost_2                 0.826, 0.846, 0.842, 0.821, 0.841   0.835       
Lasso_2                   0.004, 0.004, 0.004, 0.005, 0.004   0.004       
Gradient_Boost_2          0.825, 0.848, 0.844, 0.825, 0.841   0.837       
High_Gradient_Boost_2     0.809, 0.835, 0.826, 0.819, 0.829   0.823       
----------------------------------------------------------------------


## Tuning Hyperparameter
Using Optuna for tuning
- tuning Hyper parameters of Xgboost cause <b>XGBOOST IS ALL YOU NEED</b>

In [189]:
import os
import pickle
import logging
def save_object(obj):
    """
        saving the object in a specific path
    """
    try:
        file_path = os.path.join("artifacts", 'model.pkl')
        dir_path = os.path.dirname(file_path)
        os.makedirs(dir_path, exist_ok=True)
        with open(file_path, "wb") as file_obj:
            pickle.dump(obj, file_obj)
        logging.info(f"Object Saved at : {file_path}")
    except Exception as e:
        logging.error(e)

In [190]:
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 5, 500),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 0.5, 30),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
    }
    model = XGBRegressor(**params)
    model.fit(X_train, y_train)
    return model.score(X_test, y_test.ravel())

In [191]:
progress_bar = tqdm(total=200, desc='Optimizing Hyperparameter', dynamic_ncols=True)

def callback(study, trial):
    progress_bar.n = len(study.trials)
    progress_bar.update(1)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200, callbacks=[callback])

progress_bar.close()

best_params = study.best_params
best_score = study.best_value

model = XGBRegressor(**best_params)

save_object(obj=model)

print("Best Hyperparameter for XGBRegressor:")
print(best_params)
print("Best R2 Score: {:.3f}".format(best_score))

Optimizing Hyperparameter:  32%|███▏      | 64/200 [54:20<1:55:28, 50.95s/it]
[I 2023-08-19 22:25:22,080] A new study created in memory with name: no-name-50e80982-d85f-42da-9914-35382d0f0384
[I 2023-08-19 22:25:30,826] Trial 0 finished with value: 0.8822786004618786 and parameters: {'max_depth': 73, 'subsample': 0.6212236894732276, 'colsample_bytree': 0.7229419886245455, 'colsample_bylevel': 0.4150445217909544, 'min_child_weight': 23, 'reg_lambda': 0.11212666725968802, 'reg_alpha': 0.10056299129821245, 'n_estimators': 206, 'learning_rate': 0.09944578176656922}. Best is trial 0 with value: 0.8822786004618786.
Optimizing Hyperparameter:   1%|          | 2/200 [00:08<14:25,  4.37s/it][I 2023-08-19 22:29:47,215] Trial 1 finished with value: 0.813766930676756 and parameters: {'max_depth': 447, 'subsample': 0.8230052928729708, 'colsample_bytree': 0.9750900777841958, 'colsample_bylevel': 0.8502591373755145, 'min_child_weight': 11, 'reg_lambda': 0.0514006698247006, 'reg_alpha': 0.471081185999

KeyboardInterrupt: 