In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression
import time
import datetime

In [2]:
tripdata = pd.read_csv("../../dataset/final_bike_sharing.csv")

In [3]:
tripdata.head(3)

Unnamed: 0,index,tripduration,start_station_id,start_station_name,start_lat,start_lon,end_station_id,end_station_name,end_lat,end_lon,...,conditions,date,dist,birthyear,years_old,holiday,day,month,seasons,gender
0,259617,1.02,others,Manila & 1st,40.721651,-74.042884,others,Manila & 1st,40.721651,-74.042884,...,Clear,2018-09-05,1.783012e-12,1994,24,working_day,weekday,September,autumn,male
1,283363,1.02,others,Heights Elevator,40.748716,-74.040443,others,Heights Elevator,40.748716,-74.040443,...,cloudy_rain,2018-10-06,1.782506e-12,1994,24,working_day,weekend,October,autumn,male
2,169168,1.02,others,Van Vorst Park,40.718489,-74.047727,es_3185,City Hall,40.717732,-74.043845,...,Clear,2018-07-09,0.3384331,1963,55,working_day,weekday,July,summer,female


In [4]:
tripdata.columns

Index(['index', 'tripduration', 'start_station_id', 'start_station_name',
       'start_lat', 'start_lon', 'end_station_id', 'end_station_name',
       'end_lat', 'end_lon', 'bikeid', 'usertype', 'hour', 'min', 'temp',
       'feelslike', 'dew', 'humidity', 'precip', 'precipprob', 'snow',
       'snowdepth', 'windspeed', 'winddir', 'sealevelpressure', 'cloudcover',
       'visibility', 'solarradiation', 'uvindex', 'conditions', 'date', 'dist',
       'birthyear', 'years_old', 'holiday', 'day', 'month', 'seasons',
       'gender'],
      dtype='object')

## Selecting variables according to P values

In [5]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [6]:
df = tripdata[selected_features]

In [7]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

### AMputation with 10% NA values

In [8]:
## get the sample size such as 5%, 10%,20% etc
def get_sample_size(df, perc=1):
    return round(perc/100 * df.shape[0])

def get_an_index(index):
    li = []
    for i in index:
        li.append(i)
    return li   

def ampute_nan_values(X_train,y_train,col_name, perc):
    random.seed(100)
    index = get_an_index(X_train.index)
    sample_size = get_sample_size(X_train, perc)
    print("sample size is =", sample_size)
    selected_index = random.sample(index, sample_size)
    for i in selected_index:
        X_train.loc[i, col_name]= np.NaN
    return X_train, y_train

###====================####
def convert_to_dataframe(y_train):
    return pd.DataFrame(y_train)

def convert_to_series(y_train):
    return y_train.squeeze()

def ampute_nan_values_drop(X_train,y_train,col_name, perc):
    random.seed(100)
    index = get_an_index(X_train.index)
    sample_size = get_sample_size(X_train, perc)
    print("sample size is =", sample_size)
    selected_index = random.sample(index, sample_size)
    y_train = convert_to_dataframe(y_train)
    for i in selected_index:
        y_train.loc[i] = np.NaN
        X_train.loc[i,col_name]= np.NaN
    y_train = convert_to_series(y_train)
    return X_train, y_train

In [9]:
def sample_imputation_X(var,X_train,X_test):
    # extract a random sample
    random_sample_train = X_train[var].dropna().sample(X_train[var].isnull().sum(), random_state=0)
    random_sample_test = X_test[var].dropna().sample(X_test[var].isnull().sum(), random_state=0)
     # re-index the randomly extracted sample
    random_sample_train.index = X_train[X_train[var].isnull()].index
    random_sample_test.index = X_test[X_test[var].isnull()].index

    # replace the NA
    X_train.loc[X_train[var].isnull(), var] = random_sample_train
    X_test.loc[X_test[var].isnull(), var] = random_sample_test
    return X_train, X_test

In [10]:
def ampute_each_variables(variable, method,X_train,y_train,X_test,y_test):
    if method == "remove":
        X_train , y_train = ampute_nan_values_drop(X_train,y_train,variable,perc=40)
        print('Train set after Amputation', X_train.shape, y_train.shape)
        print("y_train Amputed value count of varaible ", y_train.isnull().sum())
        print("X_train Amputed value count of varaible ", X_train.isnull().sum()[variable])
        X_test , y_test = ampute_nan_values_drop(X_test,y_test,variable,perc=40)
        print('Test set after Amputation', X_test.shape, y_test.shape)
        print("y_test Amputed value count of varaible ", y_test.isnull().sum())
        print("X_test Amputed value count of varaible ", X_test.isnull().sum()[variable])
    else:
        X_train , y_train = ampute_nan_values(X_train,y_train,variable,perc=40)
        print('Train set after Amputation', X_train.shape, y_train.shape)
        print("X_train Amputed value count of varaible ", X_train.isnull().sum()[variable])
        X_test , y_test = ampute_nan_values(X_test,y_test,variable,perc=40)
        print('Test set after Amputation', X_test.shape, y_test.shape)
        print("X_test Amputed value count of varaible ", X_test.isnull().sum()[variable])
    start = datetime.datetime.now()
    if method == "remove":
        print("Simulate Imputation for NA")
        X_train.dropna(axis=0, inplace=True)
        y_train.dropna(inplace=True)
        print('Train after NA value removal Imputation', X_train.shape, y_train.shape)
        X_test.dropna(axis=0, inplace=True)
        y_test.dropna(inplace=True)
        print('Test after NA value removal Imputation', X_test.shape, y_test.shape)
    if method == "mode":
        print("Simulate Imputation for Mode")
        X_train.fillna(X_train[variable].mode()[0], inplace=True)
        print('Train after Mode Imputation', X_train.shape, y_train.shape)
        X_test.fillna(X_test[variable].mode()[0], inplace=True)
        print('Test after Mode Imputation', X_test.shape, y_test.shape)
    if method == "mean":
        print("Simulate Imputation for Mean")
        X_train.fillna(X_train[variable].mean(), inplace=True)
        print('Train after Mean Imputation', X_train.shape, y_train.shape)
        X_test.fillna(X_test[variable].mean(), inplace=True)
        print('Test after Mean Imputation', X_test.shape, y_test.shape)
    if method == "sample":
        print("Simple Imputation using sampling")
        X_train, X_test = sample_imputation_X(variable, X_train, X_test)
    end = datetime.datetime.now()
    comp_time = (end-start)
    print(f"Time taken simulate for variable {var} = ", comp_time)
    list_comp_time.append(comp_time)     
            
    return X_train , y_train, X_test , y_test

In [11]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [12]:
def model_evaluation(X_train,y_train):
    print(datetime.datetime.now())
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    print(datetime.datetime.now())
    return model_lr

In [13]:
features_for_amputation = ['start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

## 1. Drop NA values 

In [14]:
model_metrics ={}
list_comp_time = []
for var in features_for_amputation:
    print("Now running model is ....", var)
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_train , y_train, X_test , y_test = ampute_each_variables(var,"remove",X_train,y_train,X_test,y_test)
    X_train, X_test = one_hot_encoding(X_train,X_test)
    model_reg = model_evaluation(X_train,y_train)
    y_pred = model_reg.predict(X_test)
    model_metrics[var] = {"MAE":mean_absolute_error(y_test, y_pred),"MSE":mean_squared_error(y_test, y_pred),
                          "RMSE":np.sqrt(mean_squared_error(y_test, y_pred)),"R2":r2_score(y_test, y_pred)}
    del X_train
    del X_test
    del y_train
    del y_test

Now running model is .... start_lat
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 100938
Train set after Amputation (252346, 21) (252346,)
y_train Amputed value count of varaible  100938
X_train Amputed value count of varaible  100938
sample size is = 25235
Test set after Amputation (63087, 21) (63087,)
y_test Amputed value count of varaible  25235
X_test Amputed value count of varaible  25235
Simulate Imputation for NA
Train after NA value removal Imputation (151408, 21) (151408,)
Test after NA value removal Imputation (37852, 21) (37852,)
Time taken simulate for variable start_lat =  0:00:00.455984
2023-03-25 13:36:45.582402
2023-03-25 13:36:45.828902
Now running model is .... start_lon
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 100938
Train set after Amputation (252346, 21) (252346,)
y_train Amputed value count of varaible  100938
X_train Amputed value count of varaible  100938
sample size is = 25235
Test set after Amputation (6

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 100938
Train set after Amputation (252346, 21) (252346,)
y_train Amputed value count of varaible  100938
X_train Amputed value count of varaible  100938
sample size is = 25235
Test set after Amputation (63087, 21) (63087,)
y_test Amputed value count of varaible  25235
X_test Amputed value count of varaible  25235
Simulate Imputation for NA
Train after NA value removal Imputation (151408, 21) (151408,)
Test after NA value removal Imputation (37852, 21) (37852,)
Time taken simulate for variable visibility =  0:00:00.369323
2023-03-25 13:41:20.034038
2023-03-25 13:41:20.230259
Now running model is .... solarradiation
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 100938
Train set after Amputation (252346, 21) (252346,)
y_train Amputed value count of varaible  100938
X_train Amputed value count of varaible  100938
sample size is = 25235
Test set after Amputation (63087, 21) (63087,)
y_test Ampu

In [15]:
print("averge time to run drop is : ", 0.4170449, "seconds")
list_comp_time

averge time to run drop is :  0.35481595 seconds


[datetime.timedelta(microseconds=455984),
 datetime.timedelta(microseconds=423939),
 datetime.timedelta(microseconds=408448),
 datetime.timedelta(microseconds=434158),
 datetime.timedelta(microseconds=371343),
 datetime.timedelta(microseconds=536144),
 datetime.timedelta(microseconds=442549),
 datetime.timedelta(microseconds=460473),
 datetime.timedelta(microseconds=428222),
 datetime.timedelta(microseconds=398200),
 datetime.timedelta(microseconds=460144),
 datetime.timedelta(microseconds=510421),
 datetime.timedelta(microseconds=369323),
 datetime.timedelta(microseconds=442944),
 datetime.timedelta(microseconds=425152),
 datetime.timedelta(seconds=2, microseconds=669687),
 datetime.timedelta(microseconds=536664),
 datetime.timedelta(microseconds=456456),
 datetime.timedelta(microseconds=399269),
 datetime.timedelta(microseconds=446307),
 datetime.timedelta(microseconds=351801)]

In [16]:
model_metrics

{'start_lat': {'MAE': 1.5718435430420257,
  'MSE': 4.989252368774693,
  'RMSE': 2.23366344124953,
  'R2': 0.5285979071652493},
 'start_lon': {'MAE': 1.5718435430420257,
  'MSE': 4.989252368774693,
  'RMSE': 2.23366344124953,
  'R2': 0.5285979071652493},
 'end_lat': {'MAE': 1.5718435430420257,
  'MSE': 4.989252368774693,
  'RMSE': 2.23366344124953,
  'R2': 0.5285979071652493},
 'end_lon': {'MAE': 1.5718435430420257,
  'MSE': 4.989252368774693,
  'RMSE': 2.23366344124953,
  'R2': 0.5285979071652493},
 'usertype': {'MAE': 1.5718435430420257,
  'MSE': 4.989252368774693,
  'RMSE': 2.23366344124953,
  'R2': 0.5285979071652493},
 'hour': {'MAE': 1.5718435430420257,
  'MSE': 4.989252368774693,
  'RMSE': 2.23366344124953,
  'R2': 0.5285979071652493},
 'temp': {'MAE': 1.5718435430420257,
  'MSE': 4.989252368774693,
  'RMSE': 2.23366344124953,
  'R2': 0.5285979071652493},
 'feelslike': {'MAE': 1.5718435430420257,
  'MSE': 4.989252368774693,
  'RMSE': 2.23366344124953,
  'R2': 0.5285979071652493},

### Mode Replacement

In [17]:
model_metrics_mode ={}
list_comp_time = []
for var in features_for_amputation:
    print("Now model is running for varaible ====", var)
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_train , y_train, X_test , y_test = ampute_each_variables(var,"mode",X_train,y_train,X_test,y_test)
    X_train, X_test = one_hot_encoding(X_train,X_test)
    model_reg = model_evaluation(X_train,y_train)
    y_pred = model_reg.predict(X_test)
    model_metrics_mode[var] = {"MAE":mean_absolute_error(y_test, y_pred),"MSE":mean_squared_error(y_test, y_pred),
                          "RMSE":np.sqrt(mean_squared_error(y_test, y_pred)),"R2":r2_score(y_test, y_pred)}
    del X_train
    del X_test
    del y_train
    del y_test

Now model is running for varaible ==== start_lat
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 100938
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  100938
sample size is = 25235
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  25235
Simulate Imputation for Mode
Train after Mode Imputation (252346, 21) (252346,)
Test after Mode Imputation (63087, 21) (63087,)
Time taken simulate for variable start_lat =  0:00:00.321229
2023-03-25 13:44:47.983766
2023-03-25 13:44:48.333793
Now model is running for varaible ==== start_lon
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 100938
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  100938
sample size is = 25235
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  25235
Simulate Imputation for Mode
Train after Mode Imputation (252346, 21) (25

sample size is = 100938
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  100938
sample size is = 25235
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  25235
Simulate Imputation for Mode
Train after Mode Imputation (252346, 21) (252346,)
Test after Mode Imputation (63087, 21) (63087,)
Time taken simulate for variable conditions =  0:00:00.493299
2023-03-25 13:48:30.054585
2023-03-25 13:48:30.388380
Now model is running for varaible ==== dist
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 100938
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  100938
sample size is = 25235
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  25235
Simulate Imputation for Mode
Train after Mode Imputation (252346, 21) (252346,)
Test after Mode Imputation (63087, 21) (63087,)
Time taken simulate for variable dist =  0:00:00.4499

In [18]:
print("averge time to run mode is : ", 0.4170359, "seconds")
list_comp_time

averge time to run drop is :  0.35481595 seconds


[datetime.timedelta(microseconds=321229),
 datetime.timedelta(microseconds=473305),
 datetime.timedelta(microseconds=386117),
 datetime.timedelta(microseconds=387178),
 datetime.timedelta(microseconds=423752),
 datetime.timedelta(microseconds=426882),
 datetime.timedelta(microseconds=306598),
 datetime.timedelta(microseconds=408167),
 datetime.timedelta(microseconds=431302),
 datetime.timedelta(microseconds=379381),
 datetime.timedelta(microseconds=487715),
 datetime.timedelta(microseconds=356854),
 datetime.timedelta(microseconds=444141),
 datetime.timedelta(microseconds=328721),
 datetime.timedelta(microseconds=493299),
 datetime.timedelta(microseconds=449982),
 datetime.timedelta(microseconds=439104),
 datetime.timedelta(microseconds=421484),
 datetime.timedelta(microseconds=503663),
 datetime.timedelta(microseconds=414243),
 datetime.timedelta(microseconds=474638)]

In [19]:
model_metrics_mode

{'start_lat': {'MAE': 1.5721555184883094,
  'MSE': 5.000266818358792,
  'RMSE': 2.23612763910265,
  'R2': 0.5271739074737682},
 'start_lon': {'MAE': 1.573616678407497,
  'MSE': 5.007385739067627,
  'RMSE': 2.2377188695337997,
  'R2': 0.526500741104045},
 'end_lat': {'MAE': 1.5722592342381496,
  'MSE': 5.000713592440277,
  'RMSE': 2.2362275359274775,
  'R2': 0.527131660439587},
 'end_lon': {'MAE': 1.5738191643862862,
  'MSE': 5.004428276422789,
  'RMSE': 2.2370579510649224,
  'R2': 0.526780399281688},
 'usertype': {'MAE': 1.584233998072505,
  'MSE': 5.068906250143438,
  'RMSE': 2.2514231610568984,
  'R2': 0.5206833509689042},
 'hour': {'MAE': 1.5749051899787996,
  'MSE': 5.014413172077741,
  'RMSE': 2.2392885414965487,
  'R2': 0.5258362258268877},
 'temp': {'MAE': 1.5725735644807857,
  'MSE': 5.001005303982387,
  'RMSE': 2.2362927590059374,
  'R2': 0.5271040761458661},
 'feelslike': {'MAE': 1.5724330600167897,
  'MSE': 5.000231855200692,
  'RMSE': 2.2361198212977524,
  'R2': 0.527177213

### Mean Replacement

In [20]:
variables_for_mean_amputation = ['start_lat','start_lon','end_lat','end_lon','hour',
                                 'temp','feelslike','dew','snowdepth','winddir','visibility','sealevelpressure',
                                 'solarradiation','dist','birthyear']

In [21]:
model_metrics_mean ={}
list_comp_time = []
for var in variables_for_mean_amputation:
    print("Now model is running for varaible ====", var)
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_train , y_train, X_test , y_test = ampute_each_variables(var,"mean",X_train,y_train,X_test,y_test)
    X_train, X_test = one_hot_encoding(X_train,X_test)
    model_reg = model_evaluation(X_train,y_train)
    y_pred = model_reg.predict(X_test)
    model_metrics_mean[var] = {"MAE":mean_absolute_error(y_test, y_pred),"MSE":mean_squared_error(y_test, y_pred),
                          "RMSE":np.sqrt(mean_squared_error(y_test, y_pred)),"R2":r2_score(y_test, y_pred)}
    del X_train
    del X_test
    del y_train
    del y_test

Now model is running for varaible ==== start_lat
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 100938
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  100938
sample size is = 25235
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  25235
Simulate Imputation for Mean
Train after Mean Imputation (252346, 21) (252346,)
Test after Mean Imputation (63087, 21) (63087,)
Time taken simulate for variable start_lat =  0:00:00.453122
2023-03-25 13:50:18.427895
2023-03-25 13:50:18.738727
Now model is running for varaible ==== start_lon
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 100938
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  100938
sample size is = 25235
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  25235
Simulate Imputation for Mean
Train after Mean Imputation (252346, 21) (25

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 100938
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  100938
sample size is = 25235
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  25235
Simulate Imputation for Mean
Train after Mean Imputation (252346, 21) (252346,)
Test after Mean Imputation (63087, 21) (63087,)
Time taken simulate for variable birthyear =  0:00:00.547168
2023-03-25 13:54:18.212494
2023-03-25 13:54:18.773064


In [22]:
print("averge time to run mean is : ", 0.4383816, "seconds")
list_comp_time

averge time to run drop is :  0.35481595 seconds


[datetime.timedelta(microseconds=453122),
 datetime.timedelta(microseconds=464613),
 datetime.timedelta(microseconds=392219),
 datetime.timedelta(microseconds=406325),
 datetime.timedelta(microseconds=440854),
 datetime.timedelta(microseconds=429584),
 datetime.timedelta(microseconds=323697),
 datetime.timedelta(microseconds=436015),
 datetime.timedelta(microseconds=408341),
 datetime.timedelta(microseconds=484573),
 datetime.timedelta(microseconds=446756),
 datetime.timedelta(microseconds=444225),
 datetime.timedelta(microseconds=415512),
 datetime.timedelta(microseconds=482721),
 datetime.timedelta(microseconds=547168)]

In [23]:
model_metrics_mean

{'start_lat': {'MAE': 1.5721471578792463,
  'MSE': 5.000139174111262,
  'RMSE': 2.236099097560585,
  'R2': 0.5271859775358241},
 'start_lon': {'MAE': 1.5736120365835713,
  'MSE': 5.007394272816231,
  'RMSE': 2.237720776329395,
  'R2': 0.5264999341513056},
 'end_lat': {'MAE': 1.5722384607635904,
  'MSE': 5.0004343416614585,
  'RMSE': 2.2361650971387284,
  'R2': 0.5271580664413825},
 'end_lon': {'MAE': 1.573764050718548,
  'MSE': 5.004177943378721,
  'RMSE': 2.2370019989661882,
  'R2': 0.5268040708174992},
 'hour': {'MAE': 1.5740042857372634,
  'MSE': 5.008694647906655,
  'RMSE': 2.2380113154107724,
  'R2': 0.526376970458535},
 'temp': {'MAE': 1.5725606180638567,
  'MSE': 5.000980572160991,
  'RMSE': 2.2362872293515856,
  'R2': 0.5271064147911617},
 'feelslike': {'MAE': 1.5724268288254577,
  'MSE': 5.000227668407885,
  'RMSE': 2.2361188851239295,
  'R2': 0.5271776094998766},
 'dew': {'MAE': 1.5722344803550314,
  'MSE': 4.999604181491268,
  'RMSE': 2.235979468038843,
  'R2': 0.52723656653

### Sampling Replacement

In [24]:
model_metrics_sample ={}
list_comp_time = []
for var in features_for_amputation:
    print("Now model is running for varaible ====", var)
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_train , y_train, X_test , y_test = ampute_each_variables(var,"sample",X_train,y_train,X_test,y_test)
    X_train, X_test = one_hot_encoding(X_train,X_test)
    model_reg = model_evaluation(X_train,y_train)
    y_pred = model_reg.predict(X_test)
    model_metrics_sample[var] = {"MAE":mean_absolute_error(y_test, y_pred),"MSE":mean_squared_error(y_test, y_pred),
                          "RMSE":np.sqrt(mean_squared_error(y_test, y_pred)),"R2":r2_score(y_test, y_pred)}
    del X_train
    del X_test
    del y_train
    del y_test

Now model is running for varaible ==== start_lat
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 100938
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  100938
sample size is = 25235
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  25235
Simple Imputation using sampling
Time taken simulate for variable start_lat =  0:00:00.054690
2023-03-25 13:54:35.394026
2023-03-25 13:54:35.769837
Now model is running for varaible ==== start_lon
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 100938
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  100938
sample size is = 25235
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  25235
Simple Imputation using sampling
Time taken simulate for variable start_lon =  0:00:00.040660
2023-03-25 13:54:49.282165
2023-03-25 13:54:49.641110
Now model is running

2023-03-25 13:58:24.189961
2023-03-25 13:58:24.588688
Now model is running for varaible ==== holiday
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 100938
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  100938
sample size is = 25235
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  25235
Simple Imputation using sampling
Time taken simulate for variable holiday =  0:00:00.100684
2023-03-25 13:58:38.949688
2023-03-25 13:58:39.324355
Now model is running for varaible ==== day
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 100938
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  100938
sample size is = 25235
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  25235
Simple Imputation using sampling
Time taken simulate for variable day =  0:00:00.086517
2023-03-25 13:58:51.756981
2023-03-2

In [25]:
print("averge time to run sample is : ", 0.739964, "seconds")
list_comp_time

averge time to run drop is :  0.35481595 seconds


[datetime.timedelta(microseconds=54690),
 datetime.timedelta(microseconds=40660),
 datetime.timedelta(microseconds=45014),
 datetime.timedelta(microseconds=38003),
 datetime.timedelta(microseconds=83547),
 datetime.timedelta(microseconds=69562),
 datetime.timedelta(microseconds=61713),
 datetime.timedelta(microseconds=38983),
 datetime.timedelta(microseconds=65811),
 datetime.timedelta(microseconds=63192),
 datetime.timedelta(microseconds=70184),
 datetime.timedelta(microseconds=69513),
 datetime.timedelta(microseconds=59073),
 datetime.timedelta(microseconds=56606),
 datetime.timedelta(microseconds=129767),
 datetime.timedelta(microseconds=50348),
 datetime.timedelta(microseconds=118740),
 datetime.timedelta(microseconds=100684),
 datetime.timedelta(microseconds=86517),
 datetime.timedelta(microseconds=140604),
 datetime.timedelta(microseconds=110714)]

In [26]:
model_metrics_sample

{'start_lat': {'MAE': 1.5720913518611388,
  'MSE': 5.000163820032112,
  'RMSE': 2.2361046084725356,
  'R2': 0.5271836470132989},
 'start_lon': {'MAE': 1.5738936507082024,
  'MSE': 5.008726592925765,
  'RMSE': 2.2380184523202136,
  'R2': 0.5263739497320197},
 'end_lat': {'MAE': 1.5721622707824663,
  'MSE': 5.001062165281423,
  'RMSE': 2.2363054722647853,
  'R2': 0.5270986993316248},
 'end_lon': {'MAE': 1.5742167112695697,
  'MSE': 5.006080348072839,
  'RMSE': 2.23742717156846,
  'R2': 0.5266241791015971},
 'usertype': {'MAE': 1.5903511119131941,
  'MSE': 5.100343832021567,
  'RMSE': 2.2583940825333313,
  'R2': 0.5177106078058127},
 'hour': {'MAE': 1.5750620753973457,
  'MSE': 5.015437742448758,
  'RMSE': 2.239517301216661,
  'R2': 0.5257393422759404},
 'temp': {'MAE': 1.572590351524022,
  'MSE': 5.001011644715819,
  'RMSE': 2.2362941766940723,
  'R2': 0.5271034765650195},
 'feelslike': {'MAE': 1.5724303507581607,
  'MSE': 5.000219551996955,
  'RMSE': 2.2361170702798536,
  'R2': 0.527178