In [37]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression
import time
import datetime

In [38]:
tripdata = pd.read_csv("../../dataset/final_bike_sharing.csv")

In [39]:
tripdata.head(3)

Unnamed: 0,index,tripduration,start_station_id,start_station_name,start_lat,start_lon,end_station_id,end_station_name,end_lat,end_lon,...,conditions,date,dist,birthyear,years_old,holiday,day,month,seasons,gender
0,259617,1.02,others,Manila & 1st,40.721651,-74.042884,others,Manila & 1st,40.721651,-74.042884,...,Clear,2018-09-05,1.783012e-12,1994,24,working_day,weekday,September,autumn,male
1,283363,1.02,others,Heights Elevator,40.748716,-74.040443,others,Heights Elevator,40.748716,-74.040443,...,cloudy_rain,2018-10-06,1.782506e-12,1994,24,working_day,weekend,October,autumn,male
2,169168,1.02,others,Van Vorst Park,40.718489,-74.047727,es_3185,City Hall,40.717732,-74.043845,...,Clear,2018-07-09,0.3384331,1963,55,working_day,weekday,July,summer,female


In [40]:
tripdata.columns

Index(['index', 'tripduration', 'start_station_id', 'start_station_name',
       'start_lat', 'start_lon', 'end_station_id', 'end_station_name',
       'end_lat', 'end_lon', 'bikeid', 'usertype', 'hour', 'min', 'temp',
       'feelslike', 'dew', 'humidity', 'precip', 'precipprob', 'snow',
       'snowdepth', 'windspeed', 'winddir', 'sealevelpressure', 'cloudcover',
       'visibility', 'solarradiation', 'uvindex', 'conditions', 'date', 'dist',
       'birthyear', 'years_old', 'holiday', 'day', 'month', 'seasons',
       'gender'],
      dtype='object')

## Selecting variables according to P values

In [41]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [42]:
df = tripdata[selected_features]

In [43]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

### AMputation with 5% NA values

In [44]:
## get the sample size such as 5%, 10%,20% etc
def getSampleSize(df, perc=1):
    return round(perc/100 * df.shape[0])

def getAnIndex(index):
    li = []
    for i in index:
        li.append(i)
    return li   

def induceMissingValues(X_train,col_name, perc):
    random.seed(100)
    index = getAnIndex(X_train.index)
    sample_size = getSampleSize(X_train, perc)
    print("sample size is =", sample_size)
    selected_index = random.sample(index, sample_size)
    for i in selected_index:
        X_train.loc[i, col_name]= np.NaN
    return X_train

###====================####
def convert_to_dataframe(y_train):
    return pd.DataFrame(y_train)

def convert_to_series(y_train):
    return y_train.squeeze()

def induceMissingValuesForDrop(X_train,y_train,col_name, perc):
    random.seed(100)
    index = getAnIndex(X_train.index)
    sample_size = getSampleSize(X_train, perc)
    print("sample size is =", sample_size)
    selected_index = random.sample(index, sample_size)
    y_train = convert_to_dataframe(y_train)
    for i in selected_index:
        y_train.loc[i] = np.NaN
        X_train.loc[i,col_name]= np.NaN
    y_train = convert_to_series(y_train)
    return X_train, y_train

In [45]:
def sample_imputation_X(var,X_train,X_test):
    # extract a random sample
    random_sample_train = X_train[var].dropna().sample(X_train[var].isnull().sum(), random_state=0)
    random_sample_test = X_test[var].dropna().sample(X_test[var].isnull().sum(), random_state=0)
     # re-index the randomly extracted sample
    random_sample_train.index = X_train[X_train[var].isnull()].index
    random_sample_test.index = X_test[X_test[var].isnull()].index

    # replace the NA
    X_train.loc[X_train[var].isnull(), var] = random_sample_train
    X_test.loc[X_test[var].isnull(), var] = random_sample_test
    return X_train, X_test

In [46]:
fill_dict ={'start_lat': 40.79865054, 'start_lon': -74.2404433, 'end_lat': 40.79165072 , 'end_lon':-74.28593088,'usertype':'agent',
            'hour': 24, 'temp': 38.6, 'feelslike':49.4,'dew':-25.7,'snowdepth':5.10,'winddir':380.0,'sealevelpressure':819.3,
            'visibility':7.2,'solarradiation': 1109,'conditions': 'muddy','dist': 5.345,'birthyear': 2003,
            'holiday':'unknown','day': 'special','month': 'unknown','gender': 'nonbinary'
        }

In [47]:
def ampute_each_variables(variable, method, X_train, y_train, X_test, y_test):
    if method == "remove":
        X_train , y_train = induceMissingValuesForDrop(X_train,y_train,variable,perc=5)
        print('Train set after Amputation', X_train.shape, y_train.shape)
        print("y_train Amputed value count of varaible ", y_train.isnull().sum())
        print("X_train Amputed value count of varaible ", X_train.isnull().sum()[variable])
        X_test , y_test = induceMissingValuesForDrop(X_test,y_test,variable,perc=5)
        print('Test set after Amputation', X_test.shape, y_test.shape)
        print("y_test Amputed value count of varaible ", y_test.isnull().sum())
        print("X_test Amputed value count of varaible ", X_test.isnull().sum()[variable])
    else:
        X_train  = induceMissingValues(X_train,variable,perc=5)
        print('Train set after Amputation', X_train.shape, y_train.shape)
        print("X_train Amputed value count of varaible ", X_train.isnull().sum()[variable])
        X_test  = induceMissingValues(X_test,variable,perc=5)
        print('Test set after Amputation', X_test.shape, y_test.shape)
        print("X_test Amputed value count of varaible ", X_test.isnull().sum()[variable])
    
    if method == "remove":
        print("Simulate Imputation for NA")
        X_train.dropna(axis=0, inplace=True)
        y_train.dropna(inplace=True)
        print('Train after NA value removal Imputation', X_train.shape, y_train.shape)
        X_test.dropna(axis=0, inplace=True)
        y_test.dropna(inplace=True)
        print('Test after NA value removal Imputation', X_test.shape, y_test.shape)
    if method == "mode":
        print("Simulate Imputation for Mode")
        X_train.fillna(X_train[variable].mode()[0], inplace=True)
        print('Train after Mode Imputation', X_train.shape, y_train.shape)
        X_test.fillna(X_test[variable].mode()[0], inplace=True)
        print('Test after Mode Imputation', X_test.shape, y_test.shape)
    if method == "mean":
        print("Simulate Imputation for Mean")
        X_train.fillna(X_train[variable].mean(), inplace=True)
        print('Train after Mean Imputation', X_train.shape, y_train.shape)
        X_test.fillna(X_test[variable].mean(), inplace=True)
        print('Test after Mean Imputation', X_test.shape, y_test.shape)
    if method == "sample":
        print("Simple Imputation using sampling")
        X_train, X_test = sample_imputation_X(variable, X_train, X_test)
    if method == "bad":
        print("Simulate Imputation for BAD")
        X_train.fillna(fill_dict[variable], inplace=True)
        print('Train  after imputation', X_train.shape, y_train.shape)
        X_test.fillna(fill_dict[variable], inplace=True)
        print('Test  after imputation', X_test.shape, y_test.shape)
            
    return X_train , y_train, X_test , y_test

In [48]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [49]:
def model_evaluation(X_train,y_train):
    print(datetime.datetime.now())
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    print(datetime.datetime.now())
    return model_lr

In [50]:
features_for_amputation = ['start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

## 1. Drop NA values 

In [28]:
bootstrap_data_drop ={}
for var in features_for_amputation:
    print("Now running model is ....", var)
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_train , y_train, X_test , y_test = ampute_each_variables(var,"remove",X_train,y_train,X_test,y_test)
    X_train, X_test = one_hot_encoding(X_train,X_test)
    X_train = convert_to_dataframe(X_train)
    X_test = convert_to_dataframe(X_test)
    model_lr = LinearRegression(fit_intercept=True)
    rng = np.random.RandomState(1234)
    idx_train = np.arange(y_train.shape[0])
    idx_test = np.arange(y_test.shape[0])
    bootstarp_train_accuracies = {}
    print(f"started bootstrap run is ", datetime.datetime.now())
    for i in range(501):
        train_idx = rng.choice(idx_train, size=idx_train.shape[0], replace=True)
        test_idx = rng.choice(idx_test, size=idx_test.shape[0], replace=True)

        boot_train_X, boot_train_y = X_train.iloc[train_idx], y_train.iloc[train_idx]

        boot_test_X, boot_test_y = X_test.iloc[test_idx], y_test.iloc[test_idx]

        model_lr.fit(boot_test_X, boot_test_y )
        boot_y_pred = model_lr.predict(boot_test_X)
        bootstarp_train_accuracies[i] = {"RMSE": np.sqrt(mean_squared_error(boot_test_y, boot_y_pred)),
                                      "R2":r2_score(boot_test_y, boot_y_pred), "intercept":model_lr.intercept_}
    print(f"stopped bootstrap run is ", datetime.datetime.now())   
    R2 = []
    RMSE = []
    intercept = []
    for i in range(501):
        R2.append(bootstarp_train_accuracies[i]['R2'])
        RMSE.append(bootstarp_train_accuracies[i]['RMSE'])
        intercept.append(bootstarp_train_accuracies[i]['intercept'])
    bootstrap_data_drop[var] = {"R2": [np.percentile(R2, 2.5), np.percentile(R2, 97.5)],
                         "RMSE": [np.percentile(RMSE, 2.5), np.percentile(RMSE, 97.5)],
                         "intercept": [np.percentile(intercept, 2.5), np.percentile(intercept, 97.5)]}

Now running model is .... start_lat
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
y_train Amputed value count of varaible  12617
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
y_test Amputed value count of varaible  3154
X_test Amputed value count of varaible  3154
Simulate Imputation for NA
Train after NA value removal Imputation (239729, 21) (239729,)
Test after NA value removal Imputation (59933, 21) (59933,)
started bootstrap run is  2023-03-19 18:32:56.814867
stopped bootstrap run is  2023-03-19 18:34:17.928673
Now running model is .... start_lon
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
y_train Amputed value count of varaible  12617
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)


X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
y_test Amputed value count of varaible  3154
X_test Amputed value count of varaible  3154
Simulate Imputation for NA
Train after NA value removal Imputation (239729, 21) (239729,)
Test after NA value removal Imputation (59933, 21) (59933,)
started bootstrap run is  2023-03-19 18:50:28.529330
stopped bootstrap run is  2023-03-19 18:51:55.443995
Now running model is .... solarradiation
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
y_train Amputed value count of varaible  12617
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
y_test Amputed value count of varaible  3154
X_test Amputed value count of varaible  3154
Simulate Imputation for NA
Train after NA value removal Imputation (239729, 21) (239729,)
Test after NA value remova

In [29]:
bootstrap_data_drop

{'start_lat': {'R2': [0.5193583566525888, 0.5354949471218593],
  'RMSE': [2.2093593522529327, 2.2602014167673676],
  'intercept': [-675.3220216420204, -357.37577535847026]},
 'start_lon': {'R2': [0.5193583566525888, 0.5354949471218593],
  'RMSE': [2.2093593522529327, 2.2602014167673676],
  'intercept': [-675.3220216420204, -357.37577535847026]},
 'end_lat': {'R2': [0.5193583566525888, 0.5354949471218593],
  'RMSE': [2.2093593522529327, 2.2602014167673676],
  'intercept': [-675.3220216420204, -357.37577535847026]},
 'end_lon': {'R2': [0.5193583566525888, 0.5354949471218593],
  'RMSE': [2.2093593522529327, 2.2602014167673676],
  'intercept': [-675.3220216420204, -357.37577535847026]},
 'usertype': {'R2': [0.5193583566525888, 0.5354949471218593],
  'RMSE': [2.2093593522529327, 2.2602014167673676],
  'intercept': [-675.3220216420204, -357.37577535847026]},
 'hour': {'R2': [0.5193583566525888, 0.5354949471218593],
  'RMSE': [2.2093593522529327, 2.2602014167673676],
  'intercept': [-675.3220

### Mode Replacement

In [30]:
bootstrap_data_mode ={}
for var in features_for_amputation:
    print("Now running model is ....", var)
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_train , y_train, X_test , y_test = ampute_each_variables(var,"mode",X_train,y_train,X_test,y_test)
    X_train, X_test = one_hot_encoding(X_train,X_test)
    X_train = convert_to_dataframe(X_train)
    X_test = convert_to_dataframe(X_test)
    model_lr = LinearRegression(fit_intercept=True)
    rng = np.random.RandomState(1234)
    idx_train = np.arange(y_train.shape[0])
    idx_test = np.arange(y_test.shape[0])
    bootstarp_train_accuracies = {}
    print(f"started bootstrap run is ", datetime.datetime.now())
    for i in range(501):
        train_idx = rng.choice(idx_train, size=idx_train.shape[0], replace=True)
        test_idx = rng.choice(idx_test, size=idx_test.shape[0], replace=True)

        boot_train_X, boot_train_y = X_train.iloc[train_idx], y_train.iloc[train_idx]

        boot_test_X, boot_test_y = X_test.iloc[test_idx], y_test.iloc[test_idx]

        model_lr.fit(boot_test_X, boot_test_y )
        boot_y_pred = model_lr.predict(boot_test_X)
        bootstarp_train_accuracies[i] = {"RMSE": np.sqrt(mean_squared_error(boot_test_y, boot_y_pred)),
                                      "R2":r2_score(boot_test_y, boot_y_pred), "intercept":model_lr.intercept_}
    print(f"stopped bootstrap run is ", datetime.datetime.now())  
    
    R2 = []
    RMSE = []
    intercept = []
    for i in range(501):
        R2.append(bootstarp_train_accuracies[i]['R2'])
        RMSE.append(bootstarp_train_accuracies[i]['RMSE'])
        intercept.append(bootstarp_train_accuracies[i]['intercept'])
    bootstrap_data_mode[var] = {"R2": [np.percentile(R2, 2.5), np.percentile(R2, 97.5)],
                         "RMSE": [np.percentile(RMSE, 2.5), np.percentile(RMSE, 97.5)],
                         "intercept": [np.percentile(intercept, 2.5), np.percentile(intercept, 97.5)]}

Now running model is .... start_lat
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  3154
Simulate Imputation for Mode
Train after Mode Imputation (252346, 21) (252346,)
Test after Mode Imputation (63087, 21) (63087,)
started bootstrap run is  2023-03-19 19:04:28.301284
stopped bootstrap run is  2023-03-19 19:06:15.085945
Now running model is .... start_lon
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  3154
Simulate Imputation for Mode
Train after Mode Imputation (252346, 21) (252346,)
Test after Mode Imputation (63087, 2

started bootstrap run is  2023-03-19 19:27:33.581869
stopped bootstrap run is  2023-03-19 19:29:25.697144
Now running model is .... dist
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  3154
Simulate Imputation for Mode
Train after Mode Imputation (252346, 21) (252346,)
Test after Mode Imputation (63087, 21) (63087,)
started bootstrap run is  2023-03-19 19:29:29.058206
stopped bootstrap run is  2023-03-19 19:30:58.713691
Now running model is .... birthyear
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  3154
Simulate Imputa

In [31]:
bootstrap_data_mode

{'start_lat': {'R2': [0.5196966891449158, 0.5358224977617596],
  'RMSE': [2.2082401957947155, 2.2557015649284446],
  'intercept': [-652.3867723642029, -361.50551067170284]},
 'start_lon': {'R2': [0.5197232562474134, 0.5358441818351345],
  'RMSE': [2.2081612382746005, 2.255623657435196],
  'intercept': [-627.0549636280718, -328.2646572252355]},
 'end_lat': {'R2': [0.5197413072836579, 0.5358180360779907],
  'RMSE': [2.208229621681374, 2.255618822445091],
  'intercept': [-660.1446645009394, -369.08462922363213]},
 'end_lon': {'R2': [0.5197161805742122, 0.5357618170661198],
  'RMSE': [2.208397164897356, 2.2558006352928053],
  'intercept': [-679.8307001180228, -388.3938416647466]},
 'usertype': {'R2': [0.5187876557463108, 0.5349024594881018],
  'RMSE': [2.2110386524112275, 2.257986470528337],
  'intercept': [-648.5949412429218, -365.45309617577584]},
 'hour': {'R2': [0.5194551584918838, 0.5354731660884473],
  'RMSE': [2.209093309687533, 2.2562314830586],
  'intercept': [-648.7474237951167, 

### Mean Replacement

In [32]:
variables_for_mean_amputation = ['start_lat','start_lon','end_lat','end_lon','hour',
                                 'temp','feelslike','snowdepth','winddir','visibility','sealevelpressure',
                                 'solarradiation','dist','birthyear']

bootstrap_data_mean = {}
for var in variables_for_mean_amputation:
    print("Now running model is ....", var)
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_train , y_train, X_test , y_test = ampute_each_variables(var,"mean",X_train,y_train,X_test,y_test)
    X_train, X_test = one_hot_encoding(X_train,X_test)
    X_train = convert_to_dataframe(X_train)
    X_test = convert_to_dataframe(X_test)
    model_lr = LinearRegression(fit_intercept=True)
    rng = np.random.RandomState(1234)
    idx_train = np.arange(y_train.shape[0])
    idx_test = np.arange(y_test.shape[0])
    bootstarp_train_accuracies = {}
    print(f"started bootstrap run is ", datetime.datetime.now())
    for i in range(501):
        train_idx = rng.choice(idx_train, size=idx_train.shape[0], replace=True)
        test_idx = rng.choice(idx_test, size=idx_test.shape[0], replace=True)

        boot_train_X, boot_train_y = X_train.iloc[train_idx], y_train.iloc[train_idx]

        boot_test_X, boot_test_y = X_test.iloc[test_idx], y_test.iloc[test_idx]

        model_lr.fit(boot_test_X, boot_test_y )
        boot_y_pred = model_lr.predict(boot_test_X)
        bootstarp_train_accuracies[i] = {"RMSE": np.sqrt(mean_squared_error(boot_test_y, boot_y_pred)),
                                      "R2":r2_score(boot_test_y, boot_y_pred), "intercept":model_lr.intercept_}
    print(f"stopped bootstrap run is ", datetime.datetime.now())  
    R2 = []
    RMSE = []
    intercept = []
    for i in range(501):
        R2.append(bootstarp_train_accuracies[i]['R2'])
        RMSE.append(bootstarp_train_accuracies[i]['RMSE'])
        intercept.append(bootstarp_train_accuracies[i]['intercept'])
    bootstrap_data_mean[var] = {"R2": [np.percentile(R2, 2.5), np.percentile(R2, 97.5)],
                         "RMSE": [np.percentile(RMSE, 2.5), np.percentile(RMSE, 97.5)],
                         "intercept": [np.percentile(intercept, 2.5), np.percentile(intercept, 97.5)]}

Now running model is .... start_lat
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  3154
Simulate Imputation for Mean
Train after Mean Imputation (252346, 21) (252346,)
Test after Mean Imputation (63087, 21) (63087,)
started bootstrap run is  2023-03-19 19:38:47.969327
stopped bootstrap run is  2023-03-19 19:40:29.707855
Now running model is .... start_lon
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  3154
Simulate Imputation for Mean
Train after Mean Imputation (252346, 21) (252346,)
Test after Mean Imputation (63087, 2

In [33]:
bootstrap_data_mean

{'start_lat': {'R2': [0.5197031582460789, 0.5358294384994136],
  'RMSE': [2.2082019984010794, 2.2556911086836386],
  'intercept': [-654.4430596171658, -365.0756293824451]},
 'start_lon': {'R2': [0.5197201025168072, 0.5358424921641616],
  'RMSE': [2.2081781404953365, 2.2556216186474796],
  'intercept': [-625.5296395490195, -328.6577146481641]},
 'end_lat': {'R2': [0.5197649878773352, 0.5358292909773931],
  'RMSE': [2.208183608172015, 2.2556067096784833],
  'intercept': [-663.4064047921456, -372.92885870377796]},
 'end_lon': {'R2': [0.5197311069597721, 0.5357779694978122],
  'RMSE': [2.2083416353153007, 2.2557894367809244],
  'intercept': [-683.6323026825942, -390.455325191927]},
 'hour': {'R2': [0.519656428983695, 0.5356958112610921],
  'RMSE': [2.2084353229858493, 2.2558128629425984],
  'intercept': [-645.4571831183515, -365.59810317685196]},
 'temp': {'R2': [0.5198262392598926, 0.5358367014464007],
  'RMSE': [2.2083528261024155, 2.2556431836262467],
  'intercept': [-646.3342929179712,

### Sampling Replacement

In [34]:
bootstrap_data_sample ={}
for var in features_for_amputation:
    print("Now running model is ....", var)
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_train , y_train, X_test , y_test = ampute_each_variables(var,"sample",X_train,y_train,X_test,y_test)
    X_train, X_test = one_hot_encoding(X_train,X_test)
    X_train = convert_to_dataframe(X_train)
    X_test = convert_to_dataframe(X_test)
    model_lr = LinearRegression(fit_intercept=True)
    rng = np.random.RandomState(1234)
    idx_train = np.arange(y_train.shape[0])
    idx_test = np.arange(y_test.shape[0])
    bootstarp_train_accuracies = {}
    print(f"started bootstrap run is ", datetime.datetime.now())
    
    for i in range(501):
        train_idx = rng.choice(idx_train, size=idx_train.shape[0], replace=True)
        test_idx = rng.choice(idx_test, size=idx_test.shape[0], replace=True)

        boot_train_X, boot_train_y = X_train.iloc[train_idx], y_train.iloc[train_idx]

        boot_test_X, boot_test_y = X_test.iloc[test_idx], y_test.iloc[test_idx]

        model_lr.fit(boot_test_X, boot_test_y )
        boot_y_pred = model_lr.predict(boot_test_X)
        bootstarp_train_accuracies[i] = {"RMSE": np.sqrt(mean_squared_error(boot_test_y, boot_y_pred)),
                                      "R2":r2_score(boot_test_y, boot_y_pred), "intercept":model_lr.intercept_}
    print(f"stopped bootstrap run is ", datetime.datetime.now())  
    
    R2 = []
    RMSE = []
    intercept = []
    for i in range(501):
        R2.append(bootstarp_train_accuracies[i]['R2'])
        RMSE.append(bootstarp_train_accuracies[i]['RMSE'])
        intercept.append(bootstarp_train_accuracies[i]['intercept'])
    bootstrap_data_sample[var] = {"R2": [np.percentile(R2, 2.5), np.percentile(R2, 97.5)],
                         "RMSE": [np.percentile(RMSE, 2.5), np.percentile(RMSE, 97.5)],
                         "intercept": [np.percentile(intercept, 2.5), np.percentile(intercept, 97.5)]}

Now running model is .... start_lat
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  3154
Simple Imputation using sampling
started bootstrap run is  2023-03-19 20:01:26.402104
stopped bootstrap run is  2023-03-19 20:02:55.399038
Now running model is .... start_lon
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  3154
Simple Imputation using sampling
started bootstrap run is  2023-03-19 20:02:58.437037
stopped bootstrap run is  2023-03-19 20:04:27.043424
Now running model is .... end_lat
Train (252346, 21) (252346,)
Test (630

started bootstrap run is  2023-03-19 20:27:53.856147
stopped bootstrap run is  2023-03-19 20:29:42.947447
Now running model is .... day
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  3154
Simple Imputation using sampling
started bootstrap run is  2023-03-19 20:29:45.873066
stopped bootstrap run is  2023-03-19 20:31:15.649203
Now running model is .... month
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  3154
Simple Imputation using sampling
started bootstrap run is  2023-03-19 20:31:18.711039
stopped bootstrap run is  202

In [35]:
bootstrap_data_sample

{'start_lat': {'R2': [0.5196782236432508, 0.5358072698210881],
  'RMSE': [2.2082467165578543, 2.2557305818183444],
  'intercept': [-645.1072726943064, -355.14515083362534]},
 'start_lon': {'R2': [0.5197333766239003, 0.5358223604789614],
  'RMSE': [2.2082916152938417, 2.2557438635080445],
  'intercept': [-648.8582175729136, -350.0513039497289]},
 'end_lat': {'R2': [0.5197230572300315, 0.5358031613395056],
  'RMSE': [2.208302823876297, 2.2557048113953804],
  'intercept': [-654.7086016274479, -359.9159190059273]},
 'end_lon': {'R2': [0.5196386665438684, 0.5357320689442884],
  'RMSE': [2.2085299353168915, 2.2559673914292953],
  'intercept': [-661.8554419367782, -366.38305489882214]},
 'usertype': {'R2': [0.5180607074595311, 0.5343375072847507],
  'RMSE': [2.2126528703305537, 2.259613573342188],
  'intercept': [-640.9116058163528, -359.3644067186434]},
 'hour': {'R2': [0.5194888056224659, 0.5355097607471726],
  'RMSE': [2.208885601822524, 2.256248146490078],
  'intercept': [-646.43129469981

### Bad Imputation

In [51]:
bootstrap_data_bad ={}
for var in features_for_amputation:
    print("Now running model is ....", var)
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_train , y_train, X_test , y_test = ampute_each_variables(var,"bad",X_train,y_train,X_test,y_test)
    X_train, X_test = one_hot_encoding(X_train,X_test)
    X_train = convert_to_dataframe(X_train)
    X_test = convert_to_dataframe(X_test)
    model_lr = LinearRegression(fit_intercept=True)
    rng = np.random.RandomState(1234)
    idx_train = np.arange(y_train.shape[0])
    idx_test = np.arange(y_test.shape[0])
    bootstarp_train_accuracies = {}
    print(f"started bootstrap run is ", datetime.datetime.now())
    
    for i in range(501):
        train_idx = rng.choice(idx_train, size=idx_train.shape[0], replace=True)
        test_idx = rng.choice(idx_test, size=idx_test.shape[0], replace=True)

        boot_train_X, boot_train_y = X_train.iloc[train_idx], y_train.iloc[train_idx]

        boot_test_X, boot_test_y = X_test.iloc[test_idx], y_test.iloc[test_idx]

        model_lr.fit(boot_test_X, boot_test_y )
        boot_y_pred = model_lr.predict(boot_test_X)
        bootstarp_train_accuracies[i] = {"RMSE": np.sqrt(mean_squared_error(boot_test_y, boot_y_pred)),
                                      "R2":r2_score(boot_test_y, boot_y_pred), "intercept":model_lr.intercept_}
    print(f"stopped bootstrap run is ", datetime.datetime.now())  
    
    R2 = []
    RMSE = []
    intercept = []
    for i in range(501):
        R2.append(bootstarp_train_accuracies[i]['R2'])
        RMSE.append(bootstarp_train_accuracies[i]['RMSE'])
        intercept.append(bootstarp_train_accuracies[i]['intercept'])
    bootstrap_data_bad[var] = {"R2": [np.percentile(R2, 2.5), np.percentile(R2, 97.5)],
                         "RMSE": [np.percentile(RMSE, 2.5), np.percentile(RMSE, 97.5)],
                         "intercept": [np.percentile(intercept, 2.5), np.percentile(intercept, 97.5)]}

Now running model is .... start_lat
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  3154
Simulate Imputation for BAD
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
started bootstrap run is  2023-03-21 14:02:03.447918
stopped bootstrap run is  2023-03-21 14:04:27.256198
Now running model is .... start_lon
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  3154
Simulate Imputation for BAD
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
starte

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  3154
Simulate Imputation for BAD
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
started bootstrap run is  2023-03-21 14:30:43.810148
stopped bootstrap run is  2023-03-21 14:32:21.698188
Now running model is .... birthyear
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
sample size is = 12617
Train set after Amputation (252346, 21) (252346,)
X_train Amputed value count of varaible  12617
sample size is = 3154
Test set after Amputation (63087, 21) (63087,)
X_test Amputed value count of varaible  3154
Simulate Imputation for BAD
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
started bootstrap run is  2023-03-21 14:32

In [52]:
bootstrap_data_bad

{'start_lat': {'R2': [0.5196583329371154, 0.5357682586672661],
  'RMSE': [2.20839309198837, 2.2558680395679387],
  'intercept': [-594.7160289927955, -309.67015010643115]},
 'start_lon': {'R2': [0.5187012683024217, 0.5349348833769243],
  'RMSE': [2.210549897851905, 2.2582630908569725],
  'intercept': [-928.3616948256048, -649.6632190414645]},
 'end_lat': {'R2': [0.5195710329421586, 0.5356581951375081],
  'RMSE': [2.208639945032872, 2.2561889090887615],
  'intercept': [-555.2203445912039, -273.1447415342349]},
 'end_lon': {'R2': [0.5188061283043195, 0.5349832698683099],
  'RMSE': [2.210145716118359, 2.257795419844994],
  'intercept': [-428.1959276417599, -152.28745223782408]},
 'usertype': {'R2': [0.5188343974349726, 0.5349575603383914],
  'RMSE': [2.2107721480086715, 2.2579217863323002],
  'intercept': [-648.3568407560808, -366.1655023437937]},
 'hour': {'R2': [0.5193416388946419, 0.5353094506907072],
  'RMSE': [2.2092774644379496, 2.25674647175049],
  'intercept': [-649.2603341296058, 