In [15]:
import pandas as pd
import numpy as np
import random
import time
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

from sklearn.preprocessing import OneHotEncoder


In [16]:
tripdata = pd.read_csv("../../dataset/final_bike_sharing.csv")

In [17]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [18]:
df = tripdata[selected_features]

In [19]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [20]:
def getSampleSize(df, perc=1):
    return round(perc/100 * df.shape[0])

def getAnIndex(index):
    li = []
    for i in index:
        li.append(i)
    return li  
        
random.seed(100)
def induceMissingValues(X_train,col_name, perc):
    random.seed(100)
    index = getAnIndex(X_train.index)
    sample_size = getSampleSize(X_train, perc)
    selected_index = random.sample(index, sample_size)
    for i in selected_index:
        if col_name in X_train.columns:
            X_train.at[i,col_name]= np.NaN
        else:
            X_train.loc[i] = np.NaN
    return X_train


In [21]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [22]:
def modelEvaluation(X_train,y_train):
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    return model_lr

In [23]:
fill_dict ={'start_lat': 40.79865054, 'start_lon': -74.2404433, 'end_lat': 40.79165072 , 'end_lon':-74.28593088,'usertype':'agent',
            'hour': 24, 'temp': 38.6, 'feelslike':49.4,'dew':-25.7,'snowdepth':5.10,'winddir':380.0,'sealevelpressure':819.3,
            'visibility':7.2,'solarradiation': 1109,'conditions': 'muddy','dist': 5.345,'birthyear': 2003,
            'holiday':'unknown','day': 'special','month': 'unknown','gender': 'nonbinary'
        }

In [24]:
def ampute_each_variables(variable, method, X_train, y_train, X_test, y_test):
    X_train  = induceMissingValues(X_train,variable,perc=10)
    print('Train set after Amputation', X_train.shape, y_train.shape)
    print("X_train missing value count of varaible ", X_train.isnull().sum()[variable])
    X_test  = induceMissingValues(X_test,variable,perc=10)
    print('Test set after Amputation', X_test.shape, y_test.shape)
    print("X_test missing value count of varaible ", X_test.isnull().sum()[variable])
    start = datetime.datetime.now() 
    if method == "bad":
        print("Simulate Imputation for NA")
        X_train.fillna(fill_dict[variable], inplace=True)
        print('Train  after imputation', X_train.shape, y_train.shape)
        X_test.fillna(fill_dict[variable], inplace=True)
        print('Test  after imputation', X_test.shape, y_test.shape)
    end = datetime.datetime.now()
    comp_time = (end-start)
    print(f"Time taken simulate for variable {var} = ", comp_time)
    list_comp_time.append(comp_time)
    return X_train , y_train, X_test , y_test

In [25]:
variables = ['start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [26]:
result = {}
list_comp_time =[]
for var in variables:
    print("Now running model is ....", var)
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_train , y_train, X_test , y_test = ampute_each_variables(var,"bad",X_train,y_train,X_test,y_test)
    X_train, X_test = one_hot_encoding(X_train,X_test)
    model_reg = modelEvaluation(X_train,y_train)
    y_pred = model_reg.predict(X_test)
    result[var] = {"MAE":mean_absolute_error(y_test, y_pred),"MSE":mean_squared_error(y_test, y_pred),
                          "RMSE":np.sqrt(mean_squared_error(y_test, y_pred)),"R2":r2_score(y_test, y_pred)}
    del X_train
    del X_test
    del y_train
    del y_test

Now running model is .... start_lat
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  25235
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  6309
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable start_lat =  0:00:00.254737
Now running model is .... start_lon
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  25235
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  6309
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable start_lon =  0:00:00.275572
Now running model is .... end_lat
Train (252346, 21) (252346,)
Test (63087, 21) (6

Now running model is .... day
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  25235
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  6309
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable day =  0:00:00.269923
Now running model is .... month
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  25235
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  6309
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable month =  0:00:00.250176
Now running model is .... gender
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set afte

In [27]:
print("averge time to run drop is : ", 0.283314.7619)
list_comp_time

averge time to run drop is :  4.117488


[datetime.timedelta(microseconds=254737),
 datetime.timedelta(microseconds=275572),
 datetime.timedelta(microseconds=251984),
 datetime.timedelta(microseconds=244787),
 datetime.timedelta(microseconds=274983),
 datetime.timedelta(microseconds=358884),
 datetime.timedelta(microseconds=309889),
 datetime.timedelta(microseconds=297828),
 datetime.timedelta(microseconds=276455),
 datetime.timedelta(microseconds=273948),
 datetime.timedelta(microseconds=321664),
 datetime.timedelta(microseconds=255113),
 datetime.timedelta(microseconds=309925),
 datetime.timedelta(microseconds=333591),
 datetime.timedelta(microseconds=336250),
 datetime.timedelta(microseconds=292924),
 datetime.timedelta(microseconds=311910),
 datetime.timedelta(microseconds=222024),
 datetime.timedelta(microseconds=269923),
 datetime.timedelta(microseconds=250176),
 datetime.timedelta(microseconds=227043)]

In [28]:
result

{'start_lat': {'MAE': 1.5720494510379117,
  'MSE': 5.000746224651677,
  'RMSE': 2.236234832179232,
  'R2': 0.5271285747320503},
 'start_lon': {'MAE': 1.5741888260258632,
  'MSE': 5.010694738378818,
  'RMSE': 2.238458116288714,
  'R2': 0.5261878415586172},
 'end_lat': {'MAE': 1.5722966067591826,
  'MSE': 5.002762549148604,
  'RMSE': 2.2366856169673475,
  'R2': 0.5269379107399517},
 'end_lon': {'MAE': 1.5750478241370465,
  'MSE': 5.008927464419617,
  'RMSE': 2.2380633289564478,
  'R2': 0.5263549552889253},
 'usertype': {'MAE': 1.57571121018561,
  'MSE': 5.021296046451434,
  'RMSE': 2.240824858495513,
  'R2': 0.5251853800393163},
 'hour': {'MAE': 1.5740976714484938,
  'MSE': 5.009459937899822,
  'RMSE': 2.238182284332494,
  'R2': 0.5263046045048242},
 'temp': {'MAE': 1.57259996884339,
  'MSE': 5.001073886093155,
  'RMSE': 2.2363080928380943,
  'R2': 0.5270975910096465},
 'feelslike': {'MAE': 1.572440498034621,
  'MSE': 5.0002763833688135,
  'RMSE': 2.236129777845824,
  'R2': 0.52717300300