In [2]:
import pandas as pd
import numpy as np
import random
import time
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

from sklearn.preprocessing import OneHotEncoder


In [3]:
# Load the trip table from a CSV file; the path is relative, so it resolves
# against the directory the notebook kernel is running in.
tripdata = pd.read_csv("../../dataset/final_bike_sharing.csv")

In [4]:
# Columns kept for modelling: 'tripduration' (the column split_dataset uses as
# the target) followed by the 21 predictor columns.
selected_features = [
    'tripduration',
    'start_lat', 'start_lon', 'end_lat', 'end_lon',
    'usertype', 'hour',
    'temp', 'feelslike', 'dew', 'snowdepth', 'winddir',
    'sealevelpressure', 'visibility', 'solarradiation',
    'conditions', 'dist', 'birthyear',
    'holiday', 'day', 'month', 'gender',
]

In [5]:
# Keep only the columns named in selected_features (target + predictors).
df = tripdata[selected_features]

In [6]:
def split_dataset(df):
    """Split ``df`` into train and test features/target.

    The 'tripduration' column is the target; every other column is a feature.
    20% of the rows are held out for testing, with a fixed ``random_state`` of
    42 so repeated calls on the same frame produce the same partition.

    Prints the shapes of both partitions and returns the tuple
    ``(X_train, X_test, y_train, y_test)``.
    """
    features = df.drop(columns='tripduration')
    target = df['tripduration']

    parts = train_test_split(features, target, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = parts

    for label, X_part, y_part in (('Train', X_train, y_train), ('Test', X_test, y_test)):
        print(label, X_part.shape, y_part.shape)
    return X_train, X_test, y_train, y_test

In [7]:
def getSampleSize(df, perc=1):
    """Return how many rows correspond to ``perc`` percent of ``df``.

    ``df`` only needs a ``shape`` attribute whose first entry is the row
    count. The result is rounded with the built-in ``round``.
    """
    fraction = perc / 100
    n_rows = df.shape[0]
    return round(fraction * n_rows)

def getAnIndex(index):
    """Return the elements of ``index`` (any iterable) as a new list, in order."""
    return list(index)
        
random.seed(100)


def induceMissingValues(X_train, col_name, perc):
    """Replace a reproducible random sample of entries in ``X_train`` with NaN.

    ``perc`` percent of the row labels (count computed by ``getSampleSize``)
    are drawn with ``random.sample`` after re-seeding the global ``random``
    generator with 100, so the same labels are chosen on every call for a
    given index.

    * If ``col_name`` is a column of ``X_train``, only that column is set to
      NaN on the sampled rows.
    * Otherwise every column of the sampled rows is set to NaN.

    The frame is modified in place and also returned.
    """
    # Re-seed on every call (not just at import) so each call is repeatable
    # regardless of what consumed the global generator in between.
    random.seed(100)
    index = getAnIndex(X_train.index)
    sample_size = getSampleSize(X_train, perc)
    selected_index = random.sample(index, sample_size)

    # Nothing sampled: leave the frame (and its dtypes) untouched.
    if not selected_index:
        return X_train

    # One vectorised assignment instead of a Python-level loop over every
    # sampled label; ``np.nan`` is used because the ``np.NaN`` alias no longer
    # exists in NumPy 2.0.
    if col_name in X_train.columns:
        X_train.loc[selected_index, col_name] = np.nan
    else:
        X_train.loc[selected_index] = np.nan
    return X_train


In [8]:
def one_hot_encoding(X_train, X_test):
    """One-hot encode the object-dtype columns and stack them after the numeric ones.

    Column roles are decided from ``X_train``: object-dtype columns are
    encoded, int/float columns are passed through unchanged, and columns of
    any other dtype are not included in the output.

    The encoder is fitted on ``X_train`` only and then applied to ``X_test``,
    so both outputs share the same column layout. With the encoder's default
    settings, a category that appears only in ``X_test`` raises an error
    instead of silently producing a differently shaped matrix.

    Returns ``(X_train_new, X_test_new)`` as dense NumPy arrays laid out as
    ``[numeric columns | encoded categorical columns]``.
    """
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()

    # No categorical columns: the encoder cannot be fitted on zero features,
    # so return the numeric part as-is.
    if not categorical_features:
        return X_train[numerical_cols].values, X_test[numerical_cols].values

    # Use the encoder's default (sparse) output and densify explicitly; this
    # avoids the ``sparse=`` keyword, which newer scikit-learn releases no
    # longer accept.
    ohe = OneHotEncoder(drop='first')
    X_train_encoded = ohe.fit_transform(X_train[categorical_features]).toarray()
    # transform (not fit_transform): reuse the categories learned from the
    # training data so test columns line up with train columns.
    X_test_encoded = ohe.transform(X_test[categorical_features]).toarray()

    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [9]:
def modelEvaluation(X_train,y_train):
    """Fit a ``LinearRegression`` (with intercept) on the given data and return it.

    Despite the name, no metrics are computed here; the caller is expected to
    score the returned estimator.
    """
    regressor = LinearRegression(fit_intercept=True)
    regressor.fit(X_train, y_train)
    return regressor

In [10]:
# Replacement value per column, looked up by ampute_each_variables when its
# method is "bad".
# NOTE(review): several entries look deliberately implausible (e.g. hour 24,
# winddir 380.0) -- presumably to simulate a poor imputation; confirm.
fill_dict = {
    'start_lat': 40.79865054,
    'start_lon': -74.2404433,
    'end_lat': 40.79165072,
    'end_lon': -74.28593088,
    'usertype': 'agent',
    'hour': 24,
    'temp': 38.6,
    'feelslike': 49.4,
    'dew': -25.7,
    'snowdepth': 5.10,
    'winddir': 380.0,
    'sealevelpressure': 819.3,
    'visibility': 7.2,
    'solarradiation': 1109,
    'conditions': 'muddy',
    'dist': 5.345,
    'birthyear': 2003,
    'holiday': 'unknown',
    'day': 'special',
    'month': 'unknown',
    'gender': 'nonbinary',
}

In [11]:
def ampute_each_variables(variable, method, X_train, y_train, X_test, y_test):
    """Ampute one variable in train and test, then optionally impute it.

    40% of the rows of ``X_train`` and of ``X_test`` get NaN in ``variable``
    (via ``induceMissingValues``, which modifies the frames in place). When
    ``method == "bad"`` the NaNs are then filled with ``fill_dict[variable]``;
    for any other ``method`` the NaNs are left as they are.

    Side effects: prints progress and shapes, and appends the time spent in
    the imputation step (a ``datetime.timedelta``) to the module-level
    ``list_comp_time`` list, which must exist before this function is called.

    Returns ``(X_train, y_train, X_test, y_test)``; the targets are passed
    through unchanged.
    """
    X_train = induceMissingValues(X_train, variable, perc=40)
    print('Train set after Amputation', X_train.shape, y_train.shape)
    # Count nulls in the amputed column only instead of summing every column.
    print("X_train missing value count of varaible ", X_train[variable].isnull().sum())
    X_test = induceMissingValues(X_test, variable, perc=40)
    print('Test set after Amputation', X_test.shape, y_test.shape)
    print("X_test missing value count of varaible ", X_test[variable].isnull().sum())

    # Only the imputation step itself is timed.
    start = datetime.datetime.now()
    if method == "bad":
        print("Simulate Imputation for NA")
        # NOTE(review): fillna is applied to the whole frame, so a NaN in any
        # other column would also receive this variable's fill value --
        # confirm the input has no other missing values.
        X_train.fillna(fill_dict[variable], inplace=True)
        print('Train  after imputation', X_train.shape, y_train.shape)
        X_test.fillna(fill_dict[variable], inplace=True)
        print('Test  after imputation', X_test.shape, y_test.shape)
    end = datetime.datetime.now()
    comp_time = (end-start)
    # Report the function's own argument rather than a global loop variable.
    print(f"Time taken simulate for variable {variable} = ", comp_time)
    list_comp_time.append(comp_time)
    return X_train , y_train, X_test , y_test

In [12]:
# Predictor columns to ampute, one experiment per entry, in this order.
variables = [
    'start_lat', 'start_lon', 'end_lat', 'end_lon',
    'usertype', 'hour',
    'temp', 'feelslike', 'dew', 'snowdepth', 'winddir',
    'sealevelpressure', 'visibility', 'solarradiation',
    'conditions', 'dist', 'birthyear',
    'holiday', 'day', 'month', 'gender',
]

In [13]:
# One experiment per variable: re-split the data, ampute and impute that
# variable, encode, fit a linear regression and record its test-set metrics.
result = {}
# ampute_each_variables appends one timing entry to this list per call.
list_comp_time = []
for var in variables:
    print("Now running model is ....", var)
    # A fresh split every iteration: the amputation step modifies the frames
    # in place, so the previous iteration's frames cannot be reused.
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_train, y_train, X_test, y_test = ampute_each_variables(
        var, "bad", X_train, y_train, X_test, y_test
    )
    X_train, X_test = one_hot_encoding(X_train, X_test)
    model_reg = modelEvaluation(X_train, y_train)
    y_pred = model_reg.predict(X_test)
    result[var] = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
    }
    # RMSE is the square root of the MSE just stored; keys keep the order
    # MAE, MSE, RMSE, R2.
    result[var]["RMSE"] = np.sqrt(result[var]["MSE"])
    result[var]["R2"] = r2_score(y_test, y_pred)
    del X_train, X_test, y_train, y_test

Now running model is .... start_lat
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  100938
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  25235
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable start_lat =  0:00:00.263112
Now running model is .... start_lon
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  100938
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  25235
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable start_lon =  0:00:00.310771
Now running model is .... end_lat
Train (252346, 21) (252346,)
Test (63087, 21

Now running model is .... day
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  100938
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  25235
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable day =  0:00:00.286119
Now running model is .... month
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  100938
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  25235
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable month =  0:00:00.290165
Now running model is .... gender
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set 

In [14]:
# Mean duration (in seconds) of the timed imputation step, computed from the
# collected timedeltas instead of a hand-copied constant so it cannot go stale.
total_seconds = sum(t.total_seconds() for t in list_comp_time)
# Guard against an empty list (e.g. the experiment loop has not been run).
average_seconds = total_seconds / len(list_comp_time) if list_comp_time else float("nan")
print("averge time to run drop is : ", round(average_seconds, 7))
list_comp_time

averge time to run drop is :  0.2806097


[datetime.timedelta(microseconds=263112),
 datetime.timedelta(microseconds=310771),
 datetime.timedelta(microseconds=402761),
 datetime.timedelta(microseconds=259773),
 datetime.timedelta(microseconds=222528),
 datetime.timedelta(microseconds=246631),
 datetime.timedelta(microseconds=266797),
 datetime.timedelta(microseconds=250277),
 datetime.timedelta(microseconds=280586),
 datetime.timedelta(microseconds=288131),
 datetime.timedelta(microseconds=243111),
 datetime.timedelta(microseconds=214053),
 datetime.timedelta(microseconds=292930),
 datetime.timedelta(microseconds=282595),
 datetime.timedelta(microseconds=339652),
 datetime.timedelta(microseconds=292945),
 datetime.timedelta(microseconds=296645),
 datetime.timedelta(microseconds=243261),
 datetime.timedelta(microseconds=286119),
 datetime.timedelta(microseconds=290165),
 datetime.timedelta(microseconds=319961)]

In [15]:
# Display the collected metrics: one entry per amputed variable, each holding
# MAE, MSE, RMSE and R2 measured on the test split.
result

{'start_lat': {'MAE': 1.5719873230990953,
  'MSE': 5.000611805716736,
  'RMSE': 2.2362047772323392,
  'R2': 0.5271412854097159},
 'start_lon': {'MAE': 1.5742342701700125,
  'MSE': 5.010908680234609,
  'RMSE': 2.238505903551431,
  'R2': 0.5261676111798441},
 'end_lat': {'MAE': 1.5722069866954655,
  'MSE': 5.002657666430502,
  'RMSE': 2.2366621708319077,
  'R2': 0.5269478284678601},
 'end_lon': {'MAE': 1.5750196684925573,
  'MSE': 5.008754505942199,
  'RMSE': 2.238024688412127,
  'R2': 0.5263713102723727},
 'usertype': {'MAE': 1.5839424355834397,
  'MSE': 5.067412812149291,
  'RMSE': 2.2510914712977104,
  'R2': 0.52082457072314},
 'hour': {'MAE': 1.575420916216848,
  'MSE': 5.017787474197703,
  'RMSE': 2.2400418465282526,
  'R2': 0.5255171512366025},
 'temp': {'MAE': 1.5726135886452866,
  'MSE': 5.001097203130744,
  'RMSE': 2.236313306120308,
  'R2': 0.5270953861465515},
 'feelslike': {'MAE': 1.572445237001668,
  'MSE': 5.000253782099334,
  'RMSE': 2.2361247241822926,
  'R2': 0.527175140