In [13]:
import pandas as pd
import numpy as np
import random
import time
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

from sklearn.preprocessing import OneHotEncoder


In [14]:
tripdata = pd.read_csv("../../dataset/final_bike_sharing.csv")

In [15]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [16]:
df = tripdata[selected_features]

In [17]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [18]:
def getSampleSize(df, perc=1):
    return round(perc/100 * df.shape[0])

def getAnIndex(index):
    li = []
    for i in index:
        li.append(i)
    return li  
        
random.seed(100)
def induceMissingValues(X_train,col_name, perc):
    random.seed(100)
    index = getAnIndex(X_train.index)
    sample_size = getSampleSize(X_train, perc)
    selected_index = random.sample(index, sample_size)
    for i in selected_index:
        if col_name in X_train.columns:
            X_train.at[i,col_name]= np.NaN
        else:
            X_train.loc[i] = np.NaN
    return X_train


In [19]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [20]:
def modelEvaluation(X_train,y_train):
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    return model_lr

In [21]:
fill_dict ={'start_lat': 40.79865054, 'start_lon': -74.2404433, 'end_lat': 40.79165072 , 'end_lon':-74.28593088,'usertype':'agent',
            'hour': 24, 'temp': 38.6, 'feelslike':49.4,'dew':-25.7,'snowdepth':5.10,'winddir':380.0,'sealevelpressure':819.3,
            'visibility':7.2,'solarradiation': 1109,'conditions': 'muddy','dist': 5.345,'birthyear': 2003,
            'holiday':'unknown','day': 'special','month': 'unknown','gender': 'nonbinary'
        }

In [22]:
def ampute_each_variables(variable, method, X_train, y_train, X_test, y_test):
    X_train  = induceMissingValues(X_train,variable,perc=20)
    print('Train set after Amputation', X_train.shape, y_train.shape)
    print("X_train missing value count of varaible ", X_train.isnull().sum()[variable])
    X_test  = induceMissingValues(X_test,variable,perc=20)
    print('Test set after Amputation', X_test.shape, y_test.shape)
    print("X_test missing value count of varaible ", X_test.isnull().sum()[variable])
    start = datetime.datetime.now()
    if method == "bad":
        print("Simulate Imputation for NA")
        X_train.fillna(fill_dict[variable], inplace=True)
        print('Train  after imputation', X_train.shape, y_train.shape)
        X_test.fillna(fill_dict[variable], inplace=True)
        print('Test  after imputation', X_test.shape, y_test.shape)
    end = datetime.datetime.now()
    comp_time = (end-start)
    print(f"Time taken simulate for variable {var} = ", comp_time)
    list_comp_time.append(comp_time)
    return X_train , y_train, X_test , y_test

In [23]:
variables = ['start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [24]:
result = {}
list_comp_time =[]
for var in variables:
    print("Now running model is ....", var)
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_train , y_train, X_test , y_test = ampute_each_variables(var,"bad",X_train,y_train,X_test,y_test)
    X_train, X_test = one_hot_encoding(X_train,X_test)
    model_reg = modelEvaluation(X_train,y_train)
    y_pred = model_reg.predict(X_test)
    result[var] = {"MAE":mean_absolute_error(y_test, y_pred),"MSE":mean_squared_error(y_test, y_pred),
                          "RMSE":np.sqrt(mean_squared_error(y_test, y_pred)),"R2":r2_score(y_test, y_pred)}
    del X_train
    del X_test
    del y_train
    del y_test

Now running model is .... start_lat
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  50469
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  12617
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable start_lat =  0:00:00.230152
Now running model is .... start_lon
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  50469
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  12617
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable start_lon =  0:00:00.284337
Now running model is .... end_lat
Train (252346, 21) (252346,)
Test (63087, 21) 

Now running model is .... day
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  50469
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  12617
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable day =  0:00:00.381851
Now running model is .... month
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  50469
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  12617
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable month =  0:00:00.266049
Now running model is .... gender
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set af

In [25]:
print("averge time to run drop is : ", 0.299317)
list_comp_time

averge time to run drop is :  4.117488


[datetime.timedelta(microseconds=230152),
 datetime.timedelta(microseconds=284337),
 datetime.timedelta(microseconds=295180),
 datetime.timedelta(microseconds=336405),
 datetime.timedelta(microseconds=363094),
 datetime.timedelta(microseconds=262034),
 datetime.timedelta(microseconds=409922),
 datetime.timedelta(microseconds=236627),
 datetime.timedelta(microseconds=271112),
 datetime.timedelta(microseconds=230315),
 datetime.timedelta(microseconds=277880),
 datetime.timedelta(microseconds=348480),
 datetime.timedelta(microseconds=297844),
 datetime.timedelta(microseconds=303623),
 datetime.timedelta(microseconds=285001),
 datetime.timedelta(microseconds=260085),
 datetime.timedelta(microseconds=303753),
 datetime.timedelta(microseconds=280943),
 datetime.timedelta(microseconds=381851),
 datetime.timedelta(microseconds=266049),
 datetime.timedelta(microseconds=360986)]

In [26]:
result

{'start_lat': {'MAE': 1.572026336743283,
  'MSE': 5.000675401118596,
  'RMSE': 2.236218996681362,
  'R2': 0.5271352718175504},
 'start_lon': {'MAE': 1.5741946083500888,
  'MSE': 5.010777757087644,
  'RMSE': 2.2384766599381027,
  'R2': 0.5261799912951959},
 'end_lat': {'MAE': 1.5722607936398731,
  'MSE': 5.002726593510927,
  'RMSE': 2.236677579248052,
  'R2': 0.5269413107112502},
 'end_lon': {'MAE': 1.5750357659002674,
  'MSE': 5.008806464380441,
  'RMSE': 2.238036296484139,
  'R2': 0.5263663970734935},
 'usertype': {'MAE': 1.5780169917598579,
  'MSE': 5.032049866430404,
  'RMSE': 2.2432230977837233,
  'R2': 0.5241684969678533},
 'hour': {'MAE': 1.5751026172029459,
  'MSE': 5.013779385750105,
  'RMSE': 2.239147021914842,
  'R2': 0.525896156771311},
 'temp': {'MAE': 1.572623314295272,
  'MSE': 5.001094147001354,
  'RMSE': 2.2363126228238652,
  'R2': 0.5270956751346736},
 'feelslike': {'MAE': 1.57244147023324,
  'MSE': 5.000244793086493,
  'RMSE': 2.236122714228021,
  'R2': 0.527175990187