In [1]:
import pandas as pd
import numpy as np
import random
import time
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

from sklearn.preprocessing import OneHotEncoder


In [2]:
# Relative location of the prepared bike-sharing CSV, kept in one named place.
DATA_PATH = "../../dataset/final_bike_sharing.csv"
tripdata = pd.read_csv(DATA_PATH)

In [3]:
# Columns kept for modelling. 'tripduration' comes first; split_dataset uses it
# as the regression target and treats every other column as a predictor.
selected_features = [
    'tripduration',
    'start_lat',
    'start_lon',
    'end_lat',
    'end_lon',
    'usertype',
    'hour',
    'temp',
    'feelslike',
    'dew',
    'snowdepth',
    'winddir',
    'sealevelpressure',
    'visibility',
    'solarradiation',
    'conditions',
    'dist',
    'birthyear',
    'holiday',
    'day',
    'month',
    'gender',
]

In [4]:
# Working frame: every row of the raw data, restricted to the selected columns.
df = tripdata.loc[:, selected_features]

In [5]:
def split_dataset(df, target='tripduration', test_size=0.2, random_state=42):
    """Split a frame into train/test predictors and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target column and all predictor columns.
    target : str, default 'tripduration'
        Name of the column to predict; it is removed from the predictors.
    test_size : float, default 0.2
        Forwarded to ``train_test_split`` as the held-out fraction.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    X = df.drop(columns=target)
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Shapes are printed so each run of the notebook shows the split sizes.
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [6]:
def getSampleSize(df, perc=1):
    """Return ``perc`` percent of the number of rows in ``df``, rounded.

    ``df`` only needs a ``shape`` attribute whose first entry is the row
    count. Rounding uses the built-in ``round``, so exact halves go to the
    nearest even integer.
    """
    fraction = perc / 100
    n_rows = df.shape[0]
    return round(fraction * n_rows)

def getAnIndex(index):
    """Return the items of ``index`` as a new list, in iteration order."""
    return list(index)
        
random.seed(100)
def induceMissingValues(X_train, col_name, perc, seed=100):
    """Blank out a reproducible random sample of rows, in place.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame to modify. It is mutated and also returned.
    col_name : str
        Column to set to NaN on the sampled rows. If the frame has no such
        column, every column of the sampled rows is set to NaN instead.
    perc : float
        Percentage of rows to blank; the count comes from ``getSampleSize``.
    seed : int, default 100
        Seed applied to the ``random`` module before sampling, so the same
        frame always loses the same rows.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    ValueError
        From ``random.sample`` if the requested sample is larger than the
        number of rows.
    """
    random.seed(seed)
    index = getAnIndex(X_train.index)
    sample_size = getSampleSize(X_train, perc)
    selected_index = random.sample(index, sample_size)
    if not selected_index:
        # Nothing was sampled, so the frame is returned untouched.
        return X_train

    if col_name in X_train.columns:
        col_dtype = X_train[col_name].dtype
        if isinstance(col_dtype, np.dtype) and col_dtype.kind in "iu":
            # Integer columns cannot hold NaN. Cast explicitly rather than
            # rely on implicit upcasting during assignment, which recent
            # pandas releases deprecate.
            X_train[col_name] = X_train[col_name].astype("float64")
        # One vectorised assignment replaces the per-row loop. `np.nan` is
        # used because the `np.NaN` alias no longer exists in NumPy 2.
        X_train.loc[selected_index, col_name] = np.nan
    else:
        X_train.loc[selected_index] = np.nan
    return X_train


In [7]:
def one_hot_encoding(X_train, X_test):
    """One-hot encode the object-dtype columns of both frames.

    The encoder is fitted on ``X_train`` only and then applied to ``X_test``,
    so both outputs share the same columns in the same order. Re-fitting on
    the test frame would derive a separate category set from it, and the
    test columns could then differ in number and meaning.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames with identical columns. Column roles are decided from the
        dtypes of ``X_train``: ``object`` columns are encoded, ``int`` and
        ``float`` columns pass through, and any other dtype is left out.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train_new, X_test_new)``: numeric columns first, followed by
        the encoded categorical columns (first level of each dropped).
    """
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    try:
        # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
        ohe = OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only know the original `sparse` keyword.
        ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    # Reuse the categories learned from the training data; do not re-fit.
    X_test_encoded = ohe.transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [8]:
def modelEvaluation(X_train, y_train):
    """Fit an ordinary least-squares model with an intercept and return it."""
    regressor = LinearRegression(fit_intercept=True)
    fitted = regressor.fit(X_train, y_train)
    return fitted

In [9]:
# Constant replacement value per feature, keyed by column name.
# ampute_each_variables looks up fill_dict[variable] when method == "bad".
# NOTE(review): several values look deliberately out of range (e.g. hour 24,
# winddir 380.0), presumably to mimic a poor imputation. Confirm intent.
fill_dict = {
    'start_lat': 40.79865054,
    'start_lon': -74.2404433,
    'end_lat': 40.79165072,
    'end_lon': -74.28593088,
    'usertype': 'agent',
    'hour': 24,
    'temp': 38.6,
    'feelslike': 49.4,
    'dew': -25.7,
    'snowdepth': 5.10,
    'winddir': 380.0,
    'sealevelpressure': 819.3,
    'visibility': 7.2,
    'solarradiation': 1109,
    'conditions': 'muddy',
    'dist': 5.345,
    'birthyear': 2003,
    'holiday': 'unknown',
    'day': 'special',
    'month': 'unknown',
    'gender': 'nonbinary',
}

In [10]:
def ampute_each_variables(variable, method, X_train, y_train, X_test, y_test):
    """Remove 1% of one variable's values, then optionally impute them.

    Both predictor frames are modified in place: ``induceMissingValues``
    blanks 1% of the rows of ``variable``. When ``method == "bad"`` the
    missing entries are replaced with ``fill_dict[variable]``. Any other
    ``method`` leaves the induced NaNs in place.

    The duration of the imputation step is printed and appended to the
    module-level list ``list_comp_time``, which must exist before the call.

    Parameters
    ----------
    variable : str
        Column to ampute; must be a column of both frames and a key of
        ``fill_dict``.
    method : str
        Imputation strategy; only ``"bad"`` is implemented.
    X_train, X_test : pandas.DataFrame
        Predictor frames (mutated).
    y_train, y_test
        Targets; only their ``shape`` is read, and they are returned as is.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    X_train = induceMissingValues(X_train, variable, perc=1)
    print('Train set after Amputation', X_train.shape, y_train.shape)
    print("X_train missing value count of varaible ", X_train.isnull().sum()[variable])
    X_test = induceMissingValues(X_test, variable, perc=1)
    print('Test set after Amputation', X_test.shape, y_test.shape)
    print("X_test missing value count of varaible ", X_test.isnull().sum()[variable])

    start = datetime.datetime.now()
    if method == "bad":
        print("Simulate Imputation for NA")
        # NOTE: fillna runs over the whole frame, so a NaN already present in
        # another column would also receive this variable's fill value.
        X_train.fillna(fill_dict[variable], inplace=True)
        print('Train  after imputation', X_train.shape, y_train.shape)
        X_test.fillna(fill_dict[variable], inplace=True)
        print('Test  after imputation', X_test.shape, y_test.shape)
    end = datetime.datetime.now()
    comp_time = end - start
    # Report the function's own argument, not a global loop variable that
    # happens to be in scope at the call site.
    print(f"Time taken simulate for variable {variable} = ", comp_time)
    list_comp_time.append(comp_time)

    return X_train, y_train, X_test, y_test

In [11]:
# Predictor columns, listed in the order the experiment loop processes them.
variables = [
    'start_lat',
    'start_lon',
    'end_lat',
    'end_lon',
    'usertype',
    'hour',
    'temp',
    'feelslike',
    'dew',
    'snowdepth',
    'winddir',
    'sealevelpressure',
    'visibility',
    'solarradiation',
    'conditions',
    'dist',
    'birthyear',
    'holiday',
    'day',
    'month',
    'gender',
]

In [12]:
# For each predictor: re-split the data, ampute and impute that one column,
# encode, fit a linear model, and record its error metrics on the test split.
result = {}
list_comp_time = []
for var in variables:
    print("Now running model is ....", var)
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_train, y_train, X_test, y_test = ampute_each_variables(
        var, "bad", X_train, y_train, X_test, y_test
    )
    X_train, X_test = one_hot_encoding(X_train, X_test)
    model_reg = modelEvaluation(X_train, y_train)
    y_pred = model_reg.predict(X_test)

    # MSE is computed once and reused for RMSE.
    metrics = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
    }
    metrics["RMSE"] = np.sqrt(metrics["MSE"])
    metrics["R2"] = r2_score(y_test, y_pred)
    result[var] = metrics

    # Drop the per-iteration splits before the next variable is processed.
    del X_train, X_test, y_train, y_test

Now running model is .... start_lat
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  2523
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  631
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable start_lat =  0:00:00.267404
Now running model is .... start_lon
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  2523
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  631
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable start_lon =  0:00:00.325819
Now running model is .... end_lat
Train (252346, 21) (252346,)
Test (63087, 21) (63087

Now running model is .... day
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  2523
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  631
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable day =  0:00:00.243764
Now running model is .... month
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  2523
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  631
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable month =  0:00:00.282145
Now running model is .... gender
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Am

In [15]:
# Mean duration of the imputation step in seconds, computed from the recorded
# timings so the figure always matches the run that produced them.
average_seconds = (
    sum(delta.total_seconds() for delta in list_comp_time) / len(list_comp_time)
    if list_comp_time
    else float("nan")  # no timings recorded yet
)
print("averge time to bad imputation: ", average_seconds)
list_comp_time

averge time to bad imputation:  273139.4286


[datetime.timedelta(microseconds=267404),
 datetime.timedelta(microseconds=325819),
 datetime.timedelta(microseconds=262101),
 datetime.timedelta(microseconds=238389),
 datetime.timedelta(microseconds=269278),
 datetime.timedelta(microseconds=261952),
 datetime.timedelta(microseconds=264732),
 datetime.timedelta(microseconds=241984),
 datetime.timedelta(microseconds=263173),
 datetime.timedelta(microseconds=268972),
 datetime.timedelta(microseconds=323658),
 datetime.timedelta(microseconds=355553),
 datetime.timedelta(microseconds=226678),
 datetime.timedelta(microseconds=255075),
 datetime.timedelta(microseconds=344168),
 datetime.timedelta(microseconds=271219),
 datetime.timedelta(microseconds=302588),
 datetime.timedelta(microseconds=231598),
 datetime.timedelta(microseconds=243764),
 datetime.timedelta(microseconds=282145),
 datetime.timedelta(microseconds=235678)]

In [14]:
# Show the metrics collected by the experiment loop: one entry per amputated
# variable, each holding MAE, MSE, RMSE and R2 on the test split.
result

{'start_lat': {'MAE': 1.572068318029329,
  'MSE': 5.000193979554452,
  'RMSE': 2.236111352226103,
  'R2': 0.527180795123666},
 'start_lon': {'MAE': 1.574046202418556,
  'MSE': 5.0095557716545045,
  'RMSE': 2.23820369306605,
  'R2': 0.5262955424484571},
 'end_lat': {'MAE': 1.5722213439198123,
  'MSE': 5.0013643606750104,
  'RMSE': 2.236373037012164,
  'R2': 0.5270701236831004},
 'end_lon': {'MAE': 1.5747605426102627,
  'MSE': 5.0078142645116,
  'RMSE': 2.2378146179948866,
  'R2': 0.5264602196641734},
 'usertype': {'MAE': 1.572401457838678,
  'MSE': 4.99975412509591,
  'RMSE': 2.236012997523921,
  'R2': 0.5272223878371087},
 'hour': {'MAE': 1.572536647422998,
  'MSE': 5.001007246092532,
  'RMSE': 2.236293193231275,
  'R2': 0.5271038924995959},
 'temp': {'MAE': 1.5725548025737952,
  'MSE': 5.000781591144171,
  'RMSE': 2.2362427397633224,
  'R2': 0.5271252304704219},
 'feelslike': {'MAE': 1.5724438321846028,
  'MSE': 5.000255971069955,
  'RMSE': 2.2361252136385286,
  'R2': 0.52717493319527