In [1]:
import pandas as pd
import numpy as np
import random
import time
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

from sklearn.preprocessing import OneHotEncoder


In [2]:
tripdata = pd.read_csv("../../dataset/final_bike_sharing.csv")

In [3]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [4]:
df = tripdata[selected_features]

In [5]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [6]:
def getSampleSize(df, perc=1):
    return round(perc/100 * df.shape[0])

def getAnIndex(index):
    li = []
    for i in index:
        li.append(i)
    return li  
        
random.seed(100)
def induceMissingValues(X_train,col_name, perc):
    random.seed(100)
    index = getAnIndex(X_train.index)
    sample_size = getSampleSize(X_train, perc)
    selected_index = random.sample(index, sample_size)
    for i in selected_index:
        if col_name in X_train.columns:
            X_train.at[i,col_name]= np.NaN
        else:
            X_train.loc[i] = np.NaN
    return X_train


In [7]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [8]:
def modelEvaluation(X_train,y_train):
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    return model_lr

In [9]:
fill_dict ={'start_lat': 40.79865054, 'start_lon': -74.2404433, 'end_lat': 40.79165072 , 'end_lon':-74.28593088,'usertype':'agent',
            'hour': 24, 'temp': 38.6, 'feelslike':49.4,'dew':-25.7,'snowdepth':5.10,'winddir':380.0,'sealevelpressure':819.3,
            'visibility':7.2,'solarradiation': 1109,'conditions': 'muddy','dist': 5.345,'birthyear': 2003,
            'holiday':'unknown','day': 'special','month': 'unknown','gender': 'nonbinary'
        }

In [10]:
def ampute_each_variables(variable, method, X_train, y_train, X_test, y_test):
    X_train  = induceMissingValues(X_train,variable,perc=30)
    print('Train set after Amputation', X_train.shape, y_train.shape)
    print("X_train missing value count of varaible ", X_train.isnull().sum()[variable])
    X_test  = induceMissingValues(X_test,variable,perc=30)
    print('Test set after Amputation', X_test.shape, y_test.shape)
    print("X_test missing value count of varaible ", X_test.isnull().sum()[variable])
    start = datetime.datetime.now()
    if method == "bad":
        print("Simulate Imputation for NA")
        X_train.fillna(fill_dict[variable], inplace=True)
        print('Train  after imputation', X_train.shape, y_train.shape)
        X_test.fillna(fill_dict[variable], inplace=True)
        print('Test  after imputation', X_test.shape, y_test.shape)
    end = datetime.datetime.now()
    comp_time = (end-start)
    print(f"Time taken simulate for variable {var} = ", comp_time)
    list_comp_time.append(comp_time)
    return X_train , y_train, X_test , y_test

In [11]:
variables = ['start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [12]:
result = {}
list_comp_time =[]
for var in variables:
    print("Now running model is ....", var)
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_train , y_train, X_test , y_test = ampute_each_variables(var,"bad",X_train,y_train,X_test,y_test)
    X_train, X_test = one_hot_encoding(X_train,X_test)
    model_reg = modelEvaluation(X_train,y_train)
    y_pred = model_reg.predict(X_test)
    result[var] = {"MAE":mean_absolute_error(y_test, y_pred),"MSE":mean_squared_error(y_test, y_pred),
                          "RMSE":np.sqrt(mean_squared_error(y_test, y_pred)),"R2":r2_score(y_test, y_pred)}
    del X_train
    del X_test
    del y_train
    del y_test

Now running model is .... start_lat
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  75704
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  18926
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable start_lat =  0:00:00.293178
Now running model is .... start_lon
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  75704
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  18926
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable start_lon =  0:00:00.338155
Now running model is .... end_lat
Train (252346, 21) (252346,)
Test (63087, 21) 

Now running model is .... day
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  75704
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  18926
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable day =  0:00:00.356614
Now running model is .... month
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set after Amputation (252346, 21) (252346,)
X_train missing value count of varaible  75704
Test set after Amputation (63087, 21) (63087,)
X_test missing value count of varaible  18926
Simulate Imputation for NA
Train  after imputation (252346, 21) (252346,)
Test  after imputation (63087, 21) (63087,)
Time taken simulate for variable month =  0:00:00.227059
Now running model is .... gender
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
Train set af

In [13]:
print("averge time to run drop is : ", 0.288492)
list_comp_time

averge time to run drop is :  4.117488


[datetime.timedelta(microseconds=293178),
 datetime.timedelta(microseconds=338155),
 datetime.timedelta(microseconds=350231),
 datetime.timedelta(microseconds=238181),
 datetime.timedelta(microseconds=267248),
 datetime.timedelta(microseconds=340526),
 datetime.timedelta(microseconds=255516),
 datetime.timedelta(microseconds=291060),
 datetime.timedelta(microseconds=224365),
 datetime.timedelta(microseconds=258953),
 datetime.timedelta(microseconds=261051),
 datetime.timedelta(microseconds=264753),
 datetime.timedelta(microseconds=316239),
 datetime.timedelta(microseconds=287115),
 datetime.timedelta(microseconds=340512),
 datetime.timedelta(microseconds=284019),
 datetime.timedelta(microseconds=348076),
 datetime.timedelta(microseconds=254338),
 datetime.timedelta(microseconds=356614),
 datetime.timedelta(microseconds=227059),
 datetime.timedelta(microseconds=261153)]

In [14]:
result

{'start_lat': {'MAE': 1.5719921567587882,
  'MSE': 5.0006358830751045,
  'RMSE': 2.2362101607575045,
  'R2': 0.527139008650557},
 'start_lon': {'MAE': 1.5742225813608184,
  'MSE': 5.010787185332141,
  'RMSE': 2.2384787658881513,
  'R2': 0.5261790997587705},
 'end_lat': {'MAE': 1.5722141923085649,
  'MSE': 5.002721074515767,
  'RMSE': 2.236676345499225,
  'R2': 0.5269418325883841},
 'end_lon': {'MAE': 1.5750213592717242,
  'MSE': 5.008792341804309,
  'RMSE': 2.238033141355219,
  'R2': 0.5263677325067277},
 'usertype': {'MAE': 1.5801930065504648,
  'MSE': 5.044873266692602,
  'RMSE': 2.246079532583965,
  'R2': 0.5229559140279563},
 'hour': {'MAE': 1.5752289346467603,
  'MSE': 5.016312679759647,
  'RMSE': 2.239712633299113,
  'R2': 0.5256566080529598},
 'temp': {'MAE': 1.5726175223266172,
  'MSE': 5.001108619333267,
  'RMSE': 2.236315858579299,
  'R2': 0.5270943066284726},
 'feelslike': {'MAE': 1.572445776356681,
  'MSE': 5.000248836619698,
  'RMSE': 2.2361236183672175,
  'R2': 0.52717560