In [21]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression, BayesianRidge
import time
import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from itertools import combinations
from sklearn.pipeline import Pipeline

In [22]:
#!pip install category_encoders

In [23]:
tripdata = pd.read_csv("/home/jovyan/data/final_bike_sharing.csv")

In [24]:
tripdata.head()

Unnamed: 0,index,tripduration,start_station_id,start_station_name,start_lat,start_lon,end_station_id,end_station_name,end_lat,end_lon,...,conditions,date,dist,birthyear,years_old,holiday,day,month,seasons,gender
0,259617,1.02,ss_3273,Manila & 1st,40.721651,-74.042884,es_3273,Manila & 1st,40.721651,-74.042884,...,Clear,2018-09-05,1.783012e-12,1994,24,working_day,weekday,September,autumn,male
1,283363,1.02,ss_3198,Heights Elevator,40.748716,-74.040443,es_3198,Heights Elevator,40.748716,-74.040443,...,cloudy_rain,2018-10-06,1.782506e-12,1994,24,working_day,weekend,October,autumn,male
2,169168,1.02,ss_3213,Van Vorst Park,40.718489,-74.047727,es_3185,City Hall,40.717733,-74.043845,...,Clear,2018-07-09,0.3384331,1963,55,working_day,weekday,July,summer,female
3,144929,1.02,ss_3183,Exchange Place,40.716247,-74.033459,es_3183,Exchange Place,40.716247,-74.033459,...,Clear,2018-07-16,1.783112e-12,1985,33,working_day,weekday,July,summer,male
4,326659,1.02,ss_3272,Jersey & 3rd,40.723332,-74.045953,es_3211,Newark Ave,40.721525,-74.046305,...,cloudy_rain,2018-11-02,0.159532,1987,31,working_day,weekday,November,autumn,male


In [25]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [26]:
df = tripdata[selected_features]
df.shape

(315433, 22)

In [27]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [28]:
def getSampleSize(df, perc):
    return round(perc/100 * df.shape[0])

def getAnIndex(index):
    li = []
    for i in index:
        li.append(i)
    return li  


def induceMissingValues(df , cols_list, perc, name ):
    random.seed(100)
    index = getAnIndex(df.index)
    sample_size = getSampleSize(df, perc)
    print(f"number of missing values produced {sample_size} for the {name}")
    for col in cols_list:
        idx = random.sample(index, sample_size)
        for i in idx:
             df.loc[i,col] = np.NaN
    return df

In [29]:
def ampute_each_variables(col_list, X_train, X_test):
    X_train = induceMissingValues(X_train, col_list, perc=5, name="train")
    print('Train set after Amputation', X_train.shape)
    X_test = induceMissingValues(X_test, col_list, perc=5, name="test")
    print('Test set after Amputation', X_test.shape)
    return X_train , X_test 

In [30]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [31]:
def model_evaluation(X_train,y_train):
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    return model_lr

In [32]:
def run_multiple_imputation(X_train_amputed, X_test_amputed, y_train, y_test, random_state=0):
    print(f"run_multiple_imputation iteration {random_state} started" ,  datetime.datetime.now())
    impute_estimator = BayesianRidge()
    imputer = IterativeImputer(
        estimator=impute_estimator, max_iter=5, sample_posterior=True, random_state=random_state, tol=1e-1
    )
    ## Fit and tranform the missing values
    X_train_imputed = imputer.fit_transform(X_train_amputed)
    X_test_imputed = imputer.fit_transform(X_test_amputed)

    predictors = [var for var in X_train_amputed.columns]
    
    ## adjust the columns names to the imputed cols
    X_train_imputed = pd.DataFrame(X_train_imputed)
    X_test_imputed = pd.DataFrame(X_test_imputed)
    
    ## Run Linear regression on the 
    model_reg = model_evaluation(X_train_imputed,y_train)
    y_predict = model_reg.predict(X_test_imputed)
    
    print(f"run_multiple_imputation iteration {random_state} stopped" ,  datetime.datetime.now())
    return y_predict

### Multivariate simulation with 2 variables

In [13]:
results ={}
col_list = [ ['dist','birthyear'],['start_lat','start_lon'],['end_lat','end_lon'], ['hour','temp'], 
            ['feelslike', 'dew'], ['snowdepth','winddir'],['sealevelpressure','visibility'],
             ['solarradiation','dist']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}
    

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-21 15:21:07.976695
run_multiple_imputation iteration 0 stopped 2023-03-21 15:24:11.865705
run_multiple_imputation iteration 1 started 2023-03-21 15:24:12.007664
run_multiple_imputation iteration 1 stopped 2023-03-21 15:27:21.143665
run_multiple_imputation iteration 2 started 2023-03-21 15:27:21.299053
run_multiple_imputation iteration 2 stopped 2023-03-21 15:30:31.429973
run_multiple_imputation iteration 3 started 2023-03-21 15:30:31.575699
run_multiple_imputation iteration 3 stopped 2023-03-21 15:33:41.520207
run_multiple_imputation iteration 4 started 2023-03-21 15:33:41.663083
run_multiple_imputation iteration 4 stopped 2023-03-21 15:37:04.378640
run_multiple_imputation iteration 5 started 2023-03

In [14]:
results

{('dist', 'birthyear'): {'rmse': 3.054976052081298, 'r2': 0.11748138289105103},
 ('start_lat', 'start_lon'): {'rmse': 2.23846244303506,
  'r2': 0.5261860098808282},
 ('end_lat', 'end_lon'): {'rmse': 2.2386243442891107,
  'r2': 0.5261174683213605},
 ('hour', 'temp'): {'rmse': 2.241471057860043, 'r2': 0.5249114905874535},
 ('feelslike', 'dew'): {'rmse': 2.2361006393607465, 'r2': 0.5271853255209036},
 ('snowdepth', 'winddir'): {'rmse': 2.2362378861591186,
  'r2': 0.5271272831496217},
 ('sealevelpressure', 'visibility'): {'rmse': 2.235943415215413,
  'r2': 0.5272518120362011},
 ('solarradiation', 'dist'): {'rmse': 3.0553909183322245,
  'r2': 0.11724167426887044}}

### Multivariate simulation with 3 variables

In [15]:
results ={}
col_list = [ ['dist','birthyear','start_lat'],
            ['start_lat','start_lon','end_lat'],
            ['end_lat','end_lon','hour'], 
            ['hour','temp','feelslike'],
            ['feelslike', 'dew','snowdepth'],
            ['snowdepth','winddir','sealevelpressure'],
            ['sealevelpressure','visibility','solarradiation'],
            ['solarradiation','dist','birthyear']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-21 17:42:00.334648
run_multiple_imputation iteration 0 stopped 2023-03-21 17:44:38.669345
run_multiple_imputation iteration 1 started 2023-03-21 17:44:38.809365
run_multiple_imputation iteration 1 stopped 2023-03-21 17:47:23.335523
run_multiple_imputation iteration 2 started 2023-03-21 17:47:23.471622
run_multiple_imputation iteration 2 stopped 2023-03-21 17:50:05.764523
run_multiple_imputation iteration 3 started 2023-03-21 17:50:05.898072
run_multiple_imputation iteration 3 stopped 2023-03-21 17:52:46.931392
run_multiple_imputation iteration 4 started 2023-03-21 17:52:47.067243
run_multiple_imputation iteration 4 stopped 2023-03-21 17:55:28.542131
run_multiple_imputation iteration 5 started 2023-03

In [16]:
results

{('dist', 'birthyear', 'start_lat'): {'rmse': 3.0676455402971845,
  'r2': 0.11014630472699471},
 ('start_lat', 'start_lon', 'end_lat'): {'rmse': 2.239232322553954,
  'r2': 0.5258600339703094},
 ('end_lat', 'end_lon', 'hour'): {'rmse': 2.245603567053232,
  'r2': 0.523158072927989},
 ('hour', 'temp', 'feelslike'): {'rmse': 2.2437890999769063,
  'r2': 0.5239283463990414},
 ('feelslike', 'dew', 'snowdepth'): {'rmse': 2.236406396548184,
  'r2': 0.527056014374276},
 ('snowdepth', 'winddir', 'sealevelpressure'): {'rmse': 2.236342616978617,
  'r2': 0.5270829895553133},
 ('sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.237328352979867, 'r2': 0.5266659925720687},
 ('solarradiation', 'dist', 'birthyear'): {'rmse': 3.05436861408761,
  'r2': 0.11783230023613012}}

### Multivariate simulation with 4 variables

In [17]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon'],
            ['start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp'], 
            ['hour','temp','feelslike','dew'],
            ['feelslike', 'dew','snowdepth','winddir'],
            ['snowdepth','winddir','sealevelpressure','visibility'],
            ['sealevelpressure','visibility','solarradiation','dist'],
            ['solarradiation','dist','birthyear','start_lat']]
results ={}
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-21 19:54:09.768638
run_multiple_imputation iteration 0 stopped 2023-03-21 19:56:45.626555
run_multiple_imputation iteration 1 started 2023-03-21 19:56:45.758520
run_multiple_imputation iteration 1 stopped 2023-03-21 19:59:19.236625
run_multiple_imputation iteration 2 started 2023-03-21 19:59:19.373587
run_multiple_imputation iteration 2 stopped 2023-03-21 20:01:59.551319
run_multiple_imputation iteration 3 started 2023-03-21 20:01:59.687936
run_multiple_imputation iteration 3 stopped 2023-03-21 20:04:40.306415
run_multiple_imputation iteration 4 started 2023-03-21 20:04:40.445247
run_multiple_imputation iteration 4 stopped 2023-03-21 20:07:21.254186
run_multiple_imputation iteration 5 started 2023-03

In [18]:
results

{('dist', 'birthyear', 'start_lat', 'start_lon'): {'rmse': 3.079584743473147,
  'r2': 0.10320624702972125},
 ('start_lat', 'start_lon', 'end_lat', 'end_lon'): {'rmse': 2.2397990411964517,
  'r2': 0.5256200071023354},
 ('end_lat', 'end_lon', 'hour', 'temp'): {'rmse': 2.2461494986432013,
  'r2': 0.5229261934288916},
 ('hour', 'temp', 'feelslike', 'dew'): {'rmse': 2.244279965314503,
  'r2': 0.5237200267896891},
 ('feelslike', 'dew', 'snowdepth', 'winddir'): {'rmse': 2.236440002225727,
  'r2': 0.5270418007478839},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.2363015476985484, 'r2': 0.5271003591503503},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 3.0525646451792308, 'r2': 0.11887404296381332},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 3.0656494381495083, 'r2': 0.11130397496881095}}

### Multivaraite simulation with 5 variables 

In [None]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat'],
            ['start_lat','start_lon','end_lat','end_lon','hour'],
            ['end_lat','end_lon','hour','temp','feelslike'],
            ['hour','temp','feelslike','dew','snowdepth']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-21 22:04:49.904910
run_multiple_imputation iteration 0 stopped 2023-03-21 22:07:27.744854
run_multiple_imputation iteration 1 started 2023-03-21 22:07:27.873217
run_multiple_imputation iteration 1 stopped 2023-03-21 22:10:03.735942
run_multiple_imputation iteration 2 started 2023-03-21 22:10:03.864072
run_multiple_imputation iteration 2 stopped 2023-03-21 22:12:37.993814
run_multiple_imputation iteration 3 started 2023-03-21 22:12:38.118034
run_multiple_imputation iteration 3 stopped 2023-03-21 22:15:09.100485
run_multiple_imputation iteration 4 started 2023-03-21 22:15:09.228522
run_multiple_imputation iteration 4 stopped 2023-03-21 22:17:48.041702
run_multiple_imputation iteration 5 started 2023-03

In [17]:
results ={}
col_list = [
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['solarradiation','dist','birthyear','start_lat','start_lon']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-22 13:07:04.889081
run_multiple_imputation iteration 0 stopped 2023-03-22 13:09:13.310827
run_multiple_imputation iteration 1 started 2023-03-22 13:09:13.431930
run_multiple_imputation iteration 1 stopped 2023-03-22 13:11:20.842825
run_multiple_imputation iteration 2 started 2023-03-22 13:11:20.963218
run_multiple_imputation iteration 2 stopped 2023-03-22 13:13:29.612351
run_multiple_imputation iteration 3 started 2023-03-22 13:13:29.731979
run_multiple_imputation iteration 3 stopped 2023-03-22 13:15:38.545278
run_multiple_imputation iteration 4 started 2023-03-22 13:15:38.665783
run_multiple_imputation iteration 4 stopped 2023-03-22 13:17:47.468169
run_multiple_imputation iteration 5 started 2023-03

In [18]:
results

{('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.2364786712321494, 'r2': 0.527025445306435},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.237654259913374, 'r2': 0.5265280833820725},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 3.0541433355135297, 'r2': 0.1179624260838581},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 3.082456754080198, 'r2': 0.10153277324406162}}

### Multivaraite simulation with 6 variables 

In [33]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew'],
            ['hour','temp','feelslike','dew','snowdepth','winddir'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-22 16:13:58.187435
run_multiple_imputation iteration 0 stopped 2023-03-22 16:16:09.455224
run_multiple_imputation iteration 1 started 2023-03-22 16:16:09.573525
run_multiple_imputation iteration 1 stopped 2023-03-22 16:18:21.887546
run_multiple_imputation iteration 2 started 2023-03-22 16:18:22.005835
run_multiple_imputation iteration 2 stopped 2023-03-22 16:20:33.166084
run_multiple_imputation iteration 3 started 2023-03-22 16:20:33.414568
run_multiple_imputation iteration 3 stopped 2023-03-22 16:22:46.779027
run_multiple_imputation iteration 4 started 2023-03-22 16:22:46.896533
run_multiple_imputation iteration 4 stopped 2023-03-22 16:25:01.434838
run_multiple_imputation iteration 5 started 2023-03

In [34]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lon'): {'rmse': 2.9805951064830047, 'r2': 0.15993242199345126},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew'): {'rmse': 2.249158996925762, 'r2': 0.5216469244866804},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir'): {'rmse': 2.2444509647468984, 'r2': 0.5236474451964177},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.2363554546073456, 'r2': 0.5270775600201616},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 3.053860334999093, 'r2': 0.1181258797890995},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 3.0695919930971423, 'r2': 0.10901670373602179},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 3.0684759678671254, 'r2': 0.10966446348941761}}

### Multivaraite simulation with 7 variables 

In [35]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew','snowdepth'],
            ['hour','temp','feelslike','dew','snowdepth','winddir','sealevelpressure'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat','start_lon'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat','end_lat']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-22 17:56:15.347032
run_multiple_imputation iteration 0 stopped 2023-03-22 17:58:37.120587
run_multiple_imputation iteration 1 started 2023-03-22 17:58:37.239098
run_multiple_imputation iteration 1 stopped 2023-03-22 18:00:50.376857
run_multiple_imputation iteration 2 started 2023-03-22 18:00:50.495390
run_multiple_imputation iteration 2 stopped 2023-03-22 18:02:59.886046
run_multiple_imputation iteration 3 started 2023-03-22 18:03:00.005169
run_multiple_imputation iteration 3 stopped 2023-03-22 18:05:10.621813
run_multiple_imputation iteration 4 started 2023-03-22 18:05:10.740484
run_multiple_imputation iteration 4 stopped 2023-03-22 18:07:26.092225
run_multiple_imputation iteration 5 started 2023-03

In [36]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lat',
  'end_lon'): {'rmse': 2.976810370580599, 'r2': 0.16206448971784226},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth'): {'rmse': 2.249909520721311, 'r2': 0.5213276271258355},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.2444056700286916, 'r2': 0.5236666713121702},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2379909336324855, 'r2': 0.5263855970776736},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 3.0568151318608274, 'r2': 0.11641851978405882},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 3.082098433884245, 'r2': 0.10174164575487599},
 ('solarradiation',
  'dist',
  'birthyear',
  'sta

### with all the variables

In [37]:
results ={}
cols = ['start_lat','start_lon','end_lat',
                     'end_lon','hour','temp', 'feelslike', 'dew','snowdepth',
                     'winddir','sealevelpressure','visibility','solarradiation','dist','birthyear']
multiple_predictions = []
X_train, X_test, y_train, y_test = split_dataset(df)
X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
X_train_encoded = pd.DataFrame(X_train_encoded)
X_test_encoded = pd.DataFrame(X_test_encoded)
for i in range(6):
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=i
    )
    multiple_predictions.append(y_predict)
    del X_train_data
    del X_test_data
predictions_average = np.mean(multiple_predictions, axis=0)
rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
r2 = r2_score(y_test, predictions_average)
print("results are saving for the columns", cols, " = ", rmse, r2)
results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-22 19:41:01.515508
run_multiple_imputation iteration 0 stopped 2023-03-22 19:43:31.599569
run_multiple_imputation iteration 1 started 2023-03-22 19:43:31.702847
run_multiple_imputation iteration 1 stopped 2023-03-22 19:45:58.296295
run_multiple_imputation iteration 2 started 2023-03-22 19:45:58.400341
run_multiple_imputation iteration 2 stopped 2023-03-22 19:48:22.078127
run_multiple_imputation iteration 3 started 2023-03-22 19:48:22.187505
run_multiple_imputation iteration 3 stopped 2023-03-22 19:50:56.309328
run_multiple_imputation iteration 4 started 2023-03-22 19:50:56.411934
run_multiple_imputation iteration 4 stopped 2023-03-22 19:53:22.366527
run_multiple_imputation iteration 5 started 2023-03

In [38]:
results

{('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 2.9820866047956875, 'r2': 0.15909146720248546}}

### Test Ignore

In [None]:
# # Execute the simulation
# print("Executing MCAR Missingness...MI", datetime.datetime.now())

# X = df.drop('tripduration', axis=1)
# y = df['tripduration']
# cols = ['dist','birthyear']
# rmse, r2 = perform_simulation(X, y, cols, n_simulation=N_SIM)
# print(rmse, r2)
# print("stooped MCAR Missingness...MI", datetime.datetime.now())

In [None]:
# # Plot results
# n_situations = 3
# n = np.arange(n_situations)
# n_labels = ["Full Data", "Single Imputation", "Multiple Imputations"]
# colors = ["r", "orange", "green"]

# plt.figure(figsize=(12, 6))
# ax1 = plt.subplot(111)
# for j in n:
#     ax1.barh(
#         j,
#         mse_means[j],
#         xerr=mse_std[j],
#         color=colors[j],
#         alpha=0.6,
#         align="center",
#     )

# ax1.set_title("MCAR Missingness")
# ax1.set_yticks(n)
# ax1.set_xlabel("Mean Squared Error")
# ax1.invert_yaxis()
# ax1.set_yticklabels(n_labels)
# plt.show()