In [20]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression, BayesianRidge
import time
import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from itertools import combinations
from sklearn.pipeline import Pipeline

In [21]:
#!pip install category_encoders

In [22]:
tripdata = pd.read_csv("/home/jovyan/data/final_bike_sharing.csv")

In [23]:
tripdata.head()

Unnamed: 0,index,tripduration,start_station_id,start_station_name,start_lat,start_lon,end_station_id,end_station_name,end_lat,end_lon,...,conditions,date,dist,birthyear,years_old,holiday,day,month,seasons,gender
0,259617,1.02,ss_3273,Manila & 1st,40.721651,-74.042884,es_3273,Manila & 1st,40.721651,-74.042884,...,Clear,2018-09-05,1.783012e-12,1994,24,working_day,weekday,September,autumn,male
1,283363,1.02,ss_3198,Heights Elevator,40.748716,-74.040443,es_3198,Heights Elevator,40.748716,-74.040443,...,cloudy_rain,2018-10-06,1.782506e-12,1994,24,working_day,weekend,October,autumn,male
2,169168,1.02,ss_3213,Van Vorst Park,40.718489,-74.047727,es_3185,City Hall,40.717733,-74.043845,...,Clear,2018-07-09,0.3384331,1963,55,working_day,weekday,July,summer,female
3,144929,1.02,ss_3183,Exchange Place,40.716247,-74.033459,es_3183,Exchange Place,40.716247,-74.033459,...,Clear,2018-07-16,1.783112e-12,1985,33,working_day,weekday,July,summer,male
4,326659,1.02,ss_3272,Jersey & 3rd,40.723332,-74.045953,es_3211,Newark Ave,40.721525,-74.046305,...,cloudy_rain,2018-11-02,0.159532,1987,31,working_day,weekday,November,autumn,male


In [24]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [25]:
df = tripdata[selected_features]
df.shape

(315433, 22)

In [26]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [27]:
def getSampleSize(df, perc):
    return round(perc/100 * df.shape[0])

def getAnIndex(index):
    li = []
    for i in index:
        li.append(i)
    return li  


def induceMissingValues(df , cols_list, perc, name ):
    random.seed(100)
    index = getAnIndex(df.index)
    sample_size = getSampleSize(df, perc)
    print(f"number of missing values produced {sample_size} for the {name}")
    for col in cols_list:
        idx = random.sample(index, sample_size)
        for i in idx:
             df.loc[i,col] = np.NaN
    return df

In [28]:
def ampute_each_variables(col_list, X_train, X_test):
    X_train = induceMissingValues(X_train, col_list, perc=1, name="train")
    print('Train set after Amputation', X_train.shape)
    X_test = induceMissingValues(X_test, col_list, perc=1, name="test")
    print('Test set after Amputation', X_test.shape)
    return X_train , X_test 

In [29]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [30]:
def model_evaluation(X_train,y_train):
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    return model_lr

In [31]:
def run_multiple_imputation(X_train_amputed, X_test_amputed, y_train, y_test, random_state=0):
    print(f"run_multiple_imputation iteration {random_state} started" ,  datetime.datetime.now())
    impute_estimator = BayesianRidge()
    imputer = IterativeImputer(
        estimator=impute_estimator, max_iter=5, sample_posterior=True, random_state=random_state, tol=1e-1
    )
    ## Fit and tranform the missing values
    X_train_imputed = imputer.fit_transform(X_train_amputed)
    X_test_imputed = imputer.fit_transform(X_test_amputed)

    predictors = [var for var in X_train_amputed.columns]
    
    ## adjust the columns names to the imputed cols
    X_train_imputed = pd.DataFrame(X_train_imputed)
    X_test_imputed = pd.DataFrame(X_test_imputed)
    
    ## Run Linear regression on the 
    model_reg = model_evaluation(X_train_imputed,y_train)
    y_predict = model_reg.predict(X_test_imputed)
    
    print(f"run_multiple_imputation iteration {random_state} stopped" ,  datetime.datetime.now())
    return y_predict

### Multivariate simulation with 2 variables

In [32]:
results ={}
col_list = [ ['dist','birthyear'],['start_lat','start_lon'],['end_lat','end_lon'], ['hour','temp'], 
            ['feelslike', 'dew'], ['snowdepth','winddir'],['sealevelpressure','visibility'],
             ['solarradiation','dist']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}
    

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-20 17:33:25.836133
run_multiple_imputation iteration 0 stopped 2023-03-20 17:36:19.307880
run_multiple_imputation iteration 1 started 2023-03-20 17:36:19.422142
run_multiple_imputation iteration 1 stopped 2023-03-20 17:39:23.726313
run_multiple_imputation iteration 2 started 2023-03-20 17:39:23.919316
run_multiple_imputation iteration 2 stopped 2023-03-20 17:43:10.030601
run_multiple_imputation iteration 3 started 2023-03-20 17:43:10.221378
run_multiple_imputation iteration 3 stopped 2023-03-20 17:46:52.328291
run_multiple_imputation iteration 4 started 2023-03-20 17:46:52.479000
run_multiple_imputation iteration 4 stopped 2023-03-20 17:50:31.996736
run_multiple_imputation iteration 5 started 2023-03-2

In [33]:
results

{('dist', 'birthyear'): {'rmse': 2.8728345362278267,
  'r2': 0.21957803947552057},
 ('start_lat', 'start_lon'): {'rmse': 2.237934336107128,
  'r2': 0.52640955165571},
 ('end_lat', 'end_lon'): {'rmse': 2.237639728672211, 'r2': 0.5265342327791334},
 ('hour', 'temp'): {'rmse': 2.2400652783313553, 'r2': 0.5255072245945135},
 ('feelslike', 'dew'): {'rmse': 2.2361214870766584, 'r2': 0.5271765091449416},
 ('snowdepth', 'winddir'): {'rmse': 2.236083884463664,
  'r2': 0.5271924110083737},
 ('sealevelpressure', 'visibility'): {'rmse': 2.235965497169731,
  'r2': 0.5272424743621189},
 ('solarradiation', 'dist'): {'rmse': 2.879925773245274,
  'r2': 0.21572053413210102}}

### Multivariate simulation with 3 variables

In [34]:
results ={}
col_list = [ ['dist','birthyear','start_lat'],
            ['start_lat','start_lon','end_lat'],
            ['end_lat','end_lon','hour'], 
            ['hour','temp','feelslike'],
            ['feelslike', 'dew','snowdepth'],
            ['snowdepth','winddir','sealevelpressure'],
            ['sealevelpressure','visibility','solarradiation'],
            ['solarradiation','dist','birthyear']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-20 21:12:43.441635
run_multiple_imputation iteration 0 stopped 2023-03-20 21:17:35.649829
run_multiple_imputation iteration 1 started 2023-03-20 21:17:35.891106
run_multiple_imputation iteration 1 stopped 2023-03-20 21:22:24.774257
run_multiple_imputation iteration 2 started 2023-03-20 21:22:24.914497
run_multiple_imputation iteration 2 stopped 2023-03-20 21:27:39.175315
run_multiple_imputation iteration 3 started 2023-03-20 21:27:39.323468
run_multiple_imputation iteration 3 stopped 2023-03-20 21:32:17.681140
run_multiple_imputation iteration 4 started 2023-03-20 21:32:17.828090
run_multiple_imputation iteration 4 stopped 2023-03-20 21:37:14.925908
run_multiple_imputation iteration 5 started 2023-03-2

In [35]:
results

{('dist', 'birthyear', 'start_lat'): {'rmse': 2.871918501085032,
  'r2': 0.22007565251734806},
 ('start_lat', 'start_lon', 'end_lat'): {'rmse': 2.2379551660815666,
  'r2': 0.5264007355589337},
 ('end_lat', 'end_lon', 'hour'): {'rmse': 2.24230021720317,
  'r2': 0.5245599384808917},
 ('hour', 'temp', 'feelslike'): {'rmse': 2.24095831947017,
  'r2': 0.5251288195198087},
 ('feelslike', 'dew', 'snowdepth'): {'rmse': 2.2361236220856413,
  'r2': 0.5271756062576092},
 ('snowdepth', 'winddir', 'sealevelpressure'): {'rmse': 2.2362103270347684,
  'r2': 0.527138938329765},
 ('sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.236770638140198, 'r2': 0.526901945866047},
 ('solarradiation', 'dist', 'birthyear'): {'rmse': 2.8681492392778574,
  'r2': 0.22212153920789957}}

### Multivariate simulation with 4 variables

In [36]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon'],
            ['start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp'], 
            ['hour','temp','feelslike','dew'],
            ['feelslike', 'dew','snowdepth','winddir'],
            ['snowdepth','winddir','sealevelpressure','visibility'],
            ['sealevelpressure','visibility','solarradiation','dist'],
            ['solarradiation','dist','birthyear','start_lat']]
results ={}
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-21 01:05:51.446560
run_multiple_imputation iteration 0 stopped 2023-03-21 01:10:46.936716
run_multiple_imputation iteration 1 started 2023-03-21 01:10:47.092448
run_multiple_imputation iteration 1 stopped 2023-03-21 01:15:33.140660
run_multiple_imputation iteration 2 started 2023-03-21 01:15:33.283974
run_multiple_imputation iteration 2 stopped 2023-03-21 01:20:15.506048
run_multiple_imputation iteration 3 started 2023-03-21 01:20:15.655175
run_multiple_imputation iteration 3 stopped 2023-03-21 01:25:06.628666
run_multiple_imputation iteration 4 started 2023-03-21 01:25:06.784365
run_multiple_imputation iteration 4 stopped 2023-03-21 01:29:56.901334
run_multiple_imputation iteration 5 started 2023-03-2

In [37]:
results

{('dist', 'birthyear', 'start_lat', 'start_lon'): {'rmse': 2.858656749344346,
  'r2': 0.22726198603494752},
 ('start_lat', 'start_lon', 'end_lat', 'end_lon'): {'rmse': 2.2387774442725217,
  'r2': 0.5260526482555672},
 ('end_lat', 'end_lon', 'hour', 'temp'): {'rmse': 2.2425408743659236,
  'r2': 0.5244578788232203},
 ('hour', 'temp', 'feelslike', 'dew'): {'rmse': 2.2420122052168514,
  'r2': 0.5246820663220324},
 ('feelslike', 'dew', 'snowdepth', 'winddir'): {'rmse': 2.236229462142929,
  'r2': 0.5271308458116657},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.2362442703611176, 'r2': 0.5271245831513144},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 2.883954473186113, 'r2': 0.2135247576628001},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 2.860446549456913, 'r2': 0.22629406321857592}}

### Multivaraite simulation with 5 variables 

In [38]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat'],
            ['start_lat','start_lon','end_lat','end_lon','hour'],
            ['end_lat','end_lon','hour','temp','feelslike'],
            ['hour','temp','feelslike','dew','snowdepth'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['solarradiation','dist','birthyear','start_lat','start_lon']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-21 04:57:24.909083
run_multiple_imputation iteration 0 stopped 2023-03-21 05:02:03.319101
run_multiple_imputation iteration 1 started 2023-03-21 05:02:03.478632
run_multiple_imputation iteration 1 stopped 2023-03-21 05:06:41.336652
run_multiple_imputation iteration 2 started 2023-03-21 05:06:41.487605
run_multiple_imputation iteration 2 stopped 2023-03-21 05:11:41.888690
run_multiple_imputation iteration 3 started 2023-03-21 05:11:42.035805
run_multiple_imputation iteration 3 stopped 2023-03-21 05:16:42.846164
run_multiple_imputation iteration 4 started 2023-03-21 05:16:42.996926
run_multiple_imputation iteration 4 stopped 2023-03-21 05:21:34.506755
run_multiple_imputation iteration 5 started 2023-03-2

In [39]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 2.8254982199443606, 'r2': 0.24508451898508277},
 ('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour'): {'rmse': 2.2420682108127896, 'r2': 0.5246583190858984},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike'): {'rmse': 2.2435580727162483, 'r2': 0.5240263768848739},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth'): {'rmse': 2.2410026762981254, 'r2': 0.5251100204305661},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.236321526922017, 'r2': 0.5270919092972276},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.236987550920448, 'r2': 0.526810183225404},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 2.8653767373336874, 'r2': 0.22362468774772104},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 2.858957058027856, '

### Multivaraite simulation with 6 variables 

In [40]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew'],
            ['hour','temp','feelslike','dew','snowdepth','winddir'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-21 08:50:20.030527
run_multiple_imputation iteration 0 stopped 2023-03-21 08:55:11.995899
run_multiple_imputation iteration 1 started 2023-03-21 08:55:12.145232
run_multiple_imputation iteration 1 stopped 2023-03-21 08:59:37.065620
run_multiple_imputation iteration 2 started 2023-03-21 08:59:37.302143
run_multiple_imputation iteration 2 stopped 2023-03-21 09:04:13.584673
run_multiple_imputation iteration 3 started 2023-03-21 09:04:13.716662
run_multiple_imputation iteration 3 stopped 2023-03-21 09:09:06.160816
run_multiple_imputation iteration 4 started 2023-03-21 09:09:06.304646
run_multiple_imputation iteration 4 stopped 2023-03-21 09:13:44.837924
run_multiple_imputation iteration 5 started 2023-03-2

In [41]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lon'): {'rmse': 2.752411691378843, 'r2': 0.28363387129343054},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew'): {'rmse': 2.2448394183115483, 'r2': 0.5234825435223556},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir'): {'rmse': 2.2415073445440856, 'r2': 0.5248961082552881},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.236394681893628, 'r2': 0.5270609690738456},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 2.864364925303264, 'r2': 0.22417289292869436},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 2.8752119652498918, 'r2': 0.2182858205767204},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 2.8368018589505852, 'r2': 0.23903223334771195}}

### Multivaraite simulation with 7 variables 

In [42]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew','snowdepth'],
            ['hour','temp','feelslike','dew','snowdepth','winddir','sealevelpressure'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat','start_lon'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat','end_lat']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-21 11:41:04.648207
run_multiple_imputation iteration 0 stopped 2023-03-21 11:43:59.640793
run_multiple_imputation iteration 1 started 2023-03-21 11:43:59.774119
run_multiple_imputation iteration 1 stopped 2023-03-21 11:46:51.879401
run_multiple_imputation iteration 2 started 2023-03-21 11:46:52.016663
run_multiple_imputation iteration 2 stopped 2023-03-21 11:49:44.541189
run_multiple_imputation iteration 3 started 2023-03-21 11:49:44.680204
run_multiple_imputation iteration 3 stopped 2023-03-21 11:52:39.856553
run_multiple_imputation iteration 4 started 2023-03-21 11:52:39.989726
run_multiple_imputation iteration 4 stopped 2023-03-21 11:55:51.004614
run_multiple_imputation iteration 5 started 2023-03-2

In [43]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lat',
  'end_lon'): {'rmse': 2.7555484953920373, 'r2': 0.2820001181629037},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth'): {'rmse': 2.2449417288047115, 'r2': 0.5234391071441902},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.241939308859636, 'r2': 0.524712974612825},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.237358057216768, 'r2': 0.5266534239039014},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 2.8672796145955775, 'r2': 0.222593174189897},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 2.8601096438803566, 'r2': 0.22647630783632966},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_

### with all the variables

In [45]:
results ={}
cols = ['start_lat','start_lon','end_lat',
                     'end_lon','hour','temp', 'feelslike', 'dew','snowdepth',
                     'winddir','sealevelpressure','visibility','solarradiation','dist','birthyear']
multiple_predictions = []
X_train, X_test, y_train, y_test = split_dataset(df)
X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
X_train_encoded = pd.DataFrame(X_train_encoded)
X_test_encoded = pd.DataFrame(X_test_encoded)
for i in range(6):
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=i
    )
    multiple_predictions.append(y_predict)
    del X_train_data
    del X_test_data
predictions_average = np.mean(multiple_predictions, axis=0)
rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
r2 = r2_score(y_test, predictions_average)
print("results are saving for the columns", cols, " = ", rmse, r2)
results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-21 14:56:01.540835
run_multiple_imputation iteration 0 stopped 2023-03-21 14:59:30.516752
run_multiple_imputation iteration 1 started 2023-03-21 14:59:30.619599
run_multiple_imputation iteration 1 stopped 2023-03-21 15:02:28.991506
run_multiple_imputation iteration 2 started 2023-03-21 15:02:29.107192
run_multiple_imputation iteration 2 stopped 2023-03-21 15:05:26.059500
run_multiple_imputation iteration 3 started 2023-03-21 15:05:26.174822
run_multiple_imputation iteration 3 stopped 2023-03-21 15:08:28.866334
run_multiple_imputation iteration 4 started 2023-03-21 15:08:28.974479
run_multiple_imputation iteration 4 stopped 2023-03-21 15:11:33.233139
run_multiple_imputation iteration 5 started 2023-03-2

In [46]:
results

{('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 2.757731871676201, 'r2': 0.2808618438840308}}

### Test Ignore

In [None]:
# # Execute the simulation
# print("Executing MCAR Missingness...MI", datetime.datetime.now())

# X = df.drop('tripduration', axis=1)
# y = df['tripduration']
# cols = ['dist','birthyear']
# rmse, r2 = perform_simulation(X, y, cols, n_simulation=N_SIM)
# print(rmse, r2)
# print("stooped MCAR Missingness...MI", datetime.datetime.now())

In [None]:
# # Plot results
# n_situations = 3
# n = np.arange(n_situations)
# n_labels = ["Full Data", "Single Imputation", "Multiple Imputations"]
# colors = ["r", "orange", "green"]

# plt.figure(figsize=(12, 6))
# ax1 = plt.subplot(111)
# for j in n:
#     ax1.barh(
#         j,
#         mse_means[j],
#         xerr=mse_std[j],
#         color=colors[j],
#         alpha=0.6,
#         align="center",
#     )

# ax1.set_title("MCAR Missingness")
# ax1.set_yticks(n)
# ax1.set_xlabel("Mean Squared Error")
# ax1.invert_yaxis()
# ax1.set_yticklabels(n_labels)
# plt.show()