In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression, BayesianRidge
import time
import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from itertools import combinations
from sklearn.pipeline import Pipeline

In [2]:
#!pip install category_encoders

In [5]:
tripdata = pd.read_csv("../../dataset/final_bike_sharing.csv")

In [6]:
tripdata.head()

Unnamed: 0,index,tripduration,start_station_id,start_station_name,start_lat,start_lon,end_station_id,end_station_name,end_lat,end_lon,...,conditions,date,dist,birthyear,years_old,holiday,day,month,seasons,gender
0,259617,1.02,others,Manila & 1st,40.721651,-74.042884,others,Manila & 1st,40.721651,-74.042884,...,Clear,2018-09-05,1.783012e-12,1994,24,working_day,weekday,September,autumn,male
1,283363,1.02,others,Heights Elevator,40.748716,-74.040443,others,Heights Elevator,40.748716,-74.040443,...,cloudy_rain,2018-10-06,1.782506e-12,1994,24,working_day,weekend,October,autumn,male
2,169168,1.02,others,Van Vorst Park,40.718489,-74.047727,es_3185,City Hall,40.717732,-74.043845,...,Clear,2018-07-09,0.3384331,1963,55,working_day,weekday,July,summer,female
3,144929,1.02,ss_3183,Exchange Place,40.716247,-74.033459,es_3183,Exchange Place,40.716247,-74.033459,...,Clear,2018-07-16,1.783112e-12,1985,33,working_day,weekday,July,summer,male
4,326659,1.02,others,Jersey & 3rd,40.723332,-74.045953,es_3211,Newark Ave,40.721525,-74.046305,...,cloudy_rain,2018-11-02,0.159532,1987,31,working_day,weekday,November,autumn,male


In [7]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [8]:
df = tripdata[selected_features]
df.shape

(315433, 22)

In [9]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [10]:
def getSampleSize(df, perc):
    return round(perc/100 * df.shape[0])

def getAnIndex(index):
    li = []
    for i in index:
        li.append(i)
    return li  


def induceMissingValues(df ,cols_list, perc,name):
    index = getAnIndex(df.index)
    sample_size = getSampleSize(df, perc)
    print(f"number of missing values produced {sample_size} for the {name}")
    i=0
    while(i < sample_size):
        idx = random.choice(index)
        col = random.choice(cols_list)
        df.loc[idx,col] = np.NaN
        i = i+1
    return df

In [11]:
def ampute_each_variables(col_list, X_train, X_test):
    X_train = induceMissingValues(X_train, col_list, perc=20, name="train")
    print('Train set after Amputation', X_train.shape)
    X_test = induceMissingValues(X_test, col_list, perc=20, name="test")
    print('Test set after Amputation', X_test.shape)
    return X_train , X_test 

In [12]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [13]:
def model_evaluation(X_train,y_train):
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    return model_lr

In [14]:
def run_multiple_imputation(X_train_amputed, X_test_amputed, y_train, y_test, random_state=0):
    print(f"run_multiple_imputation iteration {random_state} started" ,  datetime.datetime.now())
    impute_estimator = BayesianRidge()
    imputer = IterativeImputer(
        estimator=impute_estimator, max_iter=10, sample_posterior=True, random_state=random_state, tol=1e-1
    )
    ## Fit and tranform the missing values
    X_train_imputed = imputer.fit_transform(X_train_amputed)
    X_test_imputed = imputer.fit_transform(X_test_amputed)

    predictors = [var for var in X_train_amputed.columns]
    
    ## adjust the columns names to the imputed cols
    X_train_imputed = pd.DataFrame(X_train_imputed)
    X_test_imputed = pd.DataFrame(X_test_imputed)
    
    ## Run Linear regression on the 
    model_reg = model_evaluation(X_train_imputed,y_train)
    y_predict = model_reg.predict(X_test_imputed)
    
    print(f"run_multiple_imputation iteration {random_state} stopped" ,  datetime.datetime.now())
    return y_predict

### Test

In [18]:
results ={}
col_list = ['start_lat', 'start_lon', 'end_lat', 'end_lon', 'hour', 'temp', 'feelslike', 'dew', 'snowdepth', 'winddir',
            'sealevelpressure', 'visibility', 'solarradiation', 'birthyear']

multiple_predictions = []
X_train, X_test, y_train, y_test = split_dataset(df)
X_ampute_train , X_ampute_test  = ampute_each_variables(col_list, X_train, X_test)
X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
X_train_encoded = pd.DataFrame(X_train_encoded)
X_test_encoded = pd.DataFrame(X_test_encoded)
for i in range(5):
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=i
    )
    multiple_predictions.append(y_predict)
    del X_train_data
    del X_test_data
predictions_average = np.mean(multiple_predictions, axis=0)
rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
r2 = r2_score(y_test, predictions_average)
print("results are saving for the columns", cols, " = ", rmse, r2)
results[tuple(col_list)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-26 19:48:28.554655
run_multiple_imputation iteration 0 stopped 2023-03-26 19:54:33.620915
run_multiple_imputation iteration 1 started 2023-03-26 19:54:33.764511
run_multiple_imputation iteration 1 stopped 2023-03-26 20:00:34.934166
run_multiple_imputation iteration 2 started 2023-03-26 20:00:35.073557
run_multiple_imputation iteration 2 stopped 2023-03-26 20:06:34.514095
run_multiple_imputation iteration 3 started 2023-03-26 20:06:34.654729
run_multiple_imputation iteration 3 stopped 2023-03-26 20:12:29.938574
run_multiple_imputation iteration 4 started 2023-03-26 20:12:30.080990
run_multiple_imputation iteration 4 stopped 2023-03-26 20:18:31.137933
results are saving for the columns start_lat  =  2

In [20]:
results

{('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'birthyear'): {'rmse': 2.246407948160509, 'r2': 0.5228163996785375}}

### Multivariate simulation with 2 variables

In [22]:
results ={}
col_list = [['start_lat','start_lon'],['end_lat','end_lon'], ['hour','temp'], 
            ['feelslike', 'dew'], ['snowdepth','winddir'],['sealevelpressure','visibility'],
             ['solarradiation','birthyear']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}
    

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-26 20:53:51.659061
run_multiple_imputation iteration 0 stopped 2023-03-26 20:59:46.221573
run_multiple_imputation iteration 1 started 2023-03-26 20:59:46.364654
run_multiple_imputation iteration 1 stopped 2023-03-26 21:05:47.181655
run_multiple_imputation iteration 2 started 2023-03-26 21:05:47.327508
run_multiple_imputation iteration 2 stopped 2023-03-26 21:11:37.021300
run_multiple_imputation iteration 3 started 2023-03-26 21:11:37.163419
run_multiple_imputation iteration 3 stopped 2023-03-26 21:17:32.314202
run_multiple_imputation iteration 4 started 2023-03-26 21:17:32.457781
run_multiple_imputation iteration 4 stopped 2023-03-26 21:23:25.755769
run_multiple_imputation iteration 5 started 2023-0

In [23]:
results

{('start_lat', 'start_lon'): {'rmse': 2.238504279388437,
  'r2': 0.5261682987641393},
 ('end_lat', 'end_lon'): {'rmse': 2.2388006569048295,
  'r2': 0.5260428200141115},
 ('hour', 'temp'): {'rmse': 2.241735904765546, 'r2': 0.5247992132910315},
 ('feelslike', 'dew'): {'rmse': 2.2361228766085035, 'r2': 0.5271759215172073},
 ('snowdepth', 'winddir'): {'rmse': 2.236289833169297,
  'r2': 0.5271053135646347},
 ('sealevelpressure', 'visibility'): {'rmse': 2.2360137986437674,
  'r2': 0.527222049063053},
 ('solarradiation', 'birthyear'): {'rmse': 2.2400765171419,
  'r2': 0.525502463351315}}

### Multivariate simulation with 3 variables

In [24]:
results ={}
col_list = [['start_lat','start_lon','end_lat'],['end_lat','end_lon','hour'], ['hour','temp','feelslike'], 
            ['feelslike', 'dew','snowdepth'], ['snowdepth','winddir','sealevelpressure'],
            ['sealevelpressure','visibility','solarradiation'],
             ['solarradiation','birthyear','start_lat']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-27 01:01:11.304006
run_multiple_imputation iteration 0 stopped 2023-03-27 01:06:48.928168
run_multiple_imputation iteration 1 started 2023-03-27 01:06:49.062289
run_multiple_imputation iteration 1 stopped 2023-03-27 01:12:30.983614
run_multiple_imputation iteration 2 started 2023-03-27 01:12:31.168347
run_multiple_imputation iteration 2 stopped 2023-03-27 01:18:25.578489
run_multiple_imputation iteration 3 started 2023-03-27 01:18:25.757425
run_multiple_imputation iteration 3 stopped 2023-03-27 01:24:15.453816
run_multiple_imputation iteration 4 started 2023-03-27 01:24:15.611389
run_multiple_imputation iteration 4 stopped 2023-03-27 01:30:09.303736
run_multiple_imputation iteration 5 started 2023-0

In [25]:
results

{('start_lat', 'start_lon', 'end_lat'): {'rmse': 2.239338733826985,
  'r2': 0.5258149693866231},
 ('end_lat', 'end_lon', 'hour'): {'rmse': 2.2457860729146955,
  'r2': 0.523080561487957},
 ('hour', 'temp', 'feelslike'): {'rmse': 2.2438630276825378,
  'r2': 0.5238969749436202},
 ('feelslike', 'dew', 'snowdepth'): {'rmse': 2.2364239892555196,
  'r2': 0.5270485735103398},
 ('snowdepth', 'winddir', 'sealevelpressure'): {'rmse': 2.2363910066595345,
  'r2': 0.5270625235044983},
 ('sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2372695162370517, 'r2': 0.5266908875009106},
 ('solarradiation', 'birthyear', 'start_lat'): {'rmse': 2.2400657292780073,
  'r2': 0.5255070335545897}}

### Multivariate simulation with 4 variables

In [15]:
results ={}
col_list = [['start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp'], 
            ['hour','temp','feelslike','dew'],
            ['feelslike', 'dew','snowdepth','winddir'],
            ['snowdepth','winddir','sealevelpressure','visibility'],
            ['sealevelpressure','visibility','solarradiation','birthyear'],
            ['solarradiation','birthyear','start_lat','start_lon']]
results ={}
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-28 10:36:58.854431
run_multiple_imputation iteration 0 stopped 2023-03-28 10:41:53.928718
run_multiple_imputation iteration 1 started 2023-03-28 10:41:54.158206
run_multiple_imputation iteration 1 stopped 2023-03-28 10:47:27.468592
run_multiple_imputation iteration 2 started 2023-03-28 10:47:27.817723
run_multiple_imputation iteration 2 stopped 2023-03-28 10:53:44.801065
run_multiple_imputation iteration 3 started 2023-03-28 10:53:45.149870
run_multiple_imputation iteration 3 stopped 2023-03-28 10:59:52.426648
run_multiple_imputation iteration 4 started 2023-03-28 10:59:52.652473
run_multiple_imputation iteration 4 stopped 2023-03-28 11:05:34.976500
run_multiple_imputation iteration 5 started 2023-0

run_multiple_imputation iteration 5 stopped 2023-03-28 14:30:15.563244
results are saving for the columns ['solarradiation', 'birthyear', 'start_lat', 'start_lon']  =  2.242170673159134 0.5246148719359808


In [16]:
results

{('start_lat', 'start_lon', 'end_lat', 'end_lon'): {'rmse': 2.239657186305425,
  'r2': 0.5256800937351063},
 ('end_lat', 'end_lon', 'hour', 'temp'): {'rmse': 2.2463105052550283,
  'r2': 0.5228577965655288},
 ('hour', 'temp', 'feelslike', 'dew'): {'rmse': 2.243979192461473,
  'r2': 0.5238476779646335},
 ('feelslike', 'dew', 'snowdepth', 'winddir'): {'rmse': 2.236450479770461,
  'r2': 0.5270373691939028},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.2363192015194864, 'r2': 0.5270928927882739},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'birthyear'): {'rmse': 2.2398174198201506, 'r2': 0.525612222040382},
 ('solarradiation',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 2.242170673159134, 'r2': 0.5246148719359808}}

### Multivaraite simulation with 5 variables 

In [13]:
results ={}
col_list = [ ['start_lat','start_lon','end_lat','end_lon','hour'],
            ['end_lat','end_lon','hour','temp','feelslike'],
            ['hour','temp','feelslike','dew','snowdepth'],['feelslike', 'dew','snowdepth','winddir','sealevelpressure'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['sealevelpressure','visibility','solarradiation','birthyear','start_lat'],
            ['solarradiation','birthyear','start_lat','start_lon','end_lat']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-27 14:42:03.818614
run_multiple_imputation iteration 0 stopped 2023-03-27 14:46:48.423015
run_multiple_imputation iteration 1 started 2023-03-27 14:46:48.662747
run_multiple_imputation iteration 1 stopped 2023-03-27 14:51:11.712170
run_multiple_imputation iteration 2 started 2023-03-27 14:51:11.861449
run_multiple_imputation iteration 2 stopped 2023-03-27 14:55:35.205471
run_multiple_imputation iteration 3 started 2023-03-27 14:55:35.331143
run_multiple_imputation iteration 3 stopped 2023-03-27 15:00:02.671353
run_multiple_imputation iteration 4 started 2023-03-27 15:00:02.794231
run_multiple_imputation iteration 4 stopped 2023-03-27 15:04:22.550122
run_multiple_imputation iteration 5 started 2023-0

In [14]:
results

{('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour'): {'rmse': 2.24525973338791, 'r2': 0.5233040842348003},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike'): {'rmse': 2.2484393825730082, 'r2': 0.5219529719217019},
 ('hour', 'temp', 'feelslike', 'dew', 'snowdepth'): {'rmse': 2.243979104491034,
  'r2': 0.5238477152977141},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.2365405459812067, 'r2': 0.5269992741767158},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2376298608731187, 'r2': 0.5265384086566947},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'birthyear',
  'start_lat'): {'rmse': 2.239979358440827, 'r2': 0.5255436231636608},
 ('solarradiation',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 2.242392688040208, 'r2': 0.5245207240646939}}

### Multivaraite simulation with 6 variables 

In [15]:
results ={}
col_list = [ ['start_lat','start_lon','end_lat','end_lon','hour','temp'],
            ['end_lat','end_lon','hour','temp','feelslike','dew'],
            ['hour','temp','feelslike','dew','snowdepth','winddir'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','birthyear'],
            ['sealevelpressure','visibility','solarradiation','birthyear','start_lat','start_lon'],
            ['solarradiation','birthyear','start_lat','start_lon','end_lat','hour']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-27 17:58:02.408399
run_multiple_imputation iteration 0 stopped 2023-03-27 18:02:22.395023
run_multiple_imputation iteration 1 started 2023-03-27 18:02:22.518424
run_multiple_imputation iteration 1 stopped 2023-03-27 18:06:40.969881
run_multiple_imputation iteration 2 started 2023-03-27 18:06:41.094378
run_multiple_imputation iteration 2 stopped 2023-03-27 18:10:53.400232
run_multiple_imputation iteration 3 started 2023-03-27 18:10:53.523214
run_multiple_imputation iteration 3 stopped 2023-03-27 18:15:24.608887
run_multiple_imputation iteration 4 started 2023-03-27 18:15:24.731821
run_multiple_imputation iteration 4 stopped 2023-03-27 18:19:41.314819
run_multiple_imputation iteration 5 started 2023-0

In [16]:
results

{('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour',
  'temp'): {'rmse': 2.2444289230807586, 'r2': 0.5236568012060494},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew'): {'rmse': 2.248214269314493, 'r2': 0.5220486910572649},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir'): {'rmse': 2.2443160467638705, 'r2': 0.5237047122922602},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.236537665573059, 'r2': 0.5270004925174873},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'birthyear'): {'rmse': 2.23997315296416, 'r2': 0.5255462519592224},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 2.2419093077562664, 'r2': 0.5247256948855963},
 ('solarradiation',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'hour'): {'rmse': 2.246977977423609, 'r2': 0.5225741968964766}}

### Multivaraite simulation with 7 variables 

In [18]:
results ={}
col_list = [['start_lat','start_lon','end_lat','end_lon','hour','temp','feelslike'],
            ['end_lat','end_lon','hour','temp','feelslike','dew','snowdepth'],
            ['hour','temp','feelslike','dew','snowdepth','winddir','sealevelpressure'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','birthyear','start_lat'],
            ['sealevelpressure','visibility','solarradiation','birthyear','start_lat','start_lon','end_lat'],
            ['solarradiation','birthyear','start_lat','start_lon','end_lat','hour']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-27 22:44:33.740766
run_multiple_imputation iteration 0 stopped 2023-03-27 22:49:35.656373
run_multiple_imputation iteration 1 started 2023-03-27 22:49:35.777470
run_multiple_imputation iteration 1 stopped 2023-03-27 22:54:57.900407
run_multiple_imputation iteration 2 started 2023-03-27 22:54:58.018857
run_multiple_imputation iteration 2 stopped 2023-03-27 23:00:36.413256
run_multiple_imputation iteration 3 started 2023-03-27 23:00:36.532481
run_multiple_imputation iteration 3 stopped 2023-03-27 23:06:07.640955
run_multiple_imputation iteration 4 started 2023-03-27 23:06:07.759598
run_multiple_imputation iteration 4 stopped 2023-03-27 23:11:30.226037
run_multiple_imputation iteration 5 started 2023-0

In [19]:
results

{('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike'): {'rmse': 2.246104366052351, 'r2': 0.5229453652254349},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth'): {'rmse': 2.248336220343698, 'r2': 0.5219968381394471},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.243420398803492, 'r2': 0.5240847904746904},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.237835650341336, 'r2': 0.5264513183910824},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'birthyear',
  'start_lat'): {'rmse': 2.2398454101406835, 'r2': 0.5256003654052726},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 2.241926952915489, 'r2': 0.5247182134734252},
 ('solarradiation',
  'birthyear',
  'start_lat',
 