In [9]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression, BayesianRidge
import time
import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from itertools import combinations
from sklearn.pipeline import Pipeline

In [10]:
#!pip install category_encoders

In [11]:
tripdata = pd.read_csv("../../dataset/final_bike_sharing.csv")

In [12]:
tripdata.head()

Unnamed: 0,index,tripduration,start_station_id,start_station_name,start_lat,start_lon,end_station_id,end_station_name,end_lat,end_lon,...,conditions,date,dist,birthyear,years_old,holiday,day,month,seasons,gender
0,259617,1.02,others,Manila & 1st,40.721651,-74.042884,others,Manila & 1st,40.721651,-74.042884,...,Clear,2018-09-05,1.783012e-12,1994,24,working_day,weekday,September,autumn,male
1,283363,1.02,others,Heights Elevator,40.748716,-74.040443,others,Heights Elevator,40.748716,-74.040443,...,cloudy_rain,2018-10-06,1.782506e-12,1994,24,working_day,weekend,October,autumn,male
2,169168,1.02,others,Van Vorst Park,40.718489,-74.047727,es_3185,City Hall,40.717732,-74.043845,...,Clear,2018-07-09,0.3384331,1963,55,working_day,weekday,July,summer,female
3,144929,1.02,ss_3183,Exchange Place,40.716247,-74.033459,es_3183,Exchange Place,40.716247,-74.033459,...,Clear,2018-07-16,1.783112e-12,1985,33,working_day,weekday,July,summer,male
4,326659,1.02,others,Jersey & 3rd,40.723332,-74.045953,es_3211,Newark Ave,40.721525,-74.046305,...,cloudy_rain,2018-11-02,0.159532,1987,31,working_day,weekday,November,autumn,male


In [13]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [14]:
df = tripdata[selected_features]
df.shape

(315433, 22)

In [15]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [16]:
def getSampleSize(df, perc):
    return round(perc/100 * df.shape[0])

def getAnIndex(index):
    li = []
    for i in index:
        li.append(i)
    return li  


def induceMissingValues(df , cols_list, perc, name ):
    random.seed(100)
    index = getAnIndex(df.index)
    sample_size = getSampleSize(df, perc)
    print(f"number of missing values produced {sample_size} for the {name}")
    for col in cols_list:
        idx = random.sample(index, sample_size)
        for i in idx:
             df.loc[i,col] = np.NaN
    return df

In [17]:
def ampute_each_variables(col_list, X_train, X_test):
    X_train = induceMissingValues(X_train, col_list, perc=10, name="train")
    print('Train set after Amputation', X_train.shape)
    X_test = induceMissingValues(X_test, col_list, perc=10, name="test")
    print('Test set after Amputation', X_test.shape)
    return X_train , X_test 

In [18]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [19]:
def model_evaluation(X_train,y_train):
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    return model_lr

In [20]:
def run_multiple_imputation(X_train_amputed, X_test_amputed, y_train, y_test, random_state=0):
    print(f"run_multiple_imputation iteration {random_state} started" ,  datetime.datetime.now())
    impute_estimator = BayesianRidge()
    imputer = IterativeImputer(
        estimator=impute_estimator, max_iter=5, sample_posterior=True, random_state=random_state, tol=1e-1
    )
    ## Fit and tranform the missing values
    X_train_imputed = imputer.fit_transform(X_train_amputed)
    X_test_imputed = imputer.fit_transform(X_test_amputed)

    predictors = [var for var in X_train_amputed.columns]
    
    ## adjust the columns names to the imputed cols
    X_train_imputed = pd.DataFrame(X_train_imputed)
    X_test_imputed = pd.DataFrame(X_test_imputed)
    
    ## Run Linear regression on the 
    model_reg = model_evaluation(X_train_imputed,y_train)
    y_predict = model_reg.predict(X_test_imputed)
    
    print(f"run_multiple_imputation iteration {random_state} stopped" ,  datetime.datetime.now())
    return y_predict

### Multivariate simulation with 2 variables

In [21]:
results ={}
col_list = [ ['dist','birthyear'],['start_lat','start_lon'],['end_lat','end_lon'], ['hour','temp'], 
            ['feelslike', 'dew'], ['snowdepth','winddir'],['sealevelpressure','visibility'],
             ['solarradiation','dist']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}
    

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 25235 for the train
Train set after Amputation (252346, 21)
number of missing values produced 6309 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-22 14:28:28.917099
run_multiple_imputation iteration 0 stopped 2023-03-22 14:31:21.102270
run_multiple_imputation iteration 1 started 2023-03-22 14:31:21.387059
run_multiple_imputation iteration 1 stopped 2023-03-22 14:33:42.152547
run_multiple_imputation iteration 2 started 2023-03-22 14:33:42.342478
run_multiple_imputation iteration 2 stopped 2023-03-22 14:36:49.109174
run_multiple_imputation iteration 3 started 2023-03-22 14:36:49.723913
run_multiple_imputation iteration 3 stopped 2023-03-22 14:40:31.929847
run_multiple_imputation iteration 4 started 2023-03-22 14:40:32.157697
run_multiple_imputation iteration 4 stopped 2023-03-22 14:43:47.256675
run_multiple_imputation iteration 5 started 2023-03

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 25235 for the train
Train set after Amputation (252346, 21)
number of missing values produced 6309 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-22 16:43:03.846861
run_multiple_imputation iteration 0 stopped 2023-03-22 16:46:50.135726
run_multiple_imputation iteration 1 started 2023-03-22 16:46:50.973459
run_multiple_imputation iteration 1 stopped 2023-03-22 16:50:55.883410
run_multiple_imputation iteration 2 started 2023-03-22 16:50:56.374003
run_multiple_imputation iteration 2 stopped 2023-03-22 16:53:55.054639
run_multiple_imputation iteration 3 started 2023-03-22 16:53:55.678482
run_multiple_imputation iteration 3 stopped 2023-03-22 16:57:40.352068
run_multiple_imputation iteration 4 started 2023-03-22 16:57:41.228744
run_multiple_imputation iteration 4 stopped 2023-03-22 17:02:31.638884
run_multiple_imputation iteration 5 started 2023-03

In [22]:
results

{('dist', 'birthyear'): {'rmse': 3.0830516978832354,
  'r2': 0.10118591413985945},
 ('start_lat', 'start_lon'): {'rmse': 2.2384371056673764,
  'r2': 0.5261967361106782},
 ('end_lat', 'end_lon'): {'rmse': 2.238904671115169, 'r2': 0.5259987790881337},
 ('hour', 'temp'): {'rmse': 2.241703727814988, 'r2': 0.5248128548644653},
 ('feelslike', 'dew'): {'rmse': 2.2361084491082757, 'r2': 0.5271820228344153},
 ('snowdepth', 'winddir'): {'rmse': 2.2362877305062345,
  'r2': 0.5271062028390993},
 ('sealevelpressure', 'visibility'): {'rmse': 2.2359443483613655,
  'r2': 0.5272514174437914},
 ('solarradiation', 'dist'): {'rmse': 3.0824122931866986,
  'r2': 0.10155869176994936}}

### Multivariate simulation with 3 variables

In [23]:
results ={}
col_list = [ ['dist','birthyear','start_lat'],
            ['start_lat','start_lon','end_lat'],
            ['end_lat','end_lon','hour'], 
            ['hour','temp','feelslike'],
            ['feelslike', 'dew','snowdepth'],
            ['snowdepth','winddir','sealevelpressure'],
            ['sealevelpressure','visibility','solarradiation'],
            ['solarradiation','dist','birthyear']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 25235 for the train
Train set after Amputation (252346, 21)
number of missing values produced 6309 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-22 17:07:11.565581
run_multiple_imputation iteration 0 stopped 2023-03-22 17:10:52.795912
run_multiple_imputation iteration 1 started 2023-03-22 17:10:53.672133
run_multiple_imputation iteration 1 stopped 2023-03-22 17:15:13.701341
run_multiple_imputation iteration 2 started 2023-03-22 17:15:14.839103
run_multiple_imputation iteration 2 stopped 2023-03-22 17:19:59.248105
run_multiple_imputation iteration 3 started 2023-03-22 17:20:00.242392
run_multiple_imputation iteration 3 stopped 2023-03-22 17:24:53.488693
run_multiple_imputation iteration 4 started 2023-03-22 17:24:54.451536
run_multiple_imputation iteration 4 stopped 2023-03-22 17:29:29.980001
run_multiple_imputation iteration 5 started 2023-03

run_multiple_imputation iteration 5 stopped 2023-03-22 19:13:07.460212
results are saving for the columns ['sealevelpressure', 'visibility', 'solarradiation']  =  2.2373095199407946 0.5266739612551863
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 25235 for the train
Train set after Amputation (252346, 21)
number of missing values produced 6309 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-22 19:13:17.244051
run_multiple_imputation iteration 0 stopped 2023-03-22 19:15:34.177684
run_multiple_imputation iteration 1 started 2023-03-22 19:15:34.300874
run_multiple_imputation iteration 1 stopped 2023-03-22 19:17:41.178486
run_multiple_imputation iteration 2 started 2023-03-22 19:17:41.327078
run_multiple_imputation iteration 2 stopped 2023-03-22 19:19:46.728602
run_multiple_imputation iteration 3 started 2023-03-22 19:19:46.850477
run_multiple_imputation iteration 3 stopped 2023-03-22 19:21:51

In [24]:
results

{('dist', 'birthyear', 'start_lat'): {'rmse': 3.099527130226214,
  'r2': 0.09155395264352251},
 ('start_lat', 'start_lon', 'end_lat'): {'rmse': 2.239250602773203,
  'r2': 0.5258522925512303},
 ('end_lat', 'end_lon', 'hour'): {'rmse': 2.2460192195997104,
  'r2': 0.522981533393369},
 ('hour', 'temp', 'feelslike'): {'rmse': 2.2440655963175455,
  'r2': 0.5238110090071766},
 ('feelslike', 'dew', 'snowdepth'): {'rmse': 2.236401179899966,
  'r2': 0.5270582207528002},
 ('snowdepth', 'winddir', 'sealevelpressure'): {'rmse': 2.236407126881743,
  'r2': 0.5270557054796841},
 ('sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2373095199407946, 'r2': 0.5266739612551863},
 ('solarradiation', 'dist', 'birthyear'): {'rmse': 3.0840296245716394,
  'r2': 0.10061562614505648}}

### Multivariate simulation with 4 variables

In [25]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon'],
            ['start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp'], 
            ['hour','temp','feelslike','dew'],
            ['feelslike', 'dew','snowdepth','winddir'],
            ['snowdepth','winddir','sealevelpressure','visibility'],
            ['sealevelpressure','visibility','solarradiation','dist'],
            ['solarradiation','dist','birthyear','start_lat']]
results ={}
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 25235 for the train
Train set after Amputation (252346, 21)
number of missing values produced 6309 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-22 19:26:21.735180
run_multiple_imputation iteration 0 stopped 2023-03-22 19:29:16.718271
run_multiple_imputation iteration 1 started 2023-03-22 19:29:17.006986
run_multiple_imputation iteration 1 stopped 2023-03-22 19:32:31.044750
run_multiple_imputation iteration 2 started 2023-03-22 19:32:31.482991
run_multiple_imputation iteration 2 stopped 2023-03-22 19:35:05.520104
run_multiple_imputation iteration 3 started 2023-03-22 19:35:05.866129
run_multiple_imputation iteration 3 stopped 2023-03-22 19:37:17.187168
run_multiple_imputation iteration 4 started 2023-03-22 19:37:17.375412
run_multiple_imputation iteration 4 stopped 2023-03-22 19:40:03.489746
run_multiple_imputation iteration 5 started 2023-03

run_multiple_imputation iteration 5 stopped 2023-03-22 21:04:24.523778
results are saving for the columns ['sealevelpressure', 'visibility', 'solarradiation', 'dist']  =  3.0843162213422732 0.1004484600245934
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 25235 for the train
Train set after Amputation (252346, 21)
number of missing values produced 6309 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-22 21:04:36.478986
run_multiple_imputation iteration 0 stopped 2023-03-22 21:06:46.983778
run_multiple_imputation iteration 1 started 2023-03-22 21:06:47.103581
run_multiple_imputation iteration 1 stopped 2023-03-22 21:08:56.534188
run_multiple_imputation iteration 2 started 2023-03-22 21:08:56.660734
run_multiple_imputation iteration 2 stopped 2023-03-22 21:11:05.886219
run_multiple_imputation iteration 3 started 2023-03-22 21:11:05.998798
run_multiple_imputation iteration 3 stopped 2023-03-22 

In [26]:
results

{('dist', 'birthyear', 'start_lat', 'start_lon'): {'rmse': 3.117604407108891,
  'r2': 0.08092644755968459},
 ('start_lat', 'start_lon', 'end_lat', 'end_lon'): {'rmse': 2.2399308065722843,
  'r2': 0.5255641907585354},
 ('end_lat', 'end_lon', 'hour', 'temp'): {'rmse': 2.246730742399366,
  'r2': 0.5226792534545897},
 ('hour', 'temp', 'feelslike', 'dew'): {'rmse': 2.244602441543207,
  'r2': 0.5235831454679358},
 ('feelslike', 'dew', 'snowdepth', 'winddir'): {'rmse': 2.236475376641569,
  'r2': 0.5270268387973591},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.2362735879977413, 'r2': 0.5271121840760429},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 3.0843162213422732, 'r2': 0.1004484600245934},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 3.097348492572279, 'r2': 0.0928305855669439}}

### Multivaraite simulation with 5 variables 

In [27]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat'],
            ['start_lat','start_lon','end_lat','end_lon','hour'],
            ['end_lat','end_lon','hour','temp','feelslike'],
            ['hour','temp','feelslike','dew','snowdepth']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 25235 for the train
Train set after Amputation (252346, 21)
number of missing values produced 6309 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-22 21:18:02.705360
run_multiple_imputation iteration 0 stopped 2023-03-22 21:20:11.445762
run_multiple_imputation iteration 1 started 2023-03-22 21:20:11.597659
run_multiple_imputation iteration 1 stopped 2023-03-22 21:22:20.246373
run_multiple_imputation iteration 2 started 2023-03-22 21:22:20.377963
run_multiple_imputation iteration 2 stopped 2023-03-22 21:24:29.107046
run_multiple_imputation iteration 3 started 2023-03-22 21:24:29.243843
run_multiple_imputation iteration 3 stopped 2023-03-22 21:26:38.809831
run_multiple_imputation iteration 4 started 2023-03-22 21:26:38.927368
run_multiple_imputation iteration 4 stopped 2023-03-22 21:28:46.717969
run_multiple_imputation iteration 5 started 2023-03

In [28]:
results ={}
col_list = [
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['solarradiation','dist','birthyear','start_lat','start_lon']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 25235 for the train
Train set after Amputation (252346, 21)
number of missing values produced 6309 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-22 22:11:01.970618
run_multiple_imputation iteration 0 stopped 2023-03-22 22:13:06.077332
run_multiple_imputation iteration 1 started 2023-03-22 22:13:06.235404
run_multiple_imputation iteration 1 stopped 2023-03-22 22:15:19.293020
run_multiple_imputation iteration 2 started 2023-03-22 22:15:19.432576
run_multiple_imputation iteration 2 stopped 2023-03-22 22:17:30.822913
run_multiple_imputation iteration 3 started 2023-03-22 22:17:30.965249
run_multiple_imputation iteration 3 stopped 2023-03-22 22:19:36.471565
run_multiple_imputation iteration 4 started 2023-03-22 22:19:36.586148
run_multiple_imputation iteration 4 stopped 2023-03-22 22:21:41.003627
run_multiple_imputation iteration 5 started 2023-03

In [29]:
results

{('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.2365370897757493, 'r2': 0.5270007360651933},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2376937687366327, 'r2': 0.5265113636557508},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 3.0844950322888436, 'r2': 0.10034415533377816},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 3.1170416769216525, 'r2': 0.08125820469629863}}

### Multivaraite simulation with 6 variables 

In [30]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew'],
            ['hour','temp','feelslike','dew','snowdepth','winddir'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 25235 for the train
Train set after Amputation (252346, 21)
number of missing values produced 6309 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-22 23:03:08.653378
run_multiple_imputation iteration 0 stopped 2023-03-22 23:05:12.820341
run_multiple_imputation iteration 1 started 2023-03-22 23:05:12.941275
run_multiple_imputation iteration 1 stopped 2023-03-22 23:07:16.552764
run_multiple_imputation iteration 2 started 2023-03-22 23:07:16.666269
run_multiple_imputation iteration 2 stopped 2023-03-22 23:09:18.587942
run_multiple_imputation iteration 3 started 2023-03-22 23:09:18.708513
run_multiple_imputation iteration 3 stopped 2023-03-22 23:11:23.390049
run_multiple_imputation iteration 4 started 2023-03-22 23:11:23.551585
run_multiple_imputation iteration 4 stopped 2023-03-22 23:13:27.135808
run_multiple_imputation iteration 5 started 2023-03

run_multiple_imputation iteration 4 stopped 2023-03-23 00:32:32.251059
run_multiple_imputation iteration 5 started 2023-03-23 00:32:32.370626
run_multiple_imputation iteration 5 stopped 2023-03-23 00:34:35.780209
results are saving for the columns ['solarradiation', 'dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat']  =  3.1130625091997195 0.08360241089466758


In [31]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lon'): {'rmse': 3.0509802318593597, 'r2': 0.11978849068579767},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew'): {'rmse': 2.2507673335528726, 'r2': 0.5209625550411665},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir'): {'rmse': 2.245144999022262, 'r2': 0.5233528019891457},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.236578277425693, 'r2': 0.5269833145723135},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 3.0839945409952536, 'r2': 0.10063608862093776},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 3.1003081435294484, 'r2': 0.09109607774275397},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 3.1130625091997195, 'r2': 0.08360241089466758}}

### Multivaraite simulation with 7 variables 

In [32]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew','snowdepth'],
            ['hour','temp','feelslike','dew','snowdepth','winddir','sealevelpressure'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat','start_lon'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat','end_lat']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 25235 for the train
Train set after Amputation (252346, 21)
number of missing values produced 6309 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 00:34:55.577442
run_multiple_imputation iteration 0 stopped 2023-03-23 00:37:00.440865
run_multiple_imputation iteration 1 started 2023-03-23 00:37:00.552424
run_multiple_imputation iteration 1 stopped 2023-03-23 00:39:02.957193
run_multiple_imputation iteration 2 started 2023-03-23 00:39:03.077708
run_multiple_imputation iteration 2 stopped 2023-03-23 00:41:06.487339
run_multiple_imputation iteration 3 started 2023-03-23 00:41:06.627959
run_multiple_imputation iteration 3 stopped 2023-03-23 00:43:09.810194
run_multiple_imputation iteration 4 started 2023-03-23 00:43:09.926761
run_multiple_imputation iteration 4 stopped 2023-03-23 00:45:20.730915
run_multiple_imputation iteration 5 started 2023-03

run_multiple_imputation iteration 3 stopped 2023-03-23 02:05:04.268901
run_multiple_imputation iteration 4 started 2023-03-23 02:05:04.386320
run_multiple_imputation iteration 4 stopped 2023-03-23 02:07:09.075888
run_multiple_imputation iteration 5 started 2023-03-23 02:07:09.196501
run_multiple_imputation iteration 5 stopped 2023-03-23 02:09:13.985160
results are saving for the columns ['solarradiation', 'dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat', 'end_lat']  =  3.1127746717314886 0.08377186548592808


In [33]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lat',
  'end_lon'): {'rmse': 3.0487864581426574, 'r2': 0.1210538483747251},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth'): {'rmse': 2.250768083511871, 'r2': 0.5209622358091456},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.2451251346715497, 'r2': 0.5233612364067226},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.237951122943272, 'r2': 0.5264024467867741},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 3.0852027376729154, 'r2': 0.09993127457402895},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 3.1199323610137046, 'r2': 0.07955336784587574},
 ('solarradiation',
  'dist',
  'birthyear',
  'sta

### with all the variables

In [34]:
results ={}
cols = ['start_lat','start_lon','end_lat',
                     'end_lon','hour','temp', 'feelslike', 'dew','snowdepth',
                     'winddir','sealevelpressure','visibility','solarradiation','dist','birthyear']
multiple_predictions = []
X_train, X_test, y_train, y_test = split_dataset(df)
X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
X_train_encoded = pd.DataFrame(X_train_encoded)
X_test_encoded = pd.DataFrame(X_test_encoded)
for i in range(6):
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=i
    )
    multiple_predictions.append(y_predict)
    del X_train_data
    del X_test_data
predictions_average = np.mean(multiple_predictions, axis=0)
rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
r2 = r2_score(y_test, predictions_average)
print("results are saving for the columns", cols, " = ", rmse, r2)
results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 25235 for the train
Train set after Amputation (252346, 21)
number of missing values produced 6309 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 02:09:58.672837
run_multiple_imputation iteration 0 stopped 2023-03-23 02:12:09.097045
run_multiple_imputation iteration 1 started 2023-03-23 02:12:09.229610
run_multiple_imputation iteration 1 stopped 2023-03-23 02:14:13.748966
run_multiple_imputation iteration 2 started 2023-03-23 02:14:13.868506
run_multiple_imputation iteration 2 stopped 2023-03-23 02:16:37.240767
run_multiple_imputation iteration 3 started 2023-03-23 02:16:37.419314
run_multiple_imputation iteration 3 stopped 2023-03-23 02:18:44.927797
run_multiple_imputation iteration 4 started 2023-03-23 02:18:45.034338
run_multiple_imputation iteration 4 stopped 2023-03-23 02:20:56.242958
run_multiple_imputation iteration 5 started 2023-03

In [35]:
results

{('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 3.0560953587680664, 'r2': 0.11683457588950075}}

### Test Ignore

In [36]:
# # Execute the simulation
# print("Executing MCAR Missingness...MI", datetime.datetime.now())

# X = df.drop('tripduration', axis=1)
# y = df['tripduration']
# cols = ['dist','birthyear']
# rmse, r2 = perform_simulation(X, y, cols, n_simulation=N_SIM)
# print(rmse, r2)
# print("stooped MCAR Missingness...MI", datetime.datetime.now())

In [37]:
# # Plot results
# n_situations = 3
# n = np.arange(n_situations)
# n_labels = ["Full Data", "Single Imputation", "Multiple Imputations"]
# colors = ["r", "orange", "green"]

# plt.figure(figsize=(12, 6))
# ax1 = plt.subplot(111)
# for j in n:
#     ax1.barh(
#         j,
#         mse_means[j],
#         xerr=mse_std[j],
#         color=colors[j],
#         alpha=0.6,
#         align="center",
#     )

# ax1.set_title("MCAR Missingness")
# ax1.set_yticks(n)
# ax1.set_xlabel("Mean Squared Error")
# ax1.invert_yaxis()
# ax1.set_yticklabels(n_labels)
# plt.show()