In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression, BayesianRidge
import time
import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from itertools import combinations
from sklearn.pipeline import Pipeline

In [2]:
#!pip install category_encoders

In [3]:
tripdata = pd.read_csv("/home/jovyan/data/final_bike_sharing.csv")

In [4]:
tripdata.head()

Unnamed: 0,index,tripduration,start_station_id,start_station_name,start_lat,start_lon,end_station_id,end_station_name,end_lat,end_lon,...,conditions,date,dist,birthyear,years_old,holiday,day,month,seasons,gender
0,259617,1.02,ss_3273,Manila & 1st,40.721651,-74.042884,es_3273,Manila & 1st,40.721651,-74.042884,...,Clear,2018-09-05,1.783012e-12,1994,24,working_day,weekday,September,autumn,male
1,283363,1.02,ss_3198,Heights Elevator,40.748716,-74.040443,es_3198,Heights Elevator,40.748716,-74.040443,...,cloudy_rain,2018-10-06,1.782506e-12,1994,24,working_day,weekend,October,autumn,male
2,169168,1.02,ss_3213,Van Vorst Park,40.718489,-74.047727,es_3185,City Hall,40.717733,-74.043845,...,Clear,2018-07-09,0.3384331,1963,55,working_day,weekday,July,summer,female
3,144929,1.02,ss_3183,Exchange Place,40.716247,-74.033459,es_3183,Exchange Place,40.716247,-74.033459,...,Clear,2018-07-16,1.783112e-12,1985,33,working_day,weekday,July,summer,male
4,326659,1.02,ss_3272,Jersey & 3rd,40.723332,-74.045953,es_3211,Newark Ave,40.721525,-74.046305,...,cloudy_rain,2018-11-02,0.159532,1987,31,working_day,weekday,November,autumn,male


In [5]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [6]:
df = tripdata[selected_features]
df.shape

(315433, 22)

In [7]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [8]:
def getSampleSize(df, perc):
    return round(perc/100 * df.shape[0])

def getAnIndex(index):
    li = []
    for i in index:
        li.append(i)
    return li  


def induceMissingValues(df , cols_list, perc, name ):
    random.seed(100)
    index = getAnIndex(df.index)
    sample_size = getSampleSize(df, perc)
    print(f"number of missing values produced {sample_size} for the {name}")
    for col in cols_list:
        idx = random.sample(index, sample_size)
        for i in idx:
             df.loc[i,col] = np.NaN
    return df

In [9]:
def ampute_each_variables(col_list, X_train, X_test):
    X_train = induceMissingValues(X_train, col_list, perc=40, name="train")
    print('Train set after Amputation', X_train.shape)
    X_test = induceMissingValues(X_test, col_list, perc=40, name="test")
    print('Test set after Amputation', X_test.shape)
    return X_train , X_test 

In [10]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [11]:
def model_evaluation(X_train,y_train):
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    return model_lr

In [12]:
def run_multiple_imputation(X_train_amputed, X_test_amputed, y_train, y_test, random_state=0):
    print(f"run_multiple_imputation iteration {random_state} started" ,  datetime.datetime.now())
    impute_estimator = BayesianRidge()
    imputer = IterativeImputer(
        estimator=impute_estimator, max_iter=5, sample_posterior=True, random_state=random_state, tol=1e-1
    )
    ## Fit and tranform the missing values
    X_train_imputed = imputer.fit_transform(X_train_amputed)
    X_test_imputed = imputer.fit_transform(X_test_amputed)

    predictors = [var for var in X_train_amputed.columns]
    
    ## adjust the columns names to the imputed cols
    X_train_imputed = pd.DataFrame(X_train_imputed)
    X_test_imputed = pd.DataFrame(X_test_imputed)
    
    ## Run Linear regression on the 
    model_reg = model_evaluation(X_train_imputed,y_train)
    y_predict = model_reg.predict(X_test_imputed)
    
    print(f"run_multiple_imputation iteration {random_state} stopped" ,  datetime.datetime.now())
    return y_predict

### Multivariate simulation with 2 variables

In [13]:
results ={}
col_list = [ ['dist','birthyear'],['start_lat','start_lon'],['end_lat','end_lon'], ['hour','temp'], 
            ['feelslike', 'dew'], ['snowdepth','winddir'],['sealevelpressure','visibility'],
             ['solarradiation','dist']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}
    

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 17:18:31.853095
run_multiple_imputation iteration 0 stopped 2023-03-23 17:21:38.747051
run_multiple_imputation iteration 1 started 2023-03-23 17:21:38.898202
run_multiple_imputation iteration 1 stopped 2023-03-23 17:24:49.048782
run_multiple_imputation iteration 2 started 2023-03-23 17:24:49.196254
run_multiple_imputation iteration 2 stopped 2023-03-23 17:27:54.236480
run_multiple_imputation iteration 3 started 2023-03-23 17:27:54.387384
run_multiple_imputation iteration 3 stopped 2023-03-23 17:30:56.774101
run_multiple_imputation iteration 4 started 2023-03-23 17:30:56.922955
run_multiple_imputation iteration 4 stopped 2023-03-23 17:34:02.209221
run_multiple_imputation iteration 5 started 2023-

number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 19:47:49.779318
run_multiple_imputation iteration 0 stopped 2023-03-23 19:50:46.763363
run_multiple_imputation iteration 1 started 2023-03-23 19:50:46.907769
run_multiple_imputation iteration 1 stopped 2023-03-23 19:53:47.915345
run_multiple_imputation iteration 2 started 2023-03-23 19:53:48.055171
run_multiple_imputation iteration 2 stopped 2023-03-23 19:56:53.792158
run_multiple_imputation iteration 3 started 2023-03-23 19:56:53.931734
run_multiple_imputation iteration 3 stopped 2023-03-23 19:59:54.305259
run_multiple_imputation iteration 4 started 2023-03-23 19:59:54.450008
run_multiple_imputation iteration 4 stopped 2023-03-23 20:02:55.327752
run_multiple_imputation iteration 5 started 2023-03-23 20:02:55.468330
run_multiple_imputation iteration

In [14]:
results

{('dist', 'birthyear'): {'rmse': 3.104780141742423, 'r2': 0.08847211392759047},
 ('start_lat', 'start_lon'): {'rmse': 2.238689016167072,
  'r2': 0.5260900878311809},
 ('end_lat', 'end_lon'): {'rmse': 2.2389429349585415, 'r2': 0.525982577180393},
 ('hour', 'temp'): {'rmse': 2.241956432137498, 'r2': 0.5247057143780387},
 ('feelslike', 'dew'): {'rmse': 2.2361590730388405, 'r2': 0.5271606140557835},
 ('snowdepth', 'winddir'): {'rmse': 2.2363388498783827,
  'r2': 0.5270845828033845},
 ('sealevelpressure', 'visibility'): {'rmse': 2.2360037178133485,
  'r2': 0.5272263119916705},
 ('solarradiation', 'dist'): {'rmse': 3.106421531315147,
  'r2': 0.08750807281270201}}

### Multivariate simulation with 3 variables

In [15]:
results ={}
col_list = [ ['dist','birthyear','start_lat'],
            ['start_lat','start_lon','end_lat'],
            ['end_lat','end_lon','hour'], 
            ['hour','temp','feelslike'],
            ['feelslike', 'dew','snowdepth'],
            ['snowdepth','winddir','sealevelpressure'],
            ['sealevelpressure','visibility','solarradiation'],
            ['solarradiation','dist','birthyear']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 20:10:48.313899
run_multiple_imputation iteration 0 stopped 2023-03-23 20:13:52.468292
run_multiple_imputation iteration 1 started 2023-03-23 20:13:52.614008
run_multiple_imputation iteration 1 stopped 2023-03-23 20:16:56.932486
run_multiple_imputation iteration 2 started 2023-03-23 20:16:57.080546
run_multiple_imputation iteration 2 stopped 2023-03-23 20:19:59.465757
run_multiple_imputation iteration 3 started 2023-03-23 20:19:59.614057
run_multiple_imputation iteration 3 stopped 2023-03-23 20:23:03.717447
run_multiple_imputation iteration 4 started 2023-03-23 20:23:03.861165
run_multiple_imputation iteration 4 stopped 2023-03-23 20:26:07.772150
run_multiple_imputation iteration 5 started 2023-

run_multiple_imputation iteration 5 stopped 2023-03-23 22:49:35.380893
results are saving for the columns ['sealevelpressure', 'visibility', 'solarradiation']  =  2.23736944773578 0.5266486042221508
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 22:54:23.326855
run_multiple_imputation iteration 0 stopped 2023-03-23 22:57:23.589530
run_multiple_imputation iteration 1 started 2023-03-23 22:57:23.712090
run_multiple_imputation iteration 1 stopped 2023-03-23 23:00:23.045008
run_multiple_imputation iteration 2 started 2023-03-23 23:00:23.167175
run_multiple_imputation iteration 2 stopped 2023-03-23 23:03:29.362102
run_multiple_imputation iteration 3 started 2023-03-23 23:03:29.487076
run_multiple_imputation iteration 3 stopped 2023-03-23 23:06:30

In [16]:
results

{('dist', 'birthyear', 'start_lat'): {'rmse': 3.1214951194137353,
  'r2': 0.07863104274885979},
 ('start_lat', 'start_lon', 'end_lat'): {'rmse': 2.239411645991521,
  'r2': 0.5257840902531474},
 ('end_lat', 'end_lon', 'hour'): {'rmse': 2.246449453390866,
  'r2': 0.5227987663735656},
 ('hour', 'temp', 'feelslike'): {'rmse': 2.2444440529695178,
  'r2': 0.52365037904331},
 ('feelslike', 'dew', 'snowdepth'): {'rmse': 2.2364377702455385,
  'r2': 0.527042744777505},
 ('snowdepth', 'winddir', 'sealevelpressure'): {'rmse': 2.2363683010163635,
  'r2': 0.5270721267402824},
 ('sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.23736944773578, 'r2': 0.5266486042221508},
 ('solarradiation', 'dist', 'birthyear'): {'rmse': 3.107134363167712,
  'r2': 0.08708924495289794}}

### Multivariate simulation with 4 variables

In [17]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon'],
            ['start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp'], 
            ['hour','temp','feelslike','dew'],
            ['feelslike', 'dew','snowdepth','winddir'],
            ['snowdepth','winddir','sealevelpressure','visibility'],
            ['sealevelpressure','visibility','solarradiation','dist'],
            ['solarradiation','dist','birthyear','start_lat']]
results ={}
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 23:18:58.807142
run_multiple_imputation iteration 0 stopped 2023-03-23 23:22:14.616768
run_multiple_imputation iteration 1 started 2023-03-23 23:22:14.757980
run_multiple_imputation iteration 1 stopped 2023-03-23 23:25:26.672350
run_multiple_imputation iteration 2 started 2023-03-23 23:25:26.818045
run_multiple_imputation iteration 2 stopped 2023-03-23 23:28:40.433259
run_multiple_imputation iteration 3 started 2023-03-23 23:28:40.582703
run_multiple_imputation iteration 3 stopped 2023-03-23 23:31:49.694856
run_multiple_imputation iteration 4 started 2023-03-23 23:31:49.841691
run_multiple_imputation iteration 4 stopped 2023-03-23 23:34:57.816334
run_multiple_imputation iteration 5 started 2023-

run_multiple_imputation iteration 5 stopped 2023-03-24 02:11:26.040602
results are saving for the columns ['sealevelpressure', 'visibility', 'solarradiation', 'dist']  =  3.1067226429001193 0.08733116494343396
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-24 02:17:44.192022
run_multiple_imputation iteration 0 stopped 2023-03-24 02:20:55.050280
run_multiple_imputation iteration 1 started 2023-03-24 02:20:55.194983
run_multiple_imputation iteration 1 stopped 2023-03-24 02:24:06.124298
run_multiple_imputation iteration 2 started 2023-03-24 02:24:06.268818
run_multiple_imputation iteration 2 stopped 2023-03-24 02:27:16.362766
run_multiple_imputation iteration 3 started 2023-03-24 02:27:16.505498
run_multiple_imputation iteration 3 stopped 2023-03-

In [18]:
results

{('dist', 'birthyear', 'start_lat', 'start_lon'): {'rmse': 3.1457468198241103,
  'r2': 0.06425872168051905},
 ('start_lat', 'start_lon', 'end_lat', 'end_lon'): {'rmse': 2.240507698286032,
  'r2': 0.5253197784476069},
 ('end_lat', 'end_lon', 'hour', 'temp'): {'rmse': 2.247225816951828,
  'r2': 0.5224688718665418},
 ('hour', 'temp', 'feelslike', 'dew'): {'rmse': 2.245151824764825,
  'r2': 0.5233499037567249},
 ('feelslike', 'dew', 'snowdepth', 'winddir'): {'rmse': 2.236503770911068,
  'r2': 0.5270148289958678},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.2363268435417383, 'r2': 0.5270896607158498},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 3.1067226429001193, 'r2': 0.08733116494343396},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 3.123340244557492, 'r2': 0.07754147290215097}}

### Multivaraite simulation with 5 variables 

In [19]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat'],
            ['start_lat','start_lon','end_lat','end_lon','hour'],
            ['end_lat','end_lon','hour','temp','feelslike'],
            ['hour','temp','feelslike','dew','snowdepth']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-24 02:44:38.712817
run_multiple_imputation iteration 0 stopped 2023-03-24 02:47:52.762621
run_multiple_imputation iteration 1 started 2023-03-24 02:47:52.913213
run_multiple_imputation iteration 1 stopped 2023-03-24 02:51:06.819332
run_multiple_imputation iteration 2 started 2023-03-24 02:51:06.957780
run_multiple_imputation iteration 2 stopped 2023-03-24 02:54:21.489036
run_multiple_imputation iteration 3 started 2023-03-24 02:54:21.631027
run_multiple_imputation iteration 3 stopped 2023-03-24 02:57:34.479315
run_multiple_imputation iteration 4 started 2023-03-24 02:57:34.624794
run_multiple_imputation iteration 4 stopped 2023-03-24 03:00:45.103872
run_multiple_imputation iteration 5 started 2023-

In [20]:
results ={}
col_list = [
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['solarradiation','dist','birthyear','start_lat','start_lon']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-24 04:35:24.756014
run_multiple_imputation iteration 0 stopped 2023-03-24 04:38:47.109969
run_multiple_imputation iteration 1 started 2023-03-24 04:38:47.254317
run_multiple_imputation iteration 1 stopped 2023-03-24 04:41:59.632580
run_multiple_imputation iteration 2 started 2023-03-24 04:41:59.779219
run_multiple_imputation iteration 2 stopped 2023-03-24 04:45:11.521776
run_multiple_imputation iteration 3 started 2023-03-24 04:45:11.663210
run_multiple_imputation iteration 3 stopped 2023-03-24 04:48:27.143816
run_multiple_imputation iteration 4 started 2023-03-24 04:48:27.286118
run_multiple_imputation iteration 4 stopped 2023-03-24 04:51:44.186763
run_multiple_imputation iteration 5 started 2023-

In [21]:
results

{('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.2365234701825076, 'r2': 0.5270064967905513},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2377650851061714, 'r2': 0.5264811825566864},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 3.1071258510875905, 'r2': 0.08709424683433498},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 3.146924302510196, 'r2': 0.06355807701478866}}

### Multivaraite simulation with 6 variables 

In [22]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew'],
            ['hour','temp','feelslike','dew','snowdepth','winddir'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-24 06:27:23.821602
run_multiple_imputation iteration 0 stopped 2023-03-24 06:30:45.927991
run_multiple_imputation iteration 1 started 2023-03-24 06:30:46.075631
run_multiple_imputation iteration 1 stopped 2023-03-24 06:34:02.953943
run_multiple_imputation iteration 2 started 2023-03-24 06:34:03.099011
run_multiple_imputation iteration 2 stopped 2023-03-24 06:37:21.032915
run_multiple_imputation iteration 3 started 2023-03-24 06:37:21.188647
run_multiple_imputation iteration 3 stopped 2023-03-24 06:40:38.929371
run_multiple_imputation iteration 4 started 2023-03-24 06:40:39.076981
run_multiple_imputation iteration 4 stopped 2023-03-24 06:43:53.668470
run_multiple_imputation iteration 5 started 2023-

run_multiple_imputation iteration 3 stopped 2023-03-24 09:38:39.414233
run_multiple_imputation iteration 4 started 2023-03-24 09:38:39.557477
run_multiple_imputation iteration 4 stopped 2023-03-24 09:41:57.016723
run_multiple_imputation iteration 5 started 2023-03-24 09:41:57.175014
run_multiple_imputation iteration 5 stopped 2023-03-24 09:45:18.907022
results are saving for the columns ['solarradiation', 'dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat']  =  3.1492405771975407 0.06217904480786862


In [23]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lon'): {'rmse': 3.15007472453001, 'r2': 0.06168217298668899},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew'): {'rmse': 2.2517308344519336, 'r2': 0.520552337785879},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir'): {'rmse': 2.2454157901732783, 'r2': 0.5232378164294309},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.236526745693729, 'r2': 0.5270051113397098},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 3.1070574464221754, 'r2': 0.08713444239009649},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 3.1230120047430487, 'r2': 0.07773534976582674},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 3.1492405771975407, 'r2': 0.06217904480786862}}

### Multivaraite simulation with 7 variables 

In [24]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew','snowdepth'],
            ['hour','temp','feelslike','dew','snowdepth','winddir','sealevelpressure'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat','start_lon'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat','end_lat']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-24 09:56:24.596598
run_multiple_imputation iteration 0 stopped 2023-03-24 09:59:45.607548
run_multiple_imputation iteration 1 started 2023-03-24 09:59:45.752405
run_multiple_imputation iteration 1 stopped 2023-03-24 10:03:35.951157
run_multiple_imputation iteration 2 started 2023-03-24 10:03:36.093604
run_multiple_imputation iteration 2 stopped 2023-03-24 10:10:34.250776
run_multiple_imputation iteration 3 started 2023-03-24 10:10:34.400391
run_multiple_imputation iteration 3 stopped 2023-03-24 10:17:33.277373
run_multiple_imputation iteration 4 started 2023-03-24 10:17:33.420672
run_multiple_imputation iteration 4 stopped 2023-03-24 10:24:22.455665
run_multiple_imputation iteration 5 started 2023-

run_multiple_imputation iteration 3 started 2023-03-24 14:58:09.790636
run_multiple_imputation iteration 3 stopped 2023-03-24 15:05:09.538366
run_multiple_imputation iteration 4 started 2023-03-24 15:05:09.757601
run_multiple_imputation iteration 4 stopped 2023-03-24 15:11:59.566551
run_multiple_imputation iteration 5 started 2023-03-24 15:11:59.925278
run_multiple_imputation iteration 5 stopped 2023-03-24 15:19:13.078655
results are saving for the columns ['solarradiation', 'dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat', 'end_lat']  =  3.1493794916871463 0.062096307532372164


In [25]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lat',
  'end_lon'): {'rmse': 3.1489876409096627, 'r2': 0.06232968393459026},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth'): {'rmse': 2.251880483164914, 'r2': 0.5204886080463531},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.2454494642681455, 'r2': 0.5232235164895692},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2381236778722884, 'r2': 0.5263294114627501},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 3.10738721754858, 'r2': 0.08694065602867918},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 3.145978860778087, 'r2': 0.06412066968875552},
 ('solarradiation',
  'dist',
  'birthyear',
  'star

### with all the variables

In [26]:
results ={}
cols = ['start_lat','start_lon','end_lat',
                     'end_lon','hour','temp', 'feelslike', 'dew','snowdepth',
                     'winddir','sealevelpressure','visibility','solarradiation','dist','birthyear']
multiple_predictions = []
X_train, X_test, y_train, y_test = split_dataset(df)
X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
X_train_encoded = pd.DataFrame(X_train_encoded)
X_test_encoded = pd.DataFrame(X_test_encoded)
for i in range(6):
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=i
    )
    multiple_predictions.append(y_predict)
    del X_train_data
    del X_test_data
predictions_average = np.mean(multiple_predictions, axis=0)
rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
r2 = r2_score(y_test, predictions_average)
print("results are saving for the columns", cols, " = ", rmse, r2)
results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-24 15:50:40.139583
run_multiple_imputation iteration 0 stopped 2023-03-24 15:54:33.265459
run_multiple_imputation iteration 1 started 2023-03-24 15:54:33.411014
run_multiple_imputation iteration 1 stopped 2023-03-24 15:58:29.915389
run_multiple_imputation iteration 2 started 2023-03-24 15:58:30.062511
run_multiple_imputation iteration 2 stopped 2023-03-24 16:02:25.791362
run_multiple_imputation iteration 3 started 2023-03-24 16:02:25.938569
run_multiple_imputation iteration 3 stopped 2023-03-24 16:06:23.157670
run_multiple_imputation iteration 4 started 2023-03-24 16:06:23.304870
run_multiple_imputation iteration 4 stopped 2023-03-24 16:10:19.476320
run_multiple_imputation iteration 5 started 2023-

In [27]:
results

{('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 3.1539811311630106, 'r2': 0.0593535147380273}}

### Test Ignore

In [28]:
# # Execute the simulation
# print("Executing MCAR Missingness...MI", datetime.datetime.now())

# X = df.drop('tripduration', axis=1)
# y = df['tripduration']
# cols = ['dist','birthyear']
# rmse, r2 = perform_simulation(X, y, cols, n_simulation=N_SIM)
# print(rmse, r2)
# print("stooped MCAR Missingness...MI", datetime.datetime.now())

In [29]:
# # Plot results
# n_situations = 3
# n = np.arange(n_situations)
# n_labels = ["Full Data", "Single Imputation", "Multiple Imputations"]
# colors = ["r", "orange", "green"]

# plt.figure(figsize=(12, 6))
# ax1 = plt.subplot(111)
# for j in n:
#     ax1.barh(
#         j,
#         mse_means[j],
#         xerr=mse_std[j],
#         color=colors[j],
#         alpha=0.6,
#         align="center",
#     )

# ax1.set_title("MCAR Missingness")
# ax1.set_yticks(n)
# ax1.set_xlabel("Mean Squared Error")
# ax1.invert_yaxis()
# ax1.set_yticklabels(n_labels)
# plt.show()