In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression, BayesianRidge
import time
import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from itertools import combinations
from sklearn.pipeline import Pipeline

In [2]:
#!pip install category_encoders

In [3]:
tripdata = pd.read_csv("../../dataset/final_bike_sharing.csv")

In [4]:
tripdata.head()

Unnamed: 0,index,tripduration,start_station_id,start_station_name,start_lat,start_lon,end_station_id,end_station_name,end_lat,end_lon,...,conditions,date,dist,birthyear,years_old,holiday,day,month,seasons,gender
0,259617,1.02,others,Manila & 1st,40.721651,-74.042884,others,Manila & 1st,40.721651,-74.042884,...,Clear,2018-09-05,1.783012e-12,1994,24,working_day,weekday,September,autumn,male
1,283363,1.02,others,Heights Elevator,40.748716,-74.040443,others,Heights Elevator,40.748716,-74.040443,...,cloudy_rain,2018-10-06,1.782506e-12,1994,24,working_day,weekend,October,autumn,male
2,169168,1.02,others,Van Vorst Park,40.718489,-74.047727,es_3185,City Hall,40.717732,-74.043845,...,Clear,2018-07-09,0.3384331,1963,55,working_day,weekday,July,summer,female
3,144929,1.02,ss_3183,Exchange Place,40.716247,-74.033459,es_3183,Exchange Place,40.716247,-74.033459,...,Clear,2018-07-16,1.783112e-12,1985,33,working_day,weekday,July,summer,male
4,326659,1.02,others,Jersey & 3rd,40.723332,-74.045953,es_3211,Newark Ave,40.721525,-74.046305,...,cloudy_rain,2018-11-02,0.159532,1987,31,working_day,weekday,November,autumn,male


In [5]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [6]:
df = tripdata[selected_features]
df.shape

(315433, 22)

In [7]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [8]:
def getSampleSize(df, perc):
    return round(perc/100 * df.shape[0])

def getAnIndex(index):
    li = []
    for i in index:
        li.append(i)
    return li  


def induceMissingValues(df , cols_list, perc, name ):
    random.seed(100)
    index = getAnIndex(df.index)
    sample_size = getSampleSize(df, perc)
    print(f"number of missing values produced {sample_size} for the {name}")
    for col in cols_list:
        idx = random.sample(index, sample_size)
        for i in idx:
             df.loc[i,col] = np.NaN
    return df

In [9]:
def ampute_each_variables(col_list, X_train, X_test):
    X_train = induceMissingValues(X_train, col_list, perc=30, name="train")
    print('Train set after Amputation', X_train.shape)
    X_test = induceMissingValues(X_test, col_list, perc=30, name="test")
    print('Test set after Amputation', X_test.shape)
    return X_train , X_test 

In [10]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [11]:
def model_evaluation(X_train,y_train):
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    return model_lr

In [12]:
def run_multiple_imputation(X_train_amputed, X_test_amputed, y_train, y_test, random_state=0):
    print(f"run_multiple_imputation iteration {random_state} started" ,  datetime.datetime.now())
    impute_estimator = BayesianRidge()
    imputer = IterativeImputer(
        estimator=impute_estimator, max_iter=5, sample_posterior=True, random_state=random_state, tol=1e-1
    )
    ## Fit and tranform the missing values
    X_train_imputed = imputer.fit_transform(X_train_amputed)
    X_test_imputed = imputer.fit_transform(X_test_amputed)

    predictors = [var for var in X_train_amputed.columns]
    
    ## adjust the columns names to the imputed cols
    X_train_imputed = pd.DataFrame(X_train_imputed)
    X_test_imputed = pd.DataFrame(X_test_imputed)
    
    ## Run Linear regression on the 
    model_reg = model_evaluation(X_train_imputed,y_train)
    y_predict = model_reg.predict(X_test_imputed)
    
    print(f"run_multiple_imputation iteration {random_state} stopped" ,  datetime.datetime.now())
    return y_predict

### Multivariate simulation with 2 variables

In [13]:
results ={}
col_list = [ ['dist','birthyear'],['start_lat','start_lon'],['end_lat','end_lon'], ['hour','temp'], 
            ['feelslike', 'dew'], ['snowdepth','winddir'],['sealevelpressure','visibility'],
             ['solarradiation','dist']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}
    

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 03:20:29.402638
run_multiple_imputation iteration 0 stopped 2023-03-23 03:22:38.264659
run_multiple_imputation iteration 1 started 2023-03-23 03:22:38.573527
run_multiple_imputation iteration 1 stopped 2023-03-23 03:24:40.562588
run_multiple_imputation iteration 2 started 2023-03-23 03:24:40.683834
run_multiple_imputation iteration 2 stopped 2023-03-23 03:26:47.542877
run_multiple_imputation iteration 3 started 2023-03-23 03:26:47.709634
run_multiple_imputation iteration 3 stopped 2023-03-23 03:28:54.184933
run_multiple_imputation iteration 4 started 2023-03-23 03:28:54.336624
run_multiple_imputation iteration 4 stopped 2023-03-23 03:31:15.905609
run_multiple_imputation iteration 5 started 2023-0

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 04:53:14.876454
run_multiple_imputation iteration 0 stopped 2023-03-23 04:55:21.262816
run_multiple_imputation iteration 1 started 2023-03-23 04:55:21.420520
run_multiple_imputation iteration 1 stopped 2023-03-23 04:57:27.080786
run_multiple_imputation iteration 2 started 2023-03-23 04:57:27.202399
run_multiple_imputation iteration 2 stopped 2023-03-23 04:59:31.085552
run_multiple_imputation iteration 3 started 2023-03-23 04:59:31.193599
run_multiple_imputation iteration 3 stopped 2023-03-23 05:01:54.229638
run_multiple_imputation iteration 4 started 2023-03-23 05:01:54.349212
run_multiple_imputation iteration 4 stopped 2023-03-23 05:03:59.948466
run_multiple_imputation iteration 5 started 2023-0

In [14]:
results

{('dist', 'birthyear'): {'rmse': 3.1024979251248874,
  'r2': 0.08981168672448547},
 ('start_lat', 'start_lon'): {'rmse': 2.238659439217259,
  'r2': 0.5261026100859414},
 ('end_lat', 'end_lon'): {'rmse': 2.2388820308009376,
  'r2': 0.5260083654563843},
 ('hour', 'temp'): {'rmse': 2.2418961498022365, 'r2': 0.5247312737188569},
 ('feelslike', 'dew'): {'rmse': 2.236134752033177, 'r2': 0.5271708994303654},
 ('snowdepth', 'winddir'): {'rmse': 2.2363081651187424,
  'r2': 0.5270975604398933},
 ('sealevelpressure', 'visibility'): {'rmse': 2.2359733441197664,
  'r2': 0.5272391561434822},
 ('solarradiation', 'dist'): {'rmse': 3.1049432711024396,
  'r2': 0.0883763255881439}}

### Multivariate simulation with 3 variables

In [15]:
results ={}
col_list = [ ['dist','birthyear','start_lat'],
            ['start_lat','start_lon','end_lat'],
            ['end_lat','end_lon','hour'], 
            ['hour','temp','feelslike'],
            ['feelslike', 'dew','snowdepth'],
            ['snowdepth','winddir','sealevelpressure'],
            ['sealevelpressure','visibility','solarradiation'],
            ['solarradiation','dist','birthyear']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 05:06:28.790821
run_multiple_imputation iteration 0 stopped 2023-03-23 05:08:33.181929
run_multiple_imputation iteration 1 started 2023-03-23 05:08:33.316099
run_multiple_imputation iteration 1 stopped 2023-03-23 05:10:37.322261
run_multiple_imputation iteration 2 started 2023-03-23 05:10:37.441789
run_multiple_imputation iteration 2 stopped 2023-03-23 05:12:43.478733
run_multiple_imputation iteration 3 started 2023-03-23 05:12:43.614839
run_multiple_imputation iteration 3 stopped 2023-03-23 05:14:47.541319
run_multiple_imputation iteration 4 started 2023-03-23 05:14:47.710862
run_multiple_imputation iteration 4 stopped 2023-03-23 05:17:06.646042
run_multiple_imputation iteration 5 started 2023-0

run_multiple_imputation iteration 5 stopped 2023-03-23 06:37:35.117510
results are saving for the columns ['sealevelpressure', 'visibility', 'solarradiation']  =  2.2373429834051297 0.5266598020632387
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 06:38:00.287676
run_multiple_imputation iteration 0 stopped 2023-03-23 06:40:05.847896
run_multiple_imputation iteration 1 started 2023-03-23 06:40:05.963410
run_multiple_imputation iteration 1 stopped 2023-03-23 06:42:15.246416
run_multiple_imputation iteration 2 started 2023-03-23 06:42:15.393470
run_multiple_imputation iteration 2 stopped 2023-03-23 06:44:19.448118
run_multiple_imputation iteration 3 started 2023-03-23 06:44:19.566650
run_multiple_imputation iteration 3 stopped 2023-03-23 06:46:4

In [16]:
results

{('dist', 'birthyear', 'start_lat'): {'rmse': 3.1193234571704345,
  'r2': 0.07991261204438238},
 ('start_lat', 'start_lon', 'end_lat'): {'rmse': 2.239387292154596,
  'r2': 0.5257944044927432},
 ('end_lat', 'end_lon', 'hour'): {'rmse': 2.2463454419231126,
  'r2': 0.5228429545493578},
 ('hour', 'temp', 'feelslike'): {'rmse': 2.2443819440718378,
  'r2': 0.5236767420444095},
 ('feelslike', 'dew', 'snowdepth'): {'rmse': 2.236423099535157,
  'r2': 0.5270489498204072},
 ('snowdepth', 'winddir', 'sealevelpressure'): {'rmse': 2.23636632030017,
  'r2': 0.5270729644694541},
 ('sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2373429834051297, 'r2': 0.5266598020632387},
 ('solarradiation', 'dist', 'birthyear'): {'rmse': 3.105519645555142,
  'r2': 0.08803784251824553}}

### Multivariate simulation with 4 variables

In [17]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon'],
            ['start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp'], 
            ['hour','temp','feelslike','dew'],
            ['feelslike', 'dew','snowdepth','winddir'],
            ['snowdepth','winddir','sealevelpressure','visibility'],
            ['sealevelpressure','visibility','solarradiation','dist'],
            ['solarradiation','dist','birthyear','start_lat']]
results ={}
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 06:51:20.252896
run_multiple_imputation iteration 0 stopped 2023-03-23 06:53:22.678045
run_multiple_imputation iteration 1 started 2023-03-23 06:53:22.816627
run_multiple_imputation iteration 1 stopped 2023-03-23 06:55:27.172851
run_multiple_imputation iteration 2 started 2023-03-23 06:55:27.300474
run_multiple_imputation iteration 2 stopped 2023-03-23 06:57:33.975478
run_multiple_imputation iteration 3 started 2023-03-23 06:57:34.111596
run_multiple_imputation iteration 3 stopped 2023-03-23 06:59:37.874674
run_multiple_imputation iteration 4 started 2023-03-23 06:59:37.987220
run_multiple_imputation iteration 4 stopped 2023-03-23 07:01:55.795682
run_multiple_imputation iteration 5 started 2023-0

run_multiple_imputation iteration 5 stopped 2023-03-23 08:23:55.382864
results are saving for the columns ['sealevelpressure', 'visibility', 'solarradiation', 'dist']  =  3.1043750942649466 0.0887099330133051
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 08:24:27.436074
run_multiple_imputation iteration 0 stopped 2023-03-23 08:26:31.696177
run_multiple_imputation iteration 1 started 2023-03-23 08:26:31.833211
run_multiple_imputation iteration 1 stopped 2023-03-23 08:28:35.766708
run_multiple_imputation iteration 2 started 2023-03-23 08:28:35.911748
run_multiple_imputation iteration 2 stopped 2023-03-23 08:30:57.586171
run_multiple_imputation iteration 3 started 2023-03-23 08:30:57.724749
run_multiple_imputation iteration 3 stopped 2023-03-23

In [18]:
results

{('dist', 'birthyear', 'start_lat', 'start_lon'): {'rmse': 3.1427257900856023,
  'r2': 0.06605514394722867},
 ('start_lat', 'start_lon', 'end_lat', 'end_lon'): {'rmse': 2.2404121399613137,
  'r2': 0.525360268091523},
 ('end_lat', 'end_lon', 'hour', 'temp'): {'rmse': 2.2471885536093072,
  'r2': 0.5224847085112838},
 ('hour', 'temp', 'feelslike', 'dew'): {'rmse': 2.2451624930616707,
  'r2': 0.5233453739456508},
 ('feelslike', 'dew', 'snowdepth', 'winddir'): {'rmse': 2.2365238613383864,
  'r2': 0.5270063313425173},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.2363479197668665, 'r2': 0.5270807468025972},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 3.1043750942649466, 'r2': 0.0887099330133051},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 3.1213601803443263, 'r2': 0.07871070071891362}}

### Multivaraite simulation with 5 variables 

In [19]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat'],
            ['start_lat','start_lon','end_lat','end_lon','hour'],
            ['end_lat','end_lon','hour','temp','feelslike'],
            ['hour','temp','feelslike','dew','snowdepth']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 08:37:59.151714
run_multiple_imputation iteration 0 stopped 2023-03-23 08:40:02.206898
run_multiple_imputation iteration 1 started 2023-03-23 08:40:02.348018
run_multiple_imputation iteration 1 stopped 2023-03-23 08:42:09.943263
run_multiple_imputation iteration 2 started 2023-03-23 08:42:10.058806
run_multiple_imputation iteration 2 stopped 2023-03-23 08:44:12.938135
run_multiple_imputation iteration 3 started 2023-03-23 08:44:13.056229
run_multiple_imputation iteration 3 stopped 2023-03-23 08:46:34.547072
run_multiple_imputation iteration 4 started 2023-03-23 08:46:34.665629
run_multiple_imputation iteration 4 stopped 2023-03-23 08:48:38.133246
run_multiple_imputation iteration 5 started 2023-0

In [20]:
results ={}
col_list = [
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['solarradiation','dist','birthyear','start_lat','start_lon']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 09:34:34.508574
run_multiple_imputation iteration 0 stopped 2023-03-23 09:36:40.832761
run_multiple_imputation iteration 1 started 2023-03-23 09:36:40.965822
run_multiple_imputation iteration 1 stopped 2023-03-23 09:38:48.425579
run_multiple_imputation iteration 2 started 2023-03-23 09:38:48.545597
run_multiple_imputation iteration 2 stopped 2023-03-23 09:40:56.000846
run_multiple_imputation iteration 3 started 2023-03-23 09:40:56.136325
run_multiple_imputation iteration 3 stopped 2023-03-23 09:43:03.570130
run_multiple_imputation iteration 4 started 2023-03-23 09:43:03.685763
run_multiple_imputation iteration 4 stopped 2023-03-23 09:45:15.763895
run_multiple_imputation iteration 5 started 2023-0

In [21]:
results

{('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.236620541547537, 'r2': 0.5269654374213837},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2378689431816867, 'r2': 0.5264372280820568},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 3.1051577912356545, 'r2': 0.0882503533106076},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 3.1441643559161903, 'r2': 0.06519993178638384}}

### Multivaraite simulation with 6 variables 

In [22]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew'],
            ['hour','temp','feelslike','dew','snowdepth','winddir'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 10:28:11.140781
run_multiple_imputation iteration 0 stopped 2023-03-23 10:30:21.674198
run_multiple_imputation iteration 1 started 2023-03-23 10:30:21.817825
run_multiple_imputation iteration 1 stopped 2023-03-23 10:32:35.551784
run_multiple_imputation iteration 2 started 2023-03-23 10:32:35.660823
run_multiple_imputation iteration 2 stopped 2023-03-23 10:34:39.801427
run_multiple_imputation iteration 3 started 2023-03-23 10:34:39.937574
run_multiple_imputation iteration 3 stopped 2023-03-23 10:36:44.905785
run_multiple_imputation iteration 4 started 2023-03-23 10:36:45.027314
run_multiple_imputation iteration 4 stopped 2023-03-23 10:38:50.331209
run_multiple_imputation iteration 5 started 2023-0

run_multiple_imputation iteration 3 stopped 2023-03-23 12:05:23.072035
run_multiple_imputation iteration 4 started 2023-03-23 12:05:23.775917
run_multiple_imputation iteration 4 stopped 2023-03-23 12:08:20.619170
run_multiple_imputation iteration 5 started 2023-03-23 12:08:20.852379
run_multiple_imputation iteration 5 stopped 2023-03-23 12:10:57.234673
results are saving for the columns ['solarradiation', 'dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat']  =  3.1463922172852814 0.0638747193865612


In [23]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lon'): {'rmse': 3.1334594566465443, 'r2': 0.07155450119206519},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew'): {'rmse': 2.2515699280659627, 'r2': 0.5206208570182975},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir'): {'rmse': 2.2454535055850617, 'r2': 0.5232218003016675},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.236529376844049, 'r2': 0.5270039984342755},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 3.1042751702304954, 'r2': 0.08876859751856736},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 3.121058259408548, 'r2': 0.07888891986609148},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 3.1463922172852814, 'r2': 0.0638747193865612}}

### Multivaraite simulation with 7 variables 

In [24]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew','snowdepth'],
            ['hour','temp','feelslike','dew','snowdepth','winddir','sealevelpressure'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat','start_lon'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat','end_lat']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 12:12:18.945514
run_multiple_imputation iteration 0 stopped 2023-03-23 12:15:58.840516
run_multiple_imputation iteration 1 started 2023-03-23 12:15:59.511702
run_multiple_imputation iteration 1 stopped 2023-03-23 12:18:59.609736
run_multiple_imputation iteration 2 started 2023-03-23 12:19:00.175690
run_multiple_imputation iteration 2 stopped 2023-03-23 12:21:35.699133
run_multiple_imputation iteration 3 started 2023-03-23 12:21:35.936519
run_multiple_imputation iteration 3 stopped 2023-03-23 12:25:12.033566
run_multiple_imputation iteration 4 started 2023-03-23 12:25:12.250949
run_multiple_imputation iteration 4 stopped 2023-03-23 12:27:59.552067
run_multiple_imputation iteration 5 started 2023-0

run_multiple_imputation iteration 3 started 2023-03-23 14:24:59.497408
run_multiple_imputation iteration 3 stopped 2023-03-23 14:28:32.172153
run_multiple_imputation iteration 4 started 2023-03-23 14:28:32.787208
run_multiple_imputation iteration 4 stopped 2023-03-23 14:32:03.334068
run_multiple_imputation iteration 5 started 2023-03-23 14:32:04.050377
run_multiple_imputation iteration 5 stopped 2023-03-23 14:35:20.297658
results are saving for the columns ['solarradiation', 'dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat', 'end_lat']  =  3.1456525383561296 0.06431481121623328


In [25]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lat',
  'end_lon'): {'rmse': 3.1320271012903405, 'r2': 0.0724031223089715},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth'): {'rmse': 2.251972860325396, 'r2': 0.5204492659856204},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.24551175967209, 'r2': 0.5231970617453401},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2381668397026675, 'r2': 0.5263111419748445},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 3.1051697340974385, 'r2': 0.08824333986930488},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 3.144418125536409, 'r2': 0.06504902782287525},
 ('solarradiation',
  'dist',
  'birthyear',
  'start

### with all the variables

In [26]:
results ={}
cols = ['start_lat','start_lon','end_lat',
                     'end_lon','hour','temp', 'feelslike', 'dew','snowdepth',
                     'winddir','sealevelpressure','visibility','solarradiation','dist','birthyear']
multiple_predictions = []
X_train, X_test, y_train, y_test = split_dataset(df)
X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
X_train_encoded = pd.DataFrame(X_train_encoded)
X_test_encoded = pd.DataFrame(X_test_encoded)
for i in range(6):
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=i
    )
    multiple_predictions.append(y_predict)
    del X_train_data
    del X_test_data
predictions_average = np.mean(multiple_predictions, axis=0)
rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
r2 = r2_score(y_test, predictions_average)
print("results are saving for the columns", cols, " = ", rmse, r2)
results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 14:37:18.977402
run_multiple_imputation iteration 0 stopped 2023-03-23 14:39:23.702743
run_multiple_imputation iteration 1 started 2023-03-23 14:39:23.849906
run_multiple_imputation iteration 1 stopped 2023-03-23 14:41:32.010120
run_multiple_imputation iteration 2 started 2023-03-23 14:41:32.128718
run_multiple_imputation iteration 2 stopped 2023-03-23 14:43:46.022888
run_multiple_imputation iteration 3 started 2023-03-23 14:43:46.175547
run_multiple_imputation iteration 3 stopped 2023-03-23 14:46:30.733143
run_multiple_imputation iteration 4 started 2023-03-23 14:46:30.886716
run_multiple_imputation iteration 4 stopped 2023-03-23 14:49:15.019550
run_multiple_imputation iteration 5 started 2023-0

In [27]:
results

{('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 3.1391584332611084, 'r2': 0.0681742111760607}}

### Test Ignore

In [28]:
# # Execute the simulation
# print("Executing MCAR Missingness...MI", datetime.datetime.now())

# X = df.drop('tripduration', axis=1)
# y = df['tripduration']
# cols = ['dist','birthyear']
# rmse, r2 = perform_simulation(X, y, cols, n_simulation=N_SIM)
# print(rmse, r2)
# print("stooped MCAR Missingness...MI", datetime.datetime.now())

In [29]:
# # Plot results
# n_situations = 3
# n = np.arange(n_situations)
# n_labels = ["Full Data", "Single Imputation", "Multiple Imputations"]
# colors = ["r", "orange", "green"]

# plt.figure(figsize=(12, 6))
# ax1 = plt.subplot(111)
# for j in n:
#     ax1.barh(
#         j,
#         mse_means[j],
#         xerr=mse_std[j],
#         color=colors[j],
#         alpha=0.6,
#         align="center",
#     )

# ax1.set_title("MCAR Missingness")
# ax1.set_yticks(n)
# ax1.set_xlabel("Mean Squared Error")
# ax1.invert_yaxis()
# ax1.set_yticklabels(n_labels)
# plt.show()