In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression, BayesianRidge
import time
import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from itertools import combinations
from sklearn.pipeline import Pipeline

In [2]:
#!pip install category_encoders

In [3]:
tripdata = pd.read_csv("/home/jovyan/data/final_bike_sharing.csv")

In [4]:
tripdata.head()

Unnamed: 0,index,tripduration,start_station_id,start_station_name,start_lat,start_lon,end_station_id,end_station_name,end_lat,end_lon,...,conditions,date,dist,birthyear,years_old,holiday,day,month,seasons,gender
0,259617,1.02,ss_3273,Manila & 1st,40.721651,-74.042884,es_3273,Manila & 1st,40.721651,-74.042884,...,Clear,2018-09-05,1.783012e-12,1994,24,working_day,weekday,September,autumn,male
1,283363,1.02,ss_3198,Heights Elevator,40.748716,-74.040443,es_3198,Heights Elevator,40.748716,-74.040443,...,cloudy_rain,2018-10-06,1.782506e-12,1994,24,working_day,weekend,October,autumn,male
2,169168,1.02,ss_3213,Van Vorst Park,40.718489,-74.047727,es_3185,City Hall,40.717733,-74.043845,...,Clear,2018-07-09,0.3384331,1963,55,working_day,weekday,July,summer,female
3,144929,1.02,ss_3183,Exchange Place,40.716247,-74.033459,es_3183,Exchange Place,40.716247,-74.033459,...,Clear,2018-07-16,1.783112e-12,1985,33,working_day,weekday,July,summer,male
4,326659,1.02,ss_3272,Jersey & 3rd,40.723332,-74.045953,es_3211,Newark Ave,40.721525,-74.046305,...,cloudy_rain,2018-11-02,0.159532,1987,31,working_day,weekday,November,autumn,male


In [5]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [6]:
df = tripdata[selected_features]
df.shape

(315433, 22)

In [7]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [8]:
def getSampleSize(df, perc):
    return round(perc/100 * df.shape[0])

def getAnIndex(index):
    li = []
    for i in index:
        li.append(i)
    return li  


def induceMissingValues(df , cols_list, perc, name ):
    random.seed(100)
    index = getAnIndex(df.index)
    sample_size = getSampleSize(df, perc)
    print(f"number of missing values produced {sample_size} for the {name}")
    for col in cols_list:
        idx = random.sample(index, sample_size)
        for i in idx:
             df.loc[i,col] = np.NaN
    return df

In [9]:
def ampute_each_variables(col_list, X_train, X_test):
    X_train = induceMissingValues(X_train, col_list, perc=20, name="train")
    print('Train set after Amputation', X_train.shape)
    X_test = induceMissingValues(X_test, col_list, perc=20, name="test")
    print('Test set after Amputation', X_test.shape)
    return X_train , X_test 

In [10]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [11]:
def model_evaluation(X_train,y_train):
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    return model_lr

In [12]:
def run_multiple_imputation(X_train_amputed, X_test_amputed, y_train, y_test, random_state=0):
    print(f"run_multiple_imputation iteration {random_state} started" ,  datetime.datetime.now())
    impute_estimator = BayesianRidge()
    imputer = IterativeImputer(
        estimator=impute_estimator, max_iter=5, sample_posterior=True, random_state=random_state, tol=1e-1
    )
    ## Fit and tranform the missing values
    X_train_imputed = imputer.fit_transform(X_train_amputed)
    X_test_imputed = imputer.fit_transform(X_test_amputed)

    predictors = [var for var in X_train_amputed.columns]
    
    ## adjust the columns names to the imputed cols
    X_train_imputed = pd.DataFrame(X_train_imputed)
    X_test_imputed = pd.DataFrame(X_test_imputed)
    
    ## Run Linear regression on the 
    model_reg = model_evaluation(X_train_imputed,y_train)
    y_predict = model_reg.predict(X_test_imputed)
    
    print(f"run_multiple_imputation iteration {random_state} stopped" ,  datetime.datetime.now())
    return y_predict

### Multivariate simulation with 2 variables

In [13]:
results ={}
col_list = [ ['dist','birthyear'],['start_lat','start_lon'],['end_lat','end_lon'], ['hour','temp'], 
            ['feelslike', 'dew'], ['snowdepth','winddir'],['sealevelpressure','visibility'],
             ['solarradiation','dist']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}
    

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-22 20:33:55.241156
run_multiple_imputation iteration 0 stopped 2023-03-22 20:36:10.851536
run_multiple_imputation iteration 1 started 2023-03-22 20:36:10.973124
run_multiple_imputation iteration 1 stopped 2023-03-22 20:38:21.512077
run_multiple_imputation iteration 2 started 2023-03-22 20:38:21.636617
run_multiple_imputation iteration 2 stopped 2023-03-22 20:40:33.523822
run_multiple_imputation iteration 3 started 2023-03-22 20:40:33.644347
run_multiple_imputation iteration 3 stopped 2023-03-22 20:42:47.821359
run_multiple_imputation iteration 4 started 2023-03-22 20:42:47.942842
run_multiple_imputation iteration 4 stopped 2023-03-22 20:45:06.390606
run_multiple_imputation iteration 5 started 2023-0

In [14]:
results

{('dist', 'birthyear'): {'rmse': 3.096983115701843, 'r2': 0.09304460034263395},
 ('start_lat', 'start_lon'): {'rmse': 2.238624202573561,
  'r2': 0.5261175283193869},
 ('end_lat', 'end_lon'): {'rmse': 2.2388995514852534,
  'r2': 0.5260009468517615},
 ('hour', 'temp'): {'rmse': 2.241816070535067, 'r2': 0.5247652257746525},
 ('feelslike', 'dew'): {'rmse': 2.236158064234106, 'r2': 0.5271610406824124},
 ('snowdepth', 'winddir'): {'rmse': 2.236302716020267,
  'r2': 0.5270998650317766},
 ('sealevelpressure', 'visibility'): {'rmse': 2.23599012781975,
  'r2': 0.5272320588263626},
 ('solarradiation', 'dist'): {'rmse': 3.099188137962204,
  'r2': 0.09175265349623496}}

### Multivariate simulation with 3 variables

In [15]:
results ={}
col_list = [ ['dist','birthyear','start_lat'],
            ['start_lat','start_lon','end_lat'],
            ['end_lat','end_lon','hour'], 
            ['hour','temp','feelslike'],
            ['feelslike', 'dew','snowdepth'],
            ['snowdepth','winddir','sealevelpressure'],
            ['sealevelpressure','visibility','solarradiation'],
            ['solarradiation','dist','birthyear']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-22 22:34:47.995152
run_multiple_imputation iteration 0 stopped 2023-03-22 22:36:58.608265
run_multiple_imputation iteration 1 started 2023-03-22 22:36:58.727509
run_multiple_imputation iteration 1 stopped 2023-03-22 22:39:08.698548
run_multiple_imputation iteration 2 started 2023-03-22 22:39:08.818151
run_multiple_imputation iteration 2 stopped 2023-03-22 22:41:19.019332
run_multiple_imputation iteration 3 started 2023-03-22 22:41:19.138091
run_multiple_imputation iteration 3 stopped 2023-03-22 22:43:34.376710
run_multiple_imputation iteration 4 started 2023-03-22 22:43:34.495904
run_multiple_imputation iteration 4 stopped 2023-03-22 22:45:51.050211
run_multiple_imputation iteration 5 started 2023-0

In [16]:
results

{('dist', 'birthyear', 'start_lat'): {'rmse': 3.1149938407421627,
  'r2': 0.08246499937568252},
 ('start_lat', 'start_lon', 'end_lat'): {'rmse': 2.2393280473829966,
  'r2': 0.5258194951330875},
 ('end_lat', 'end_lon', 'hour'): {'rmse': 2.2464847661738,
  'r2': 0.5227837636447044},
 ('hour', 'temp', 'feelslike'): {'rmse': 2.244297604830626,
  'r2': 0.5237125398616653},
 ('feelslike', 'dew', 'snowdepth'): {'rmse': 2.2364505751986923,
  'r2': 0.5270373288317406},
 ('snowdepth', 'winddir', 'sealevelpressure'): {'rmse': 2.236376552955188,
  'r2': 0.5270686366361912},
 ('sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2373793987758828, 'r2': 0.5266443936084751},
 ('solarradiation', 'dist', 'birthyear'): {'rmse': 3.099022432871225,
  'r2': 0.09184977388973203}}

### Multivariate simulation with 4 variables

In [17]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon'],
            ['start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp'], 
            ['hour','temp','feelslike','dew'],
            ['feelslike', 'dew','snowdepth','winddir'],
            ['snowdepth','winddir','sealevelpressure','visibility'],
            ['sealevelpressure','visibility','solarradiation','dist'],
            ['solarradiation','dist','birthyear','start_lat']]
results ={}
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 00:42:14.674443
run_multiple_imputation iteration 0 stopped 2023-03-23 00:44:25.520394
run_multiple_imputation iteration 1 started 2023-03-23 00:44:25.637734
run_multiple_imputation iteration 1 stopped 2023-03-23 00:46:40.247822
run_multiple_imputation iteration 2 started 2023-03-23 00:46:40.376665
run_multiple_imputation iteration 2 stopped 2023-03-23 00:49:10.677320
run_multiple_imputation iteration 3 started 2023-03-23 00:49:10.796458
run_multiple_imputation iteration 3 stopped 2023-03-23 00:51:33.091009
run_multiple_imputation iteration 4 started 2023-03-23 00:51:33.209636
run_multiple_imputation iteration 4 stopped 2023-03-23 00:54:00.105313
run_multiple_imputation iteration 5 started 2023-0

In [18]:
results

{('dist', 'birthyear', 'start_lat', 'start_lon'): {'rmse': 3.1377533936227695,
  'r2': 0.06900816664308129},
 ('start_lat', 'start_lon', 'end_lat', 'end_lon'): {'rmse': 2.2402332775783376,
  'r2': 0.525436050402468},
 ('end_lat', 'end_lon', 'hour', 'temp'): {'rmse': 2.2470386527500525,
  'r2': 0.5225484126138897},
 ('hour', 'temp', 'feelslike', 'dew'): {'rmse': 2.2450401147561734,
  'r2': 0.523397335080626},
 ('feelslike', 'dew', 'snowdepth', 'winddir'): {'rmse': 2.2365159329857196,
  'r2': 0.5270096848090573},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.2363076425573363, 'r2': 0.5270977814474797},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 3.1002631509276117, 'r2': 0.09112245812409414},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 3.1163109859482407, 'r2': 0.08168889361351528}}

### Multivaraite simulation with 5 variables 

In [19]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat'],
            ['start_lat','start_lon','end_lat','end_lon','hour'],
            ['end_lat','end_lon','hour','temp','feelslike'],
            ['hour','temp','feelslike','dew','snowdepth']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 03:49:41.159716
run_multiple_imputation iteration 0 stopped 2023-03-23 03:52:46.192645
run_multiple_imputation iteration 1 started 2023-03-23 03:52:46.317708
run_multiple_imputation iteration 1 stopped 2023-03-23 03:55:53.956596
run_multiple_imputation iteration 2 started 2023-03-23 03:55:54.080899
run_multiple_imputation iteration 2 stopped 2023-03-23 03:59:02.320981
run_multiple_imputation iteration 3 started 2023-03-23 03:59:02.445558
run_multiple_imputation iteration 3 stopped 2023-03-23 04:02:11.653384
run_multiple_imputation iteration 4 started 2023-03-23 04:02:11.777818
run_multiple_imputation iteration 4 stopped 2023-03-23 04:05:17.359547
run_multiple_imputation iteration 5 started 2023-0

In [20]:
results ={}
col_list = [
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['solarradiation','dist','birthyear','start_lat','start_lon']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 05:19:51.884908
run_multiple_imputation iteration 0 stopped 2023-03-23 05:23:10.460887
run_multiple_imputation iteration 1 started 2023-03-23 05:23:10.586636
run_multiple_imputation iteration 1 stopped 2023-03-23 05:26:18.586651
run_multiple_imputation iteration 2 started 2023-03-23 05:26:18.711669
run_multiple_imputation iteration 2 stopped 2023-03-23 05:29:27.983733
run_multiple_imputation iteration 3 started 2023-03-23 05:29:28.202364
run_multiple_imputation iteration 3 stopped 2023-03-23 05:32:43.428761
run_multiple_imputation iteration 4 started 2023-03-23 05:32:43.673145
run_multiple_imputation iteration 4 stopped 2023-03-23 05:35:55.841545
run_multiple_imputation iteration 5 started 2023-0

In [21]:
results

{('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.2365499399021296, 'r2': 0.526995300771669},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2377574042382107, 'r2': 0.5264844331474621},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 3.101096935392109, 'r2': 0.09063352551415038},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 3.1396471744544163, 'r2': 0.06788403333994708}}

### Multivaraite simulation with 6 variables 

In [22]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew'],
            ['hour','temp','feelslike','dew','snowdepth','winddir'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 06:51:02.495002
run_multiple_imputation iteration 0 stopped 2023-03-23 06:54:17.972746
run_multiple_imputation iteration 1 started 2023-03-23 06:54:18.098092
run_multiple_imputation iteration 1 stopped 2023-03-23 06:57:30.616960
run_multiple_imputation iteration 2 started 2023-03-23 06:57:30.748233
run_multiple_imputation iteration 2 stopped 2023-03-23 07:00:43.940734
run_multiple_imputation iteration 3 started 2023-03-23 07:00:44.063871
run_multiple_imputation iteration 3 stopped 2023-03-23 07:04:00.607832
run_multiple_imputation iteration 4 started 2023-03-23 07:04:00.732381
run_multiple_imputation iteration 4 stopped 2023-03-23 07:07:11.763557
run_multiple_imputation iteration 5 started 2023-0

In [23]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lon'): {'rmse': 3.105471984800699, 'r2': 0.08806583427220627},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew'): {'rmse': 2.2515234375457576, 'r2': 0.5206406532994327},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir'): {'rmse': 2.245298074327244, 'r2': 0.5232878036014572},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.236520393322277, 'r2': 0.5270077982156585},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 3.101444262864672, 'r2': 0.09042981330939315},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 3.1166038496359816, 'r2': 0.08151628400202249},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 3.1378650452881294, 'r2': 0.06894190991415694}}

### Multivaraite simulation with 7 variables 

In [24]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew','snowdepth'],
            ['hour','temp','feelslike','dew','snowdepth','winddir','sealevelpressure'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat','start_lon'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat','end_lat']]
for cols in col_list:
    multiple_predictions = []
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    for i in range(6):
        X_train_data = X_train_encoded.copy()
        X_test_data = X_test_encoded.copy()
        y_predict = run_multiple_imputation(
            X_train_data, X_test_data, y_train, y_test, random_state=i
        )
        multiple_predictions.append(y_predict)
        del X_train_data
        del X_test_data
    predictions_average = np.mean(multiple_predictions, axis=0)
    rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
    r2 = r2_score(y_test, predictions_average)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 09:46:10.165090
run_multiple_imputation iteration 0 stopped 2023-03-23 09:49:22.334563
run_multiple_imputation iteration 1 started 2023-03-23 09:49:22.459651
run_multiple_imputation iteration 1 stopped 2023-03-23 09:52:39.195847
run_multiple_imputation iteration 2 started 2023-03-23 09:52:39.318210
run_multiple_imputation iteration 2 stopped 2023-03-23 09:55:50.566190
run_multiple_imputation iteration 3 started 2023-03-23 09:55:50.695046
run_multiple_imputation iteration 3 stopped 2023-03-23 09:59:05.747334
run_multiple_imputation iteration 4 started 2023-03-23 09:59:05.871300
run_multiple_imputation iteration 4 stopped 2023-03-23 10:02:13.134438
run_multiple_imputation iteration 5 started 2023-0

In [25]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lat',
  'end_lon'): {'rmse': 3.106222611791403, 'r2': 0.08762493180699926},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth'): {'rmse': 2.2515473398313213, 'r2': 0.520630475439749},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.2453475444830606, 'r2': 0.5232667967812894},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2381030588305126, 'r2': 0.5263381389417195},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 3.1014923834843917, 'r2': 0.09040158812338117},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 3.138370167665142, 'r2': 0.06864212893423183},
 ('solarradiation',
  'dist',
  'birthyear',
  'sta

### with all the variables

In [26]:
results ={}
cols = ['start_lat','start_lon','end_lat',
                     'end_lon','hour','temp', 'feelslike', 'dew','snowdepth',
                     'winddir','sealevelpressure','visibility','solarradiation','dist','birthyear']
multiple_predictions = []
X_train, X_test, y_train, y_test = split_dataset(df)
X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
X_train_encoded = pd.DataFrame(X_train_encoded)
X_test_encoded = pd.DataFrame(X_test_encoded)
for i in range(6):
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=i
    )
    multiple_predictions.append(y_predict)
    del X_train_data
    del X_test_data
predictions_average = np.mean(multiple_predictions, axis=0)
rmse = np.sqrt(mean_squared_error(y_test, predictions_average))
r2 = r2_score(y_test, predictions_average)
print("results are saving for the columns", cols, " = ", rmse, r2)
results[tuple(cols)] ={"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 0 started 2023-03-23 12:26:18.248213
run_multiple_imputation iteration 0 stopped 2023-03-23 12:29:17.312203
run_multiple_imputation iteration 1 started 2023-03-23 12:29:17.419541
run_multiple_imputation iteration 1 stopped 2023-03-23 12:32:20.012562
run_multiple_imputation iteration 2 started 2023-03-23 12:32:20.119471
run_multiple_imputation iteration 2 stopped 2023-03-23 12:35:13.917004
run_multiple_imputation iteration 3 started 2023-03-23 12:35:14.024494
run_multiple_imputation iteration 3 stopped 2023-03-23 12:38:12.326600
run_multiple_imputation iteration 4 started 2023-03-23 12:38:12.565689
run_multiple_imputation iteration 4 stopped 2023-03-23 12:41:13.331680
run_multiple_imputation iteration 5 started 2023-0

In [27]:
results

{('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 3.113171337718016, 'r2': 0.08353833769683394}}

### Test Ignore

In [28]:
# # Execute the simulation
# print("Executing MCAR Missingness...MI", datetime.datetime.now())

# X = df.drop('tripduration', axis=1)
# y = df['tripduration']
# cols = ['dist','birthyear']
# rmse, r2 = perform_simulation(X, y, cols, n_simulation=N_SIM)
# print(rmse, r2)
# print("stooped MCAR Missingness...MI", datetime.datetime.now())

In [29]:
# # Plot results
# n_situations = 3
# n = np.arange(n_situations)
# n_labels = ["Full Data", "Single Imputation", "Multiple Imputations"]
# colors = ["r", "orange", "green"]

# plt.figure(figsize=(12, 6))
# ax1 = plt.subplot(111)
# for j in n:
#     ax1.barh(
#         j,
#         mse_means[j],
#         xerr=mse_std[j],
#         color=colors[j],
#         alpha=0.6,
#         align="center",
#     )

# ax1.set_title("MCAR Missingness")
# ax1.set_yticks(n)
# ax1.set_xlabel("Mean Squared Error")
# ax1.invert_yaxis()
# ax1.set_yticklabels(n_labels)
# plt.show()