In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression, BayesianRidge
import time
import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from itertools import combinations
from sklearn.pipeline import Pipeline

In [2]:
#!pip install category_encoders

In [3]:
tripdata = pd.read_csv("../../dataset/final_bike_sharing.csv")

In [4]:
tripdata.head()

Unnamed: 0,index,tripduration,start_station_id,start_station_name,start_lat,start_lon,end_station_id,end_station_name,end_lat,end_lon,...,conditions,date,dist,birthyear,years_old,holiday,day,month,seasons,gender
0,259617,1.02,others,Manila & 1st,40.721651,-74.042884,others,Manila & 1st,40.721651,-74.042884,...,Clear,2018-09-05,1.783012e-12,1994,24,working_day,weekday,September,autumn,male
1,283363,1.02,others,Heights Elevator,40.748716,-74.040443,others,Heights Elevator,40.748716,-74.040443,...,cloudy_rain,2018-10-06,1.782506e-12,1994,24,working_day,weekend,October,autumn,male
2,169168,1.02,others,Van Vorst Park,40.718489,-74.047727,es_3185,City Hall,40.717732,-74.043845,...,Clear,2018-07-09,0.3384331,1963,55,working_day,weekday,July,summer,female
3,144929,1.02,ss_3183,Exchange Place,40.716247,-74.033459,es_3183,Exchange Place,40.716247,-74.033459,...,Clear,2018-07-16,1.783112e-12,1985,33,working_day,weekday,July,summer,male
4,326659,1.02,others,Jersey & 3rd,40.723332,-74.045953,es_3211,Newark Ave,40.721525,-74.046305,...,cloudy_rain,2018-11-02,0.159532,1987,31,working_day,weekday,November,autumn,male


In [5]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [6]:
df = tripdata[selected_features]

In [7]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [8]:
def getSampleSize(df, perc):
    return round(perc/100 * df.shape[0])

def getAnIndex(index):
    li = []
    for i in index:
        li.append(i)
    return li  


def induceMissingValues(df , cols_list, perc, name ):
    random.seed(100)
    index = getAnIndex(df.index)
    sample_size = getSampleSize(df, perc)
    print(f"number of missing values produced {sample_size} for the {name}")
    for col in cols_list:
        idx = random.sample(index, sample_size)
        for i in idx:
             df.loc[i,col] = np.NaN
    return df

In [9]:
def ampute_each_variables(col_list, X_train, X_test):
    X_train = induceMissingValues(X_train, col_list, perc=20, name="train")
    print('Train set after Amputation', X_train.shape)
    X_test = induceMissingValues(X_test, col_list, perc=20, name="test")
    print('Test set after Amputation', X_test.shape)
    return X_train , X_test 

In [10]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [11]:
def model_evaluation(X_train,y_train):
    print(datetime.datetime.now())
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    print(datetime.datetime.now())
    return model_lr

In [12]:
def run_multiple_imputation(X_train_amputed, X_test_amputed, y_train, y_test, random_state=0):
    print(f"run_multiple_imputation iteration {random_state} started" ,  datetime.datetime.now())
    impute_estimator = BayesianRidge()
    imputer = IterativeImputer(
        estimator=impute_estimator, max_iter=5, sample_posterior=False, random_state=random_state, tol=1e-1
    )
    ## Fit and tranform the missing values
    X_train_imputed = imputer.fit_transform(X_train_amputed)
    X_test_imputed = imputer.fit_transform(X_test_amputed)

    predictors = [var for var in X_train_amputed.columns]
    
    ## adjust the columns names to the imputed cols
    X_train_imputed = pd.DataFrame(X_train_imputed)
    X_test_imputed = pd.DataFrame(X_test_imputed)
    
    ## Run Linear regression on the 
    model_reg = model_evaluation(X_train_imputed,y_train)
    y_predict = model_reg.predict(X_test_imputed)
    
    print(f"run_multiple_imputation iteration {random_state} stopped" ,  datetime.datetime.now())
    return y_predict

### Multiple Imputation simulation with 2 variables

In [13]:
results ={}
col_list = [['start_lat','start_lon'],['end_lat','end_lon'], ['hour','temp'], 
            ['feelslike', 'dew'], ['snowdepth','winddir'],['sealevelpressure','visibility'],
             ['solarradiation','birthyear']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1100
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}
    

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1100 started 2023-03-30 17:43:42.052471
2023-03-30 17:44:40.845066
2023-03-30 17:44:41.391732
run_multiple_imputation iteration 1100 stopped 2023-03-30 17:44:41.402751
results are saving for the columns ['start_lat', 'start_lon']  =  2.236758181468611 0.52690721525807
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1100 started 2023-03-30 17:45:11.318104
2023-03-30 17:45:59.826514
2023-03-30 17:46:00.417222
run_multiple_imputation iteration 1100 stopped 2023-03-30 17:46:00.433255
results are sav

In [14]:
results

{('start_lat', 'start_lon'): {'rmse': 2.236758181468611,
  'r2': 0.52690721525807},
 ('end_lat', 'end_lon'): {'rmse': 2.2366232683274214,
  'r2': 0.5269642840188354},
 ('hour', 'temp'): {'rmse': 2.236782222409824, 'r2': 0.5268970454891224},
 ('feelslike', 'dew'): {'rmse': 2.23599651205664, 'r2': 0.5272293591124402},
 ('snowdepth', 'winddir'): {'rmse': 2.235941755881793,
  'r2': 0.5272525137057207},
 ('sealevelpressure', 'visibility'): {'rmse': 2.235938327227267,
  'r2': 0.5272539635525797},
 ('solarradiation', 'birthyear'): {'rmse': 2.2371866180769304,
  'r2': 0.5267259621554308}}

### Multivariate simulation with 3 variables

In [15]:
results ={}
col_list = [['start_lat','start_lon','end_lat'],['end_lat','end_lon','hour'], ['hour','temp','feelslike'], 
            ['feelslike', 'dew','snowdepth'], ['snowdepth','winddir','sealevelpressure'],
            ['sealevelpressure','visibility','solarradiation'],
             ['solarradiation','birthyear','start_lat']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 17:55:59.798962
2023-03-30 17:56:32.376902
2023-03-30 17:56:32.759749
run_multiple_imputation iteration 1200 stopped 2023-03-30 17:56:32.772764
results are saving for the columns ['start_lat', 'start_lon', 'end_lat']  =  2.236753339386766 0.5269092635362642
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 17:56:48.138363
2023-03-30 17:57:14.849896
2023-03-30 17:57:15.155695
run_multiple_imputation iteration 1200 stopped 2023-03-30 17:57:15.162699
re

In [16]:
results

{('start_lat', 'start_lon', 'end_lat'): {'rmse': 2.236753339386766,
  'r2': 0.5269092635362642},
 ('end_lat', 'end_lon', 'hour'): {'rmse': 2.237871276723342,
  'r2': 0.5264362404647662},
 ('hour', 'temp', 'feelslike'): {'rmse': 2.2368682480393436,
  'r2': 0.5268606541393397},
 ('feelslike', 'dew', 'snowdepth'): {'rmse': 2.236284507260454,
  'r2': 0.5271075660374187},
 ('snowdepth', 'winddir', 'sealevelpressure'): {'rmse': 2.2359514264779206,
  'r2': 0.5272484243685234},
 ('sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2362137945866327, 'r2': 0.5271374718563235},
 ('solarradiation', 'birthyear', 'start_lat'): {'rmse': 2.2373087284116315,
  'r2': 0.5266742961675386}}

### Multivariate simulation with 4 variables

In [17]:
results ={}
col_list = [['start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp'], 
            ['hour','temp','feelslike','dew'],
            ['feelslike', 'dew','snowdepth','winddir'],
            ['snowdepth','winddir','sealevelpressure','visibility'],
            ['sealevelpressure','visibility','solarradiation','birthyear'],
            ['solarradiation','birthyear','start_lat','start_lon']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1300
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1300 started 2023-03-30 18:02:31.057597
2023-03-30 18:03:04.438814
2023-03-30 18:03:04.830641
run_multiple_imputation iteration 1300 stopped 2023-03-30 18:03:04.840238
results are saving for the columns ['start_lat', 'start_lon', 'end_lat', 'end_lon']  =  2.2372395581017943 0.5267035630979927
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1300 started 2023-03-30 18:03:45.367630
2023-03-30 18:04:32.241971
2023-03-30 18:04:32.747127
run_multiple_imputation iteration 1300 stopped 2023-03-30 18:04:

In [18]:
results

{('start_lat', 'start_lon', 'end_lat', 'end_lon'): {'rmse': 2.2372395581017943,
  'r2': 0.5267035630979927},
 ('end_lat', 'end_lon', 'hour', 'temp'): {'rmse': 2.237940944591823,
  'r2': 0.5264067546836989},
 ('hour', 'temp', 'feelslike', 'dew'): {'rmse': 2.2368324403158644,
  'r2': 0.526875802020557},
 ('feelslike', 'dew', 'snowdepth', 'winddir'): {'rmse': 2.236324062243767,
  'r2': 0.5270908370231084},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.2359134735435466, 'r2': 0.5272644731583466},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'birthyear'): {'rmse': 2.2368778697493488, 'r2': 0.5268565837884169},
 ('solarradiation',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 2.237601272511671, 'r2': 0.526550506640258}}

### Multivaraite simulation with 5 variables 

In [19]:
results ={}
col_list = [ ['start_lat','start_lon','end_lat','end_lon','hour'],
            ['end_lat','end_lon','hour','temp','feelslike'],
            ['hour','temp','feelslike','dew','snowdepth'],['feelslike', 'dew','snowdepth','winddir','sealevelpressure'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['sealevelpressure','visibility','solarradiation','birthyear','start_lat'],
            ['solarradiation','birthyear','start_lat','start_lon','end_lat']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1400
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1400 started 2023-03-30 18:13:58.632884
2023-03-30 18:14:36.789431
2023-03-30 18:14:37.352743
run_multiple_imputation iteration 1400 stopped 2023-03-30 18:14:37.366778
results are saving for the columns ['start_lat', 'start_lon', 'end_lat', 'end_lon', 'hour']  =  2.238503482679494 0.5261686360481823
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1400 started 2023-03-30 18:15:23.281209
2023-03-30 18:15:59.867819
2023-03-30 18:16:00.262418
run_multiple_imputation iteration 1400 stopped 2023-03-30

In [20]:
results

{('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour'): {'rmse': 2.238503482679494, 'r2': 0.5261686360481823},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike'): {'rmse': 2.238289734097597, 'r2': 0.5262591214527162},
 ('hour', 'temp', 'feelslike', 'dew', 'snowdepth'): {'rmse': 2.236867438684472,
  'r2': 0.526860996526573},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.236326563899253, 'r2': 0.52708977898641},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2362517125116566, 'r2': 0.527121435717073},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'birthyear',
  'start_lat'): {'rmse': 2.2369824989666283, 'r2': 0.5268123205018514},
 ('solarradiation',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 2.237652098153505, 'r2': 0.5265289982080243}}

### Multivaraite simulation with 6 variables 

In [21]:
results ={}
col_list = [ ['start_lat','start_lon','end_lat','end_lon','hour','temp'],
            ['end_lat','end_lon','hour','temp','feelslike','dew'],
            ['hour','temp','feelslike','dew','snowdepth','winddir'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','birthyear'],
            ['sealevelpressure','visibility','solarradiation','birthyear','start_lat','start_lon'],
            ['solarradiation','birthyear','start_lat','start_lon','end_lat','hour']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1500
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1500 started 2023-03-30 18:27:32.017990
2023-03-30 18:28:10.298924
2023-03-30 18:28:10.706236
run_multiple_imputation iteration 1500 stopped 2023-03-30 18:28:10.717254
results are saving for the columns ['start_lat', 'start_lon', 'end_lat', 'end_lon', 'hour', 'temp']  =  2.238564953738388 0.526142612130124
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1500 started 2023-03-30 18:29:18.218118
2023-03-30 18:29:58.698009
2023-03-30 18:29:59.091329
run_multiple_imputation iteration 1500 stopped 202

In [22]:
results

{('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour',
  'temp'): {'rmse': 2.238564953738388, 'r2': 0.526142612130124},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew'): {'rmse': 2.238350818356413, 'r2': 0.526233263758902},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir'): {'rmse': 2.2370180810633915, 'r2': 0.5267972670593556},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.2363200801498833, 'r2': 0.5270925211860082},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'birthyear'): {'rmse': 2.2366057108912445, 'r2': 0.5269717106265366},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 2.237746445112981, 'r2': 0.5264890710976513},
 ('solarradiation',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'hour'): {'rmse': 2.238784079661755, 'r2': 0.5260498388384692}}

### Multivaraite simulation with 7 variables 

In [23]:
results ={}
col_list = [['start_lat','start_lon','end_lat','end_lon','hour','temp','feelslike'],
            ['end_lat','end_lon','hour','temp','feelslike','dew','snowdepth'],
            ['hour','temp','feelslike','dew','snowdepth','winddir','sealevelpressure'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','birthyear','start_lat'],
            ['sealevelpressure','visibility','solarradiation','birthyear','start_lat','start_lon','end_lat'],
            ['solarradiation','birthyear','start_lat','start_lon','end_lat','hour']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1600
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1600 started 2023-03-30 18:43:10.090324
2023-03-30 18:43:46.019433
2023-03-30 18:43:46.400585
run_multiple_imputation iteration 1600 stopped 2023-03-30 18:43:46.411660
results are saving for the columns ['start_lat', 'start_lon', 'end_lat', 'end_lon', 'hour', 'temp', 'feelslike']  =  2.238938937245935 0.5259842699289713
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1600 started 2023-03-30 18:44:38.701376
2023-03-30 18:45:29.127201
2023-03-30 18:45:29.720939
run_multiple_imputation iteration 16

In [24]:
results

{('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike'): {'rmse': 2.238938937245935, 'r2': 0.5259842699289713},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth'): {'rmse': 2.238299165981353, 'r2': 0.5262551288701267},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.2369975426034325, 'r2': 0.5268059561359885},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.23672306768847, 'r2': 0.5269220688491063},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'birthyear',
  'start_lat'): {'rmse': 2.236603085656157, 'r2': 0.5269728210678877},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 2.2380049592060582, 'r2': 0.526379660740427},
 ('solarradiation',
  'birthyear',
  'start_lat',
  

### with all the variables

In [25]:
col_list = ['start_lat', 'start_lon', 'end_lat', 'end_lon', 'hour', 'temp', 'feelslike', 'dew', 'snowdepth', 'winddir',
            'sealevelpressure', 'visibility', 'solarradiation', 'birthyear']

X_train, X_test, y_train, y_test = split_dataset(df)
X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
X_train_encoded = pd.DataFrame(X_train_encoded)
X_test_encoded = pd.DataFrame(X_test_encoded)
X_train_data = X_train_encoded.copy()
X_test_data = X_test_encoded.copy()
y_predict = run_multiple_imputation(
    X_train_data, X_test_data, y_train, y_test, random_state=1700
)
rmse = np.sqrt(mean_squared_error(y_test, y_predict))
r2 = r2_score(y_test, y_predict)


Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 50469 for the train
Train set after Amputation (252346, 21)
number of missing values produced 12617 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1700 started 2023-03-30 18:57:04.591729
2023-03-30 18:57:57.985162
2023-03-30 18:57:58.363262
run_multiple_imputation iteration 1700 stopped 2023-03-30 18:57:58.377568


In [26]:
print("results are saving for the columns", cols, " = ", rmse, r2)   

results are saving for the columns ['solarradiation', 'birthyear', 'start_lat', 'start_lon', 'end_lat', 'hour']  =  2.238784079661755 0.5260498388384692
