In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression, BayesianRidge
import time
import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from itertools import combinations
from sklearn.pipeline import Pipeline

In [2]:
#!pip install category_encoders

In [3]:
tripdata = pd.read_csv("../../dataset/final_bike_sharing.csv")

In [4]:
tripdata.head()

Unnamed: 0,index,tripduration,start_station_id,start_station_name,start_lat,start_lon,end_station_id,end_station_name,end_lat,end_lon,...,conditions,date,dist,birthyear,years_old,holiday,day,month,seasons,gender
0,259617,1.02,others,Manila & 1st,40.721651,-74.042884,others,Manila & 1st,40.721651,-74.042884,...,Clear,2018-09-05,1.783012e-12,1994,24,working_day,weekday,September,autumn,male
1,283363,1.02,others,Heights Elevator,40.748716,-74.040443,others,Heights Elevator,40.748716,-74.040443,...,cloudy_rain,2018-10-06,1.782506e-12,1994,24,working_day,weekend,October,autumn,male
2,169168,1.02,others,Van Vorst Park,40.718489,-74.047727,es_3185,City Hall,40.717732,-74.043845,...,Clear,2018-07-09,0.3384331,1963,55,working_day,weekday,July,summer,female
3,144929,1.02,ss_3183,Exchange Place,40.716247,-74.033459,es_3183,Exchange Place,40.716247,-74.033459,...,Clear,2018-07-16,1.783112e-12,1985,33,working_day,weekday,July,summer,male
4,326659,1.02,others,Jersey & 3rd,40.723332,-74.045953,es_3211,Newark Ave,40.721525,-74.046305,...,cloudy_rain,2018-11-02,0.159532,1987,31,working_day,weekday,November,autumn,male


In [5]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [6]:
df = tripdata[selected_features]

In [7]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [8]:
def getSampleSize(df, perc):
    return round(perc/100 * df.shape[0])

def getAnIndex(index):
    li = []
    for i in index:
        li.append(i)
    return li  


def induceMissingValues(df , cols_list, perc, name ):
    random.seed(100)
    index = getAnIndex(df.index)
    sample_size = getSampleSize(df, perc)
    print(f"number of missing values produced {sample_size} for the {name}")
    for col in cols_list:
        idx = random.sample(index, sample_size)
        for i in idx:
             df.loc[i,col] = np.NaN
    return df

In [9]:
def ampute_each_variables(col_list, X_train, X_test):
    X_train = induceMissingValues(X_train, col_list, perc=40, name="train")
    print('Train set after Amputation', X_train.shape)
    X_test = induceMissingValues(X_test, col_list, perc=40, name="test")
    print('Test set after Amputation', X_test.shape)
    return X_train , X_test 

In [10]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [11]:
def model_evaluation(X_train,y_train):
    print(datetime.datetime.now())
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    print(datetime.datetime.now())
    return model_lr

In [12]:
def run_multiple_imputation(X_train_amputed, X_test_amputed, y_train, y_test, random_state=0):
    print(f"run_multiple_imputation iteration {random_state} started" ,  datetime.datetime.now())
    impute_estimator = BayesianRidge()
    imputer = IterativeImputer(
        estimator=impute_estimator, max_iter=5, sample_posterior=False, random_state=random_state, tol=1e-1
    )
    ## Fit and tranform the missing values
    X_train_imputed = imputer.fit_transform(X_train_amputed)
    X_test_imputed = imputer.fit_transform(X_test_amputed)

    predictors = [var for var in X_train_amputed.columns]
    
    ## adjust the columns names to the imputed cols
    X_train_imputed = pd.DataFrame(X_train_imputed)
    X_test_imputed = pd.DataFrame(X_test_imputed)
    
    ## Run Linear regression on the 
    model_reg = model_evaluation(X_train_imputed,y_train)
    y_predict = model_reg.predict(X_test_imputed)
    
    print(f"run_multiple_imputation iteration {random_state} stopped" ,  datetime.datetime.now())
    return y_predict

### Multiple Imputation simulation with 2 variables

In [13]:
results ={}
col_list = [ ['dist','birthyear'],['start_lat','start_lon'],['end_lat','end_lon'], ['hour','temp'], 
            ['feelslike', 'dew'], ['snowdepth','winddir'],['sealevelpressure','visibility'],
             ['solarradiation','dist']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}
    

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 16:03:11.559267
2023-03-30 16:04:01.822357
2023-03-30 16:04:02.339844
run_multiple_imputation iteration 1200 stopped 2023-03-30 16:04:02.355487
results are saving for the columns ['dist', 'birthyear']  =  2.61787885649224 0.35195174358677894
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 16:05:06.980681
2023-03-30 16:05:57.808379
2023-03-30 16:05:58.225116
run_multiple_imputation iteration 1200 stopped 2023-03-30 16:05:58.237136
results are savi

In [14]:
results

{('dist', 'birthyear'): {'rmse': 2.61787885649224, 'r2': 0.35195174358677894},
 ('start_lat', 'start_lon'): {'rmse': 2.2373393426952988,
  'r2': 0.5266613425448928},
 ('end_lat', 'end_lon'): {'rmse': 2.2372943344190324,
  'r2': 0.5266803865427069},
 ('hour', 'temp'): {'rmse': 2.237663926724994, 'r2': 0.5265239925144227},
 ('feelslike', 'dew'): {'rmse': 2.2360412119858046, 'r2': 0.5272104565557693},
 ('snowdepth', 'winddir'): {'rmse': 2.2360111413274724,
  'r2': 0.5272231727768756},
 ('sealevelpressure', 'visibility'): {'rmse': 2.236041316935636,
  'r2': 0.5272104121744907},
 ('solarradiation', 'dist'): {'rmse': 2.623250453600743,
  'r2': 0.34928956882631357}}

### Multivariate simulation with 3 variables

In [15]:
results ={}
col_list = [ ['dist','birthyear','start_lat'],
            ['start_lat','start_lon','end_lat'],
            ['end_lat','end_lon','hour'], 
            ['hour','temp','feelslike'],
            ['feelslike', 'dew','snowdepth'],
            ['snowdepth','winddir','sealevelpressure'],
            ['sealevelpressure','visibility','solarradiation'],
            ['solarradiation','dist','birthyear']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 16:12:56.882000
2023-03-30 16:13:36.912353
2023-03-30 16:13:37.445536
run_multiple_imputation iteration 1200 stopped 2023-03-30 16:13:37.456956
results are saving for the columns ['dist', 'birthyear', 'start_lat']  =  2.6227958755045955 0.34951507003939175
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 16:14:24.131677
2023-03-30 16:14:52.835698
2023-03-30 16:14:53.217973
run_multiple_imputation iteration 1200 stopped 2023-03-30 16:14:53.227000
r

In [16]:
results

{('dist', 'birthyear', 'start_lat'): {'rmse': 2.6227958755045955,
  'r2': 0.34951507003939175},
 ('start_lat', 'start_lon', 'end_lat'): {'rmse': 2.237553089001659,
  'r2': 0.5265708965221079},
 ('end_lat', 'end_lon', 'hour'): {'rmse': 2.2394734518588297,
  'r2': 0.5257579139759077},
 ('hour', 'temp', 'feelslike'): {'rmse': 2.2384099377622326,
  'r2': 0.5262082371387682},
 ('feelslike', 'dew', 'snowdepth'): {'rmse': 2.236181719447116,
  'r2': 0.5271510367693244},
 ('snowdepth', 'winddir', 'sealevelpressure'): {'rmse': 2.236040339365005,
  'r2': 0.5272108255703534},
 ('sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.23647647045239, 'r2': 0.5270263761557455},
 ('solarradiation', 'dist', 'birthyear'): {'rmse': 2.6250797108289965,
  'r2': 0.34838173955160245}}

### Multivariate simulation with 4 variables

In [17]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon'],
            ['start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp'], 
            ['hour','temp','feelslike','dew'],
            ['feelslike', 'dew','snowdepth','winddir'],
            ['snowdepth','winddir','sealevelpressure','visibility'],
            ['sealevelpressure','visibility','solarradiation','dist'],
            ['solarradiation','dist','birthyear','start_lat']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 16:23:22.090447
2023-03-30 16:23:51.718513
2023-03-30 16:23:52.186941
run_multiple_imputation iteration 1200 stopped 2023-03-30 16:23:52.201587
results are saving for the columns ['dist', 'birthyear', 'start_lat', 'start_lon']  =  2.6292821560364543 0.3462937405127222
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 16:24:32.689793
2023-03-30 16:25:09.766410
2023-03-30 16:25:10.142165
run_multiple_imputation iteration 1200 stopped 2023-03-30 16:25

In [18]:
results

{('dist', 'birthyear', 'start_lat', 'start_lon'): {'rmse': 2.6292821560364543,
  'r2': 0.3462937405127222},
 ('start_lat', 'start_lon', 'end_lat', 'end_lon'): {'rmse': 2.2380487564676668,
  'r2': 0.5263611232686605},
 ('end_lat', 'end_lon', 'hour', 'temp'): {'rmse': 2.2396166723501767,
  'r2': 0.5256972538627951},
 ('hour', 'temp', 'feelslike', 'dew'): {'rmse': 2.2385943136736106,
  'r2': 0.5261301822799883},
 ('feelslike', 'dew', 'snowdepth', 'winddir'): {'rmse': 2.236220276919026,
  'r2': 0.5271347303863139},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.2360026798935686, 'r2': 0.527226750900644},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 2.6247753521179824, 'r2': 0.3485328314942979},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 2.630036848127571, 'r2': 0.3459184154481464}}

### Multivaraite simulation with 5 variables 

In [19]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat'],
            ['start_lat','start_lon','end_lat','end_lon','hour'],
            ['end_lat','end_lon','hour','temp','feelslike'],
            ['hour','temp','feelslike','dew','snowdepth'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['solarradiation','dist','birthyear','start_lat','start_lon']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 16:43:00.790226
2023-03-30 16:43:40.809718
2023-03-30 16:43:41.337600
run_multiple_imputation iteration 1200 stopped 2023-03-30 16:43:41.353598
results are saving for the columns ['dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat']  =  2.6306895111361177 0.34559374484616334
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 16:45:43.618309
2023-03-30 16:46:24.628325
2023-03-30 16:46:25.058986
run_multiple_imputation iteration 1200 stopped 2023

In [20]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 2.6306895111361177, 'r2': 0.34559374484616334},
 ('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour'): {'rmse': 2.241391420936778, 'r2': 0.5249452486990521},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike'): {'rmse': 2.2403467165186273, 'r2': 0.5253879880912071},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth'): {'rmse': 2.2388147860814978, 'r2': 0.5260368376629451},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.2362574173647234, 'r2': 0.5271190230135715},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2365756615351176, 'r2': 0.5269844210474843},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 2.626477543682756, 'r2': 0.3476875923074678},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 2.641429775271937, '

### Multivaraite simulation with 6 variables 

In [21]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew'],
            ['hour','temp','feelslike','dew','snowdepth','winddir'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 17:05:55.808783
2023-03-30 17:06:36.050972
2023-03-30 17:06:36.450702
run_multiple_imputation iteration 1200 stopped 2023-03-30 17:06:36.464697
results are saving for the columns ['dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat', 'end_lon']  =  2.633479058987585 0.34420516179835203
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 17:07:30.583785
2023-03-30 17:07:54.030149
2023-03-30 17:07:54.326950
run_multiple_imputation iteration 1200 st

In [22]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lon'): {'rmse': 2.633479058987585, 'r2': 0.34420516179835203},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew'): {'rmse': 2.2405857446813777, 'r2': 0.5252867076168164},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir'): {'rmse': 2.238683844351604, 'r2': 0.5260922774803597},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.236248851351272, 'r2': 0.5271226457602374},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 2.620772021880319, 'r2': 0.3505185628149734},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 2.6333328975506802, 'r2': 0.3442779546733039},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 2.6471251764384554, 'r2': 0.3373911798285787}}

### Multivaraite simulation with 7 variables 

In [23]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew','snowdepth'],
            ['hour','temp','feelslike','dew','snowdepth','winddir','sealevelpressure'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat','start_lon'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat','end_lat']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 17:18:34.914115
2023-03-30 17:18:58.361646
2023-03-30 17:18:58.682386
run_multiple_imputation iteration 1200 stopped 2023-03-30 17:18:58.690386
results are saving for the columns ['dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat', 'end_lat', 'end_lon']  =  2.6339351200927568 0.34397800342020723
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 17:20:07.419765
2023-03-30 17:20:33.493552
2023-03-30 17:20:35.441439
run_multiple_imputation itera

In [24]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lat',
  'end_lon'): {'rmse': 2.6339351200927568, 'r2': 0.34397800342020723},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth'): {'rmse': 2.2407766398684976, 'r2': 0.5252058141777944},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.238685360032414, 'r2': 0.5260916357702},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2372542722861444, 'r2': 0.5266973373955184},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 2.621883610219765, 'r2': 0.3499674968745703},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 2.642450413496036, 'r2': 0.33972941762216835},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_l

### with all the variables

In [25]:
cols = ['start_lat','start_lon','end_lat',
                     'end_lon','hour','temp', 'feelslike', 'dew','snowdepth',
                     'winddir','sealevelpressure','visibility','solarradiation','dist','birthyear']

X_train, X_test, y_train, y_test = split_dataset(df)
X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
X_train_encoded = pd.DataFrame(X_train_encoded)
X_test_encoded = pd.DataFrame(X_test_encoded)
X_train_data = X_train_encoded.copy()
X_test_data = X_test_encoded.copy()
y_predict = run_multiple_imputation(
    X_train_data, X_test_data, y_train, y_test, random_state=1200
)
rmse = np.sqrt(mean_squared_error(y_test, y_predict))
r2 = r2_score(y_test, y_predict)


Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 100938 for the train
Train set after Amputation (252346, 21)
number of missing values produced 25235 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 17:48:22.040169
2023-03-30 17:54:04.061529
2023-03-30 17:54:04.526676
run_multiple_imputation iteration 1200 stopped 2023-03-30 17:54:04.537201


In [26]:
print("results are saving for the columns", cols, " = ", rmse, r2)   

results are saving for the columns ['start_lat', 'start_lon', 'end_lat', 'end_lon', 'hour', 'temp', 'feelslike', 'dew', 'snowdepth', 'winddir', 'sealevelpressure', 'visibility', 'solarradiation', 'dist', 'birthyear']  =  2.713135174432311 0.3039328807338021
