In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression, BayesianRidge
import time
import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from itertools import combinations
from sklearn.pipeline import Pipeline

In [2]:
#!pip install category_encoders

In [3]:
tripdata = pd.read_csv("../../dataset/final_bike_sharing.csv")

In [4]:
tripdata.head()

Unnamed: 0,index,tripduration,start_station_id,start_station_name,start_lat,start_lon,end_station_id,end_station_name,end_lat,end_lon,...,conditions,date,dist,birthyear,years_old,holiday,day,month,seasons,gender
0,259617,1.02,others,Manila & 1st,40.721651,-74.042884,others,Manila & 1st,40.721651,-74.042884,...,Clear,2018-09-05,1.783012e-12,1994,24,working_day,weekday,September,autumn,male
1,283363,1.02,others,Heights Elevator,40.748716,-74.040443,others,Heights Elevator,40.748716,-74.040443,...,cloudy_rain,2018-10-06,1.782506e-12,1994,24,working_day,weekend,October,autumn,male
2,169168,1.02,others,Van Vorst Park,40.718489,-74.047727,es_3185,City Hall,40.717732,-74.043845,...,Clear,2018-07-09,0.3384331,1963,55,working_day,weekday,July,summer,female
3,144929,1.02,ss_3183,Exchange Place,40.716247,-74.033459,es_3183,Exchange Place,40.716247,-74.033459,...,Clear,2018-07-16,1.783112e-12,1985,33,working_day,weekday,July,summer,male
4,326659,1.02,others,Jersey & 3rd,40.723332,-74.045953,es_3211,Newark Ave,40.721525,-74.046305,...,cloudy_rain,2018-11-02,0.159532,1987,31,working_day,weekday,November,autumn,male


In [5]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [6]:
df = tripdata[selected_features]

In [7]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [8]:
def getSampleSize(df, perc):
    return round(perc/100 * df.shape[0])

def getAnIndex(index):
    li = []
    for i in index:
        li.append(i)
    return li  


def induceMissingValues(df , cols_list, perc, name ):
    random.seed(100)
    index = getAnIndex(df.index)
    sample_size = getSampleSize(df, perc)
    print(f"number of missing values produced {sample_size} for the {name}")
    for col in cols_list:
        idx = random.sample(index, sample_size)
        for i in idx:
             df.loc[i,col] = np.NaN
    return df

In [9]:
def ampute_each_variables(col_list, X_train, X_test):
    X_train = induceMissingValues(X_train, col_list, perc=30, name="train")
    print('Train set after Amputation', X_train.shape)
    X_test = induceMissingValues(X_test, col_list, perc=30, name="test")
    print('Test set after Amputation', X_test.shape)
    return X_train , X_test 

In [10]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [11]:
def model_evaluation(X_train,y_train):
    print(datetime.datetime.now())
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    print(datetime.datetime.now())
    return model_lr

In [12]:
def run_multiple_imputation(X_train_amputed, X_test_amputed, y_train, y_test, random_state=0):
    print(f"run_multiple_imputation iteration {random_state} started" ,  datetime.datetime.now())
    impute_estimator = BayesianRidge()
    imputer = IterativeImputer(
        estimator=impute_estimator, max_iter=5, sample_posterior=False, random_state=random_state, tol=1e-1
    )
    ## Fit and tranform the missing values
    X_train_imputed = imputer.fit_transform(X_train_amputed)
    X_test_imputed = imputer.fit_transform(X_test_amputed)

    predictors = [var for var in X_train_amputed.columns]
    
    ## adjust the columns names to the imputed cols
    X_train_imputed = pd.DataFrame(X_train_imputed)
    X_test_imputed = pd.DataFrame(X_test_imputed)
    
    ## Run Linear regression on the 
    model_reg = model_evaluation(X_train_imputed,y_train)
    y_predict = model_reg.predict(X_test_imputed)
    
    print(f"run_multiple_imputation iteration {random_state} stopped" ,  datetime.datetime.now())
    return y_predict

### Multiple Imputation simulation with 2 variables

In [13]:
results ={}
col_list = [ ['dist','birthyear'],['start_lat','start_lon'],['end_lat','end_lon'], ['hour','temp'], 
            ['feelslike', 'dew'], ['snowdepth','winddir'],['sealevelpressure','visibility'],
             ['solarradiation','dist']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}
    

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 12:40:14.974736
2023-03-30 12:40:52.338908
2023-03-30 12:40:52.675945
run_multiple_imputation iteration 1200 stopped 2023-03-30 12:40:52.680704
results are saving for the columns ['dist', 'birthyear']  =  2.532596636071428 0.39348673066220463
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 12:43:18.588190
2023-03-30 12:43:52.694903
2023-03-30 12:43:52.997013
run_multiple_imputation iteration 1200 stopped 2023-03-30 12:43:53.001591
results are savin

In [14]:
results

{('dist', 'birthyear'): {'rmse': 2.532596636071428, 'r2': 0.39348673066220463},
 ('start_lat', 'start_lon'): {'rmse': 2.237093883435516,
  'r2': 0.5267651971376235},
 ('end_lat', 'end_lon'): {'rmse': 2.23708603415367, 'r2': 0.5267685180056997},
 ('hour', 'temp'): {'rmse': 2.237034844345162, 'r2': 0.5267901750644045},
 ('feelslike', 'dew'): {'rmse': 2.2360098922327074, 'r2': 0.5272237009879241},
 ('snowdepth', 'winddir'): {'rmse': 2.2361014725141555,
  'r2': 0.5271849731868232},
 ('sealevelpressure', 'visibility'): {'rmse': 2.236007002139532,
  'r2': 0.5272249231351758},
 ('solarradiation', 'dist'): {'rmse': 2.5327300001745385,
  'r2': 0.39342285216947015}}

### Multivariate simulation with 3 variables

In [15]:
results ={}
col_list = [ ['dist','birthyear','start_lat'],
            ['start_lat','start_lon','end_lat'],
            ['end_lat','end_lon','hour'], 
            ['hour','temp','feelslike'],
            ['feelslike', 'dew','snowdepth'],
            ['snowdepth','winddir','sealevelpressure'],
            ['sealevelpressure','visibility','solarradiation'],
            ['solarradiation','dist','birthyear']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 13:06:19.590299
2023-03-30 13:06:54.317925
2023-03-30 13:06:54.651091
run_multiple_imputation iteration 1200 stopped 2023-03-30 13:06:54.655793
results are saving for the columns ['dist', 'birthyear', 'start_lat']  =  2.5356649317077187 0.3920162325267966
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 13:10:34.171599
2023-03-30 13:11:08.753320
2023-03-30 13:11:09.135557
run_multiple_imputation iteration 1200 stopped 2023-03-30 13:11:09.139997
resu

In [16]:
results

{('dist', 'birthyear', 'start_lat'): {'rmse': 2.5356649317077187,
  'r2': 0.3920162325267966},
 ('start_lat', 'start_lon', 'end_lat'): {'rmse': 2.237045179915619,
  'r2': 0.5267858023969001},
 ('end_lat', 'end_lon', 'hour'): {'rmse': 2.2390282180355308,
  'r2': 0.525946465108347},
 ('hour', 'temp', 'feelslike'): {'rmse': 2.2372973026379137,
  'r2': 0.526679130635405},
 ('feelslike', 'dew', 'snowdepth'): {'rmse': 2.2361891447057483,
  'r2': 0.5271478965668581},
 ('snowdepth', 'winddir', 'sealevelpressure'): {'rmse': 2.2361125491905263,
  'r2': 0.5271802889342783},
 ('sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.236330719624364, 'r2': 0.5270880213837967},
 ('solarradiation', 'dist', 'birthyear'): {'rmse': 2.533822628835734,
  'r2': 0.3928993802350963}}

### Multivariate simulation with 4 variables

In [17]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon'],
            ['start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp'], 
            ['hour','temp','feelslike','dew'],
            ['feelslike', 'dew','snowdepth','winddir'],
            ['snowdepth','winddir','sealevelpressure','visibility'],
            ['sealevelpressure','visibility','solarradiation','dist'],
            ['solarradiation','dist','birthyear','start_lat']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 13:42:21.580813
2023-03-30 13:42:55.519880
2023-03-30 13:42:55.861153
run_multiple_imputation iteration 1200 stopped 2023-03-30 13:42:55.868500
results are saving for the columns ['dist', 'birthyear', 'start_lat', 'start_lon']  =  2.5403839616085535 0.3897511356500948
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 13:47:47.605999
2023-03-30 13:48:22.304134
2023-03-30 13:48:22.655260
run_multiple_imputation iteration 1200 stopped 2023-03-30 13:48:2

In [18]:
results

{('dist', 'birthyear', 'start_lat', 'start_lon'): {'rmse': 2.5403839616085535,
  'r2': 0.3897511356500948},
 ('start_lat', 'start_lon', 'end_lat', 'end_lon'): {'rmse': 2.23728761246772,
  'r2': 0.5266832307161966},
 ('end_lat', 'end_lon', 'hour', 'temp'): {'rmse': 2.239243378927917,
  'r2': 0.5258553517569773},
 ('hour', 'temp', 'feelslike', 'dew'): {'rmse': 2.237319487171379,
  'r2': 0.5266697439037322},
 ('feelslike', 'dew', 'snowdepth', 'winddir'): {'rmse': 2.2362423057481466,
  'r2': 0.5271254140236676},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.236160127159694, 'r2': 0.5271601682646226},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 2.5346271572772903, 'r2': 0.39251379108455586},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 2.5365841630859105, 'r2': 0.3915753390539095}}

### Multivaraite simulation with 5 variables 

In [19]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat'],
            ['start_lat','start_lon','end_lat','end_lon','hour'],
            ['end_lat','end_lon','hour','temp','feelslike'],
            ['hour','temp','feelslike','dew','snowdepth'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['solarradiation','dist','birthyear','start_lat','start_lon']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 14:28:25.982992
2023-03-30 14:29:01.330200
2023-03-30 14:29:01.688482
run_multiple_imputation iteration 1200 stopped 2023-03-30 14:29:01.693079
results are saving for the columns ['dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat']  =  2.5410349610728495 0.3894383305054735
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 14:35:04.279842
2023-03-30 14:35:36.590607
2023-03-30 14:35:36.933982
run_multiple_imputation iteration 1200 stopped 2023-03

In [20]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 2.5410349610728495, 'r2': 0.3894383305054735},
 ('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour'): {'rmse': 2.2398575955390334, 'r2': 0.5255952036523921},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike'): {'rmse': 2.239674332257378, 'r2': 0.5256728312866358},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth'): {'rmse': 2.2373815335530023, 'r2': 0.5266434903113405},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.236248834461607, 'r2': 0.5271226529032171},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2366582379261613, 'r2': 0.5269494920789252},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 2.535486982660035, 'r2': 0.3921015642555815},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 2.5400262355605574, 'r

### Multivaraite simulation with 6 variables 

In [13]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew'],
            ['hour','temp','feelslike','dew','snowdepth','winddir'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 18:06:31.777993
2023-03-30 18:07:18.424113
2023-03-30 18:07:18.921969
run_multiple_imputation iteration 1200 stopped 2023-03-30 18:07:18.934567
results are saving for the columns ['dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat', 'end_lon']  =  2.5424202878086075 0.3887724144653445
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 18:09:06.134439
2023-03-30 18:09:43.842411
2023-03-30 18:09:44.210676
run_multiple_imputation iteration 1200 stop

In [14]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lon'): {'rmse': 2.5424202878086075, 'r2': 0.3887724144653445},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew'): {'rmse': 2.2397631675160894, 'r2': 0.5256352027691253},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir'): {'rmse': 2.237716200332265, 'r2': 0.526501870704561},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.236179740557537, 'r2': 0.5271518736561155},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 2.5300344401679826, 'r2': 0.39471331345708915},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 2.5372891407316125, 'r2': 0.3912371004133858},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 2.5418901886410326, 'r2': 0.3890272719783876}}

### Multivaraite simulation with 7 variables 

In [15]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew','snowdepth'],
            ['hour','temp','feelslike','dew','snowdepth','winddir','sealevelpressure'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat','start_lon'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat','end_lat']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 18:26:29.604670
2023-03-30 18:27:05.359108
2023-03-30 18:27:05.770571
run_multiple_imputation iteration 1200 stopped 2023-03-30 18:27:05.783585
results are saving for the columns ['dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat', 'end_lat', 'end_lon']  =  2.5421671333861804 0.38889413097577175
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 18:28:58.548617
2023-03-30 18:29:39.407182
2023-03-30 18:29:39.899739
run_multiple_imputation iterati

In [16]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lat',
  'end_lon'): {'rmse': 2.5421671333861804, 'r2': 0.38889413097577175},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth'): {'rmse': 2.2397416939109416, 'r2': 0.5256442986178909},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.237483076985794, 'r2': 0.5266005228201469},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2366292569381865, 'r2': 0.5269617508872046},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 2.5311140783269326, 'r2': 0.3941966169053762},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 2.542238043539028, 'r2': 0.38886003863434027},
 ('solarradiation',
  'dist',
  'birthyear',
  'sta

### with all the variables

In [17]:
cols = ['start_lat','start_lon','end_lat',
                     'end_lon','hour','temp', 'feelslike', 'dew','snowdepth',
                     'winddir','sealevelpressure','visibility','solarradiation','dist','birthyear']

X_train, X_test, y_train, y_test = split_dataset(df)
X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
X_train_encoded = pd.DataFrame(X_train_encoded)
X_test_encoded = pd.DataFrame(X_test_encoded)
X_train_data = X_train_encoded.copy()
X_test_data = X_test_encoded.copy()
y_predict = run_multiple_imputation(
    X_train_data, X_test_data, y_train, y_test, random_state=1200
)
rmse = np.sqrt(mean_squared_error(y_test, y_predict))
r2 = r2_score(y_test, y_predict)


Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 75704 for the train
Train set after Amputation (252346, 21)
number of missing values produced 18926 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 18:50:16.125530
2023-03-30 18:53:19.364237
2023-03-30 18:53:19.779181
run_multiple_imputation iteration 1200 stopped 2023-03-30 18:53:19.790204


In [18]:
print("results are saving for the columns", cols, " = ", rmse, r2)   

results are saving for the columns ['start_lat', 'start_lon', 'end_lat', 'end_lon', 'hour', 'temp', 'feelslike', 'dew', 'snowdepth', 'winddir', 'sealevelpressure', 'visibility', 'solarradiation', 'dist', 'birthyear']  =  2.5860638570536376 0.3676074505016893
