In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression, BayesianRidge
import time
import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from itertools import combinations
from sklearn.pipeline import Pipeline

In [2]:
#!pip install category_encoders

In [3]:
tripdata = pd.read_csv("../../dataset/final_bike_sharing.csv")

In [4]:
tripdata.head()

Unnamed: 0,index,tripduration,start_station_id,start_station_name,start_lat,start_lon,end_station_id,end_station_name,end_lat,end_lon,...,conditions,date,dist,birthyear,years_old,holiday,day,month,seasons,gender
0,259617,1.02,others,Manila & 1st,40.721651,-74.042884,others,Manila & 1st,40.721651,-74.042884,...,Clear,2018-09-05,1.783012e-12,1994,24,working_day,weekday,September,autumn,male
1,283363,1.02,others,Heights Elevator,40.748716,-74.040443,others,Heights Elevator,40.748716,-74.040443,...,cloudy_rain,2018-10-06,1.782506e-12,1994,24,working_day,weekend,October,autumn,male
2,169168,1.02,others,Van Vorst Park,40.718489,-74.047727,es_3185,City Hall,40.717732,-74.043845,...,Clear,2018-07-09,0.3384331,1963,55,working_day,weekday,July,summer,female
3,144929,1.02,ss_3183,Exchange Place,40.716247,-74.033459,es_3183,Exchange Place,40.716247,-74.033459,...,Clear,2018-07-16,1.783112e-12,1985,33,working_day,weekday,July,summer,male
4,326659,1.02,others,Jersey & 3rd,40.723332,-74.045953,es_3211,Newark Ave,40.721525,-74.046305,...,cloudy_rain,2018-11-02,0.159532,1987,31,working_day,weekday,November,autumn,male


In [5]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [6]:
df = tripdata[selected_features]

In [7]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [8]:
def getSampleSize(df, perc):
    return round(perc/100 * df.shape[0])

def getAnIndex(index):
    li = []
    for i in index:
        li.append(i)
    return li  


def induceMissingValues(df , cols_list, perc, name ):
    random.seed(100)
    index = getAnIndex(df.index)
    sample_size = getSampleSize(df, perc)
    print(f"number of missing values produced {sample_size} for the {name}")
    for col in cols_list:
        idx = random.sample(index, sample_size)
        for i in idx:
             df.loc[i,col] = np.NaN
    return df

In [9]:
def ampute_each_variables(col_list, X_train, X_test):
    X_train = induceMissingValues(X_train, col_list, perc=1, name="train")
    print('Train set after Amputation', X_train.shape)
    X_test = induceMissingValues(X_test, col_list, perc=1, name="test")
    print('Test set after Amputation', X_test.shape)
    return X_train , X_test 

In [10]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [11]:
def model_evaluation(X_train,y_train):
    print(datetime.datetime.now())
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    print(datetime.datetime.now())
    return model_lr

In [12]:
def run_multiple_imputation(X_train_amputed, X_test_amputed, y_train, y_test, random_state=0):
    print(f"run_multiple_imputation iteration {random_state} started" ,  datetime.datetime.now())
    impute_estimator = BayesianRidge()
    imputer = IterativeImputer(
        estimator=impute_estimator, max_iter=5, sample_posterior=False, random_state=random_state, tol=1e-1
    )
    ## Fit and tranform the missing values
    X_train_imputed = imputer.fit_transform(X_train_amputed)
    X_test_imputed = imputer.fit_transform(X_test_amputed)

    predictors = [var for var in X_train_amputed.columns]
    
    ## adjust the columns names to the imputed cols
    X_train_imputed = pd.DataFrame(X_train_imputed)
    X_test_imputed = pd.DataFrame(X_test_imputed)
    
    ## Run Linear regression on the 
    model_reg = model_evaluation(X_train_imputed,y_train)
    y_predict = model_reg.predict(X_test_imputed)
    
    print(f"run_multiple_imputation iteration {random_state} stopped" ,  datetime.datetime.now())
    return y_predict

### Multiple Imputation simulation with 2 variables

In [13]:
results ={}
col_list = [ ['dist','birthyear'],['start_lat','start_lon'],['end_lat','end_lon'], ['hour','temp'], 
            ['feelslike', 'dew'], ['snowdepth','winddir'],['sealevelpressure','visibility'],
             ['solarradiation','dist']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}
    

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 12:34:13.731052
2023-03-30 12:35:16.320052
2023-03-30 12:35:17.186645
run_multiple_imputation iteration 1200 stopped 2023-03-30 12:35:17.202888
results are saving for the columns ['dist', 'birthyear']  =  2.248695709534565 0.5218439689146894
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 12:35:24.381371
2023-03-30 12:36:22.419788
2023-03-30 12:36:22.918028
run_multiple_imputation iteration 1200 stopped 2023-03-30 12:36:22.931523
results are saving for t

In [14]:
results

{('dist', 'birthyear'): {'rmse': 2.248695709534565, 'r2': 0.5218439689146894},
 ('start_lat', 'start_lon'): {'rmse': 2.2359419487094407,
  'r2': 0.5272524321662178},
 ('end_lat', 'end_lon'): {'rmse': 2.235950965104186, 'r2': 0.5272486194668022},
 ('hour', 'temp'): {'rmse': 2.236024663083339, 'r2': 0.5272174547443951},
 ('feelslike', 'dew'): {'rmse': 2.2359823840888984, 'r2': 0.5272353334216364},
 ('snowdepth', 'winddir'): {'rmse': 2.2360079081863224,
  'r2': 0.5272245399910915},
 ('sealevelpressure', 'visibility'): {'rmse': 2.235968673616763,
  'r2': 0.5272411311479166},
 ('solarradiation', 'dist'): {'rmse': 2.243997945357007,
  'r2': 0.5238397195378406}}

### Multivariate simulation with 3 variables

In [15]:
results ={}
col_list = [ ['dist','birthyear','start_lat'],
            ['start_lat','start_lon','end_lat'],
            ['end_lat','end_lon','hour'], 
            ['hour','temp','feelslike'],
            ['feelslike', 'dew','snowdepth'],
            ['snowdepth','winddir','sealevelpressure'],
            ['sealevelpressure','visibility','solarradiation'],
            ['solarradiation','dist','birthyear']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 12:41:51.552650
2023-03-30 12:42:37.473118
2023-03-30 12:42:37.935724
run_multiple_imputation iteration 1200 stopped 2023-03-30 12:42:37.945933
results are saving for the columns ['dist', 'birthyear', 'start_lat']  =  2.2486934530162555 0.5218449285530193
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 12:42:42.785614
2023-03-30 12:43:25.223863
2023-03-30 12:43:26.214287
run_multiple_imputation iteration 1200 stopped 2023-03-30 12:43:26.226810
results ar

In [16]:
results

{('dist', 'birthyear', 'start_lat'): {'rmse': 2.2486934530162555,
  'r2': 0.5218449285530193},
 ('start_lat', 'start_lon', 'end_lat'): {'rmse': 2.2358922074551693,
  'r2': 0.5272734656241131},
 ('end_lat', 'end_lon', 'hour'): {'rmse': 2.23602896292287,
  'r2': 0.5272156364361573},
 ('hour', 'temp', 'feelslike'): {'rmse': 2.2360334644730817,
  'r2': 0.5272137328251312},
 ('feelslike', 'dew', 'snowdepth'): {'rmse': 2.2359800496491786,
  'r2': 0.5272363205850841},
 ('snowdepth', 'winddir', 'sealevelpressure'): {'rmse': 2.236004588670733,
  'r2': 0.5272259437288325},
 ('sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2359747226280904, 'r2': 0.5272385732161757},
 ('solarradiation', 'dist', 'birthyear'): {'rmse': 2.2439845618587895,
  'r2': 0.523845399285698}}

### Multivariate simulation with 4 variables

In [17]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon'],
            ['start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp'], 
            ['hour','temp','feelslike','dew'],
            ['feelslike', 'dew','snowdepth','winddir'],
            ['snowdepth','winddir','sealevelpressure','visibility'],
            ['sealevelpressure','visibility','solarradiation','dist'],
            ['solarradiation','dist','birthyear','start_lat']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 12:50:06.079062
2023-03-30 12:50:58.022038
2023-03-30 12:50:58.523562
run_multiple_imputation iteration 1200 stopped 2023-03-30 12:50:58.534972
results are saving for the columns ['dist', 'birthyear', 'start_lat', 'start_lon']  =  2.248669242740133 0.5218552244908461
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 12:51:03.841722
2023-03-30 12:51:45.202108
2023-03-30 12:51:45.614662
run_multiple_imputation iteration 1200 stopped 2023-03-30 12:51:45.63203

In [18]:
results

{('dist', 'birthyear', 'start_lat', 'start_lon'): {'rmse': 2.248669242740133,
  'r2': 0.5218552244908461},
 ('start_lat', 'start_lon', 'end_lat', 'end_lon'): {'rmse': 2.2359360973122566,
  'r2': 0.5272549064971541},
 ('end_lat', 'end_lon', 'hour', 'temp'): {'rmse': 2.2360236237168487,
  'r2': 0.5272178942692611},
 ('hour', 'temp', 'feelslike', 'dew'): {'rmse': 2.2360378067108657,
  'r2': 0.527211896580353},
 ('feelslike', 'dew', 'snowdepth', 'winddir'): {'rmse': 2.235976872853207,
  'r2': 0.5272376639536758},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.2360095717631143, 'r2': 0.5272238365064815},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 2.2478443685822698, 'r2': 0.5222059536433759},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 2.243987021377721, 'r2': 0.5238443555067731}}

### Multivaraite simulation with 5 variables 

In [19]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat'],
            ['start_lat','start_lon','end_lat','end_lon','hour'],
            ['end_lat','end_lon','hour','temp','feelslike'],
            ['hour','temp','feelslike','dew','snowdepth'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['solarradiation','dist','birthyear','start_lat','start_lon']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 12:58:16.628826
2023-03-30 12:58:58.483103
2023-03-30 12:58:58.970655
run_multiple_imputation iteration 1200 stopped 2023-03-30 12:58:58.985671
results are saving for the columns ['dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat']  =  2.2486776480586284 0.5218516499619114
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 12:59:04.669150
2023-03-30 12:59:49.382733
2023-03-30 12:59:49.879853
run_multiple_imputation iteration 1200 stopped 2023-03-30 12

In [20]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 2.2486776480586284, 'r2': 0.5218516499619114},
 ('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour'): {'rmse': 2.235920241374865, 'r2': 0.5272616113310138},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike'): {'rmse': 2.236042223748235, 'r2': 0.5272100287006805},
 ('hour', 'temp', 'feelslike', 'dew', 'snowdepth'): {'rmse': 2.236046220001326,
  'r2': 0.5272083387593614},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.235977525862421, 'r2': 0.5272373878164669},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.235955324668091, 'r2': 0.527246775963169},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 2.247895038096743, 'r2': 0.5221844131261192},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 2.2439646440280234, 'r2': 0.5238

### Multivaraite simulation with 6 variables 

In [21]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew'],
            ['hour','temp','feelslike','dew','snowdepth','winddir'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 13:06:03.048893
2023-03-30 13:06:36.712852
2023-03-30 13:06:37.095618
run_multiple_imputation iteration 1200 stopped 2023-03-30 13:06:37.108603
results are saving for the columns ['dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat', 'end_lon']  =  2.2486932255824486 0.5218450252746267
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 13:06:39.944360
2023-03-30 13:07:07.964307
2023-03-30 13:07:08.280894
run_multiple_imputation iteration 1200 stopped 20

In [22]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lon'): {'rmse': 2.2486932255824486, 'r2': 0.5218450252746267},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew'): {'rmse': 2.236046188794811, 'r2': 0.5272083519560296},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir'): {'rmse': 2.2360434255331394, 'r2': 0.5272095204884497},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.2359819079719836, 'r2': 0.5272355347570601},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 2.2465151587511647, 'r2': 0.522770851089309},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 2.247930169659245, 'r2': 0.5221694777852384},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 2.243967002258681, 'r2': 0.5238528512546705}}

### Multivaraite simulation with 7 variables 

In [23]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew','snowdepth'],
            ['hour','temp','feelslike','dew','snowdepth','winddir','sealevelpressure'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat','start_lon'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat','end_lat']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 13:11:06.900427
2023-03-30 13:11:34.857101
2023-03-30 13:11:35.177449
run_multiple_imputation iteration 1200 stopped 2023-03-30 13:11:35.183469
results are saving for the columns ['dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat', 'end_lat', 'end_lon']  =  2.2488000440263765 0.521799597127392
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 13:11:37.969537
2023-03-30 13:12:04.570884
2023-03-30 13:12:04.893122
run_multiple_imputation iteration 1200 

In [24]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lat',
  'end_lon'): {'rmse': 2.2488000440263765, 'r2': 0.521799597127392},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth'): {'rmse': 2.236034427602278, 'r2': 0.5272133255375275},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.236037718693795, 'r2': 0.5272119338010324},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2359704115536005, 'r2': 0.5272403962312047},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 2.2466070914333085, 'r2': 0.5227317916125208},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 2.248068453355781, 'r2': 0.5221106875219175},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_l

### with all the variables

In [25]:
cols = ['start_lat','start_lon','end_lat',
                     'end_lon','hour','temp', 'feelslike', 'dew','snowdepth',
                     'winddir','sealevelpressure','visibility','solarradiation','dist','birthyear']

X_train, X_test, y_train, y_test = split_dataset(df)
X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
X_train_encoded = pd.DataFrame(X_train_encoded)
X_test_encoded = pd.DataFrame(X_test_encoded)
X_train_data = X_train_encoded.copy()
X_test_data = X_test_encoded.copy()
y_predict = run_multiple_imputation(
    X_train_data, X_test_data, y_train, y_test, random_state=1200
)
rmse = np.sqrt(mean_squared_error(y_test, y_predict))
r2 = r2_score(y_test, y_predict)
print("results are saving for the columns", cols, " = ", rmse, r2)   

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 2523 for the train
Train set after Amputation (252346, 21)
number of missing values produced 631 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 13:17:20.650491
2023-03-30 13:18:15.339309
2023-03-30 13:18:15.652081
run_multiple_imputation iteration 1200 stopped 2023-03-30 13:18:15.661482
results are saving for the columns ['start_lat', 'start_lon', 'end_lat', 'end_lon', 'hour', 'temp', 'feelslike', 'dew', 'snowdepth', 'winddir', 'sealevelpressure', 'visibility', 'solarradiation', 'dist', 'birthyear']  =  2.2460181742163265 0.5229819774386011
