In [2]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import plotly.io as pio
pio.renderers.default='notebook'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression, BayesianRidge
import time
import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from itertools import combinations
from sklearn.pipeline import Pipeline

In [3]:
pwd

'/home/jovyan/thesis notebooks'

In [4]:
#!pip install category_encoders

In [5]:
tripdata = pd.read_csv("/home/jovyan/data/final_bike_sharing.csv")

In [6]:
tripdata.head()

Unnamed: 0,index,tripduration,start_station_id,start_station_name,start_lat,start_lon,end_station_id,end_station_name,end_lat,end_lon,...,conditions,date,dist,birthyear,years_old,holiday,day,month,seasons,gender
0,259617,1.02,ss_3273,Manila & 1st,40.721651,-74.042884,es_3273,Manila & 1st,40.721651,-74.042884,...,Clear,2018-09-05,1.783012e-12,1994,24,working_day,weekday,September,autumn,male
1,283363,1.02,ss_3198,Heights Elevator,40.748716,-74.040443,es_3198,Heights Elevator,40.748716,-74.040443,...,cloudy_rain,2018-10-06,1.782506e-12,1994,24,working_day,weekend,October,autumn,male
2,169168,1.02,ss_3213,Van Vorst Park,40.718489,-74.047727,es_3185,City Hall,40.717733,-74.043845,...,Clear,2018-07-09,0.3384331,1963,55,working_day,weekday,July,summer,female
3,144929,1.02,ss_3183,Exchange Place,40.716247,-74.033459,es_3183,Exchange Place,40.716247,-74.033459,...,Clear,2018-07-16,1.783112e-12,1985,33,working_day,weekday,July,summer,male
4,326659,1.02,ss_3272,Jersey & 3rd,40.723332,-74.045953,es_3211,Newark Ave,40.721525,-74.046305,...,cloudy_rain,2018-11-02,0.159532,1987,31,working_day,weekday,November,autumn,male


In [7]:
selected_features = ['tripduration','start_lat','start_lon','end_lat','end_lon','usertype','hour',
                     'temp','feelslike','dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation',
                     'conditions','dist','birthyear','holiday','day','month','gender']

In [8]:
df = tripdata[selected_features]

In [9]:
def split_dataset(df):
    X = df.drop('tripduration', axis=1)
    y = df['tripduration']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Train', X_train.shape, y_train.shape)
    print('Test', X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

In [10]:
def getSampleSize(df, perc):
    return round(perc/100 * df.shape[0])

def getAnIndex(index):
    li = []
    for i in index:
        li.append(i)
    return li  


def induceMissingValues(df , cols_list, perc, name ):
    random.seed(100)
    index = getAnIndex(df.index)
    sample_size = getSampleSize(df, perc)
    print(f"number of missing values produced {sample_size} for the {name}")
    for col in cols_list:
        idx = random.sample(index, sample_size)
        for i in idx:
             df.loc[i,col] = np.NaN
    return df

In [11]:
def ampute_each_variables(col_list, X_train, X_test):
    X_train = induceMissingValues(X_train, col_list, perc=5, name="train")
    print('Train set after Amputation', X_train.shape)
    X_test = induceMissingValues(X_test, col_list, perc=5, name="test")
    print('Test set after Amputation', X_test.shape)
    return X_train , X_test 

In [12]:
def one_hot_encoding(X_train, X_test):
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()
    ohe = OneHotEncoder(drop='first', sparse=False)
    X_train_encoded = ohe.fit_transform(X_train[categorical_features])
    X_test_encoded = ohe.fit_transform(X_test[categorical_features])
    X_train_new = np.hstack((X_train[numerical_cols].values, X_train_encoded))
    X_test_new = np.hstack((X_test[numerical_cols].values, X_test_encoded))
    return X_train_new, X_test_new

In [13]:
def model_evaluation(X_train,y_train):
    print(datetime.datetime.now())
    model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train) 
    print(datetime.datetime.now())
    return model_lr

In [14]:
def run_multiple_imputation(X_train_amputed, X_test_amputed, y_train, y_test, random_state=0):
    print(f"run_multiple_imputation iteration {random_state} started" ,  datetime.datetime.now())
    impute_estimator = BayesianRidge()
    imputer = IterativeImputer(
        estimator=impute_estimator, max_iter=5, sample_posterior=False, random_state=random_state, tol=1e-1
    )
    ## Fit and tranform the missing values
    X_train_imputed = imputer.fit_transform(X_train_amputed)
    X_test_imputed = imputer.fit_transform(X_test_amputed)

    predictors = [var for var in X_train_amputed.columns]
    
    ## adjust the columns names to the imputed cols
    X_train_imputed = pd.DataFrame(X_train_imputed)
    X_test_imputed = pd.DataFrame(X_test_imputed)
    
    ## Run Linear regression on the 
    model_reg = model_evaluation(X_train_imputed,y_train)
    y_predict = model_reg.predict(X_test_imputed)
    
    print(f"run_multiple_imputation iteration {random_state} stopped" ,  datetime.datetime.now())
    return y_predict

### Multiple Imputation simulation with 2 variables

In [15]:
results ={}
col_list = [ ['dist','birthyear'],['start_lat','start_lon'],['end_lat','end_lon'], ['hour','temp'], 
            ['feelslike', 'dew'], ['snowdepth','winddir'],['sealevelpressure','visibility'],
             ['solarradiation','dist']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}
    

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 10:48:29.313910
2023-03-30 10:49:05.707587
2023-03-30 10:49:06.015324
run_multiple_imputation iteration 1200 stopped 2023-03-30 10:49:06.020047
results are saving for the columns ['dist', 'birthyear']  =  2.2915111558741845 0.5034623240045066
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 10:49:31.820088
2023-03-30 10:50:05.528314
2023-03-30 10:50:05.908491
run_multiple_imputation iteration 1200 stopped 2023-03-30 10:50:05.912968
results are saving 

In [16]:
results

{('dist', 'birthyear'): {'rmse': 2.2915111558741845, 'r2': 0.5034623240045066},
 ('start_lat', 'start_lon'): {'rmse': 2.236518278755649,
  'r2': 0.5270086926166712},
 ('end_lat', 'end_lon'): {'rmse': 2.2362419248782834,
  'r2': 0.5271255751007236},
 ('hour', 'temp'): {'rmse': 2.236285533520417, 'r2': 0.5271071320043772},
 ('feelslike', 'dew'): {'rmse': 2.2359825133342275, 'r2': 0.5272352787676935},
 ('snowdepth', 'winddir'): {'rmse': 2.2359584870695586,
  'r2': 0.5272454386944586},
 ('sealevelpressure', 'visibility'): {'rmse': 2.2359974056369087,
  'r2': 0.5272289812419038},
 ('solarradiation', 'dist'): {'rmse': 2.2909014517811723,
  'r2': 0.5037265171569979}}

### Multivariate simulation with 3 variables

In [17]:
results ={}
col_list = [ ['dist','birthyear','start_lat'],
            ['start_lat','start_lon','end_lat'],
            ['end_lat','end_lon','hour'], 
            ['hour','temp','feelslike'],
            ['feelslike', 'dew','snowdepth'],
            ['snowdepth','winddir','sealevelpressure'],
            ['sealevelpressure','visibility','solarradiation'],
            ['solarradiation','dist','birthyear']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 10:57:19.765164
2023-03-30 10:57:55.702434
2023-03-30 10:57:56.138972
run_multiple_imputation iteration 1200 stopped 2023-03-30 10:57:56.146366
results are saving for the columns ['dist', 'birthyear', 'start_lat']  =  2.291785055357657 0.5033436167133709
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 10:58:33.783866
2023-03-30 10:59:08.052591
2023-03-30 10:59:08.401012
run_multiple_imputation iteration 1200 stopped 2023-03-30 10:59:08.405862
results

In [18]:
results

{('dist', 'birthyear', 'start_lat'): {'rmse': 2.291785055357657,
  'r2': 0.5033436167133709},
 ('start_lat', 'start_lon', 'end_lat'): {'rmse': 2.2365006382595327,
  'r2': 0.527016154007436},
 ('end_lat', 'end_lon', 'hour'): {'rmse': 2.236477173576988,
  'r2': 0.5270260787598677},
 ('hour', 'temp', 'feelslike'): {'rmse': 2.236311726175584,
  'r2': 0.5270960543560406},
 ('feelslike', 'dew', 'snowdepth'): {'rmse': 2.235966596287557,
  'r2': 0.527242009581824},
 ('snowdepth', 'winddir', 'sealevelpressure'): {'rmse': 2.235949636364221,
  'r2': 0.5272491813426894},
 ('sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.23605670498922, 'r2': 0.5272039048380744},
 ('solarradiation', 'dist', 'birthyear'): {'rmse': 2.291044629411829,
  'r2': 0.5036644826405889}}

### Multivariate simulation with 4 variables

In [19]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon'],
            ['start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp'], 
            ['hour','temp','feelslike','dew'],
            ['feelslike', 'dew','snowdepth','winddir'],
            ['snowdepth','winddir','sealevelpressure','visibility'],
            ['sealevelpressure','visibility','solarradiation','dist'],
            ['solarradiation','dist','birthyear','start_lat']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 11:08:21.544440
2023-03-30 11:08:55.699739
2023-03-30 11:08:56.041260
run_multiple_imputation iteration 1200 stopped 2023-03-30 11:08:56.048393
results are saving for the columns ['dist', 'birthyear', 'start_lat', 'start_lon']  =  2.291781807913762 0.5033450242298337
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 11:09:46.416767
2023-03-30 11:10:20.398096
2023-03-30 11:10:20.802069
run_multiple_imputation iteration 1200 stopped 2023-03-30 11:10:20.8

In [20]:
results

{('dist', 'birthyear', 'start_lat', 'start_lon'): {'rmse': 2.291781807913762,
  'r2': 0.5033450242298337},
 ('start_lat', 'start_lon', 'end_lat', 'end_lon'): {'rmse': 2.2363455628687596,
  'r2': 0.5270817436259406},
 ('end_lat', 'end_lon', 'hour', 'temp'): {'rmse': 2.2365417790847966,
  'r2': 0.5269987526041874},
 ('hour', 'temp', 'feelslike', 'dew'): {'rmse': 2.2363193660788014,
  'r2': 0.5270928231906311},
 ('feelslike', 'dew', 'snowdepth', 'winddir'): {'rmse': 2.2359836316631023,
  'r2': 0.5272348058600855},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.235952967569409, 'r2': 0.5272477726963343},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 2.2833771589608944, 'r2': 0.5069811091908154},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 2.2911897897702103, 'r2': 0.5036015851106702}}

### Multivaraite simulation with 5 variables 

In [21]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat'],
            ['start_lat','start_lon','end_lat','end_lon','hour'],
            ['end_lat','end_lon','hour','temp','feelslike'],
            ['hour','temp','feelslike','dew','snowdepth'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['solarradiation','dist','birthyear','start_lat','start_lon']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 11:21:03.093788
2023-03-30 11:21:37.744200
2023-03-30 11:21:38.080205
run_multiple_imputation iteration 1200 stopped 2023-03-30 11:21:38.085128
results are saving for the columns ['dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat']  =  2.2918031751160624 0.5033357631590155
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 11:22:40.899627
2023-03-30 11:23:14.908309
2023-03-30 11:23:15.243736
run_multiple_imputation iteration 1200 stopped 2023-03-3

In [22]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 2.2918031751160624, 'r2': 0.5033357631590155},
 ('start_lat',
  'start_lon',
  'end_lat',
  'end_lon',
  'hour'): {'rmse': 2.2365716635495168, 'r2': 0.52698611212176},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike'): {'rmse': 2.2365869093223427, 'r2': 0.5269796634244306},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth'): {'rmse': 2.2363549094853417, 'r2': 0.5270777905742055},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.236001914152918, 'r2': 0.5272270747120925},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.2360606172649238, 'r2': 0.5272022503985513},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 2.283620014124155, 'r2': 0.5068762307216766},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 2.29127197051464, 'r2':

### Multivaraite simulation with 6 variables 

In [23]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew'],
            ['hour','temp','feelslike','dew','snowdepth','winddir'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 11:36:01.312174
2023-03-30 11:36:37.175010
2023-03-30 11:36:37.525814
run_multiple_imputation iteration 1200 stopped 2023-03-30 11:36:37.530477
results are saving for the columns ['dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat', 'end_lon']  =  2.2919994197023428 0.5032507018921824
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 11:37:52.594172
2023-03-30 11:38:26.722261
2023-03-30 11:38:27.044790
run_multiple_imputation iteration 1200 stoppe

In [24]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lon'): {'rmse': 2.2919994197023428, 'r2': 0.5032507018921824},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew'): {'rmse': 2.236549969968277, 'r2': 0.5269952880543773},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir'): {'rmse': 2.236347467518134, 'r2': 0.5270809380761547},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility'): {'rmse': 2.235993800780427, 'r2': 0.5272305056357094},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist'): {'rmse': 2.2906060192839686, 'r2': 0.5038545068272053},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat'): {'rmse': 2.283969982222442, 'r2': 0.5067250752662416},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat'): {'rmse': 2.2913398423844953, 'r2': 0.5035365635927707}}

### Multivaraite simulation with 7 variables 

In [25]:
results ={}
col_list = [['dist','birthyear','start_lat','start_lon','end_lat','end_lat','end_lon'],
            ['end_lat','end_lon','hour','temp','feelslike','dew','snowdepth'],
            ['hour','temp','feelslike','dew','snowdepth','winddir','sealevelpressure'],
            ['feelslike', 'dew','snowdepth','winddir','sealevelpressure','visibility','solarradiation'],
            ['snowdepth','winddir','sealevelpressure','visibility','solarradiation','dist','birthyear'],
            ['sealevelpressure','visibility','solarradiation','dist','birthyear','start_lat','start_lon'],
            ['solarradiation','dist','birthyear','start_lat','start_lon','end_lat','end_lat']]
for cols in col_list:
    X_train, X_test, y_train, y_test = split_dataset(df)
    X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
    X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
    X_train_encoded = pd.DataFrame(X_train_encoded)
    X_test_encoded = pd.DataFrame(X_test_encoded)
    X_train_data = X_train_encoded.copy()
    X_test_data = X_test_encoded.copy()
    y_predict = run_multiple_imputation(
        X_train_data, X_test_data, y_train, y_test, random_state=1200
    )
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    r2 = r2_score(y_test, y_predict)
    print("results are saving for the columns", cols, " = ", rmse, r2)
    results[tuple(cols)] = {"rmse": rmse, "r2": r2}

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 11:50:43.797241
2023-03-30 11:51:17.391796
2023-03-30 11:51:17.829971
run_multiple_imputation iteration 1200 stopped 2023-03-30 11:51:17.834817
results are saving for the columns ['dist', 'birthyear', 'start_lat', 'start_lon', 'end_lat', 'end_lat', 'end_lon']  =  2.292037547922031 0.5032341745547464
Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 11:52:45.069346
2023-03-30 11:53:21.373051
2023-03-30 11:53:21.747282
run_multiple_imputation iteration 1

In [26]:
results

{('dist',
  'birthyear',
  'start_lat',
  'start_lon',
  'end_lat',
  'end_lat',
  'end_lon'): {'rmse': 2.292037547922031, 'r2': 0.5032341745547464},
 ('end_lat',
  'end_lon',
  'hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth'): {'rmse': 2.236554682210444, 'r2': 0.5269932948817437},
 ('hour',
  'temp',
  'feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure'): {'rmse': 2.236360062914043, 'r2': 0.5270756109794682},
 ('feelslike',
  'dew',
  'snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation'): {'rmse': 2.236113029560979, 'r2': 0.5271800857882933},
 ('snowdepth',
  'winddir',
  'sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear'): {'rmse': 2.2909180109864584, 'r2': 0.5037193427548213},
 ('sealevelpressure',
  'visibility',
  'solarradiation',
  'dist',
  'birthyear',
  'start_lat',
  'start_lon'): {'rmse': 2.2842966924603365, 'r2': 0.5065839442569255},
 ('solarradiation',
  'dist',
  'birthyear',
  'start_l

### with all the variables

In [27]:
cols = ['start_lat','start_lon','end_lat',
                     'end_lon','hour','temp', 'feelslike', 'dew','snowdepth',
                     'winddir','sealevelpressure','visibility','solarradiation','dist','birthyear']

X_train, X_test, y_train, y_test = split_dataset(df)
X_ampute_train , X_ampute_test  = ampute_each_variables(cols, X_train, X_test)
X_train_encoded , X_test_encoded =  one_hot_encoding(X_ampute_train, X_ampute_test)
X_train_encoded = pd.DataFrame(X_train_encoded)
X_test_encoded = pd.DataFrame(X_test_encoded)
X_train_data = X_train_encoded.copy()
X_test_data = X_test_encoded.copy()
y_predict = run_multiple_imputation(
    X_train_data, X_test_data, y_train, y_test, random_state=1200
)
rmse = np.sqrt(mean_squared_error(y_test, y_predict))
r2 = r2_score(y_test, y_predict)
  

Train (252346, 21) (252346,)
Test (63087, 21) (63087,)
number of missing values produced 12617 for the train
Train set after Amputation (252346, 21)
number of missing values produced 3154 for the test
Test set after Amputation (63087, 21)
run_multiple_imputation iteration 1200 started 2023-03-30 12:08:55.827209
2023-03-30 12:10:03.788219
2023-03-30 12:10:04.206036
run_multiple_imputation iteration 1200 stopped 2023-03-30 12:10:04.212027


In [28]:
print("results are saving for the columns", cols, " = ", rmse, r2) 

results are saving for the columns ['start_lat', 'start_lon', 'end_lat', 'end_lon', 'hour', 'temp', 'feelslike', 'dew', 'snowdepth', 'winddir', 'sealevelpressure', 'visibility', 'solarradiation', 'dist', 'birthyear']  =  2.28345331353216 0.506948222579671
