# Hot Deck Imputation Tests/Trials

This notebook is used to test the implementation of the `hotDeckImputer`.

The data fed to it should be label encoded then one-hot encoded. It should still have missing values as dictated by the indices affected when creating missing values in the original full data.

Therefore, the data loaded needs to be processed to such a state before it is ready for use.

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Make the Mini_DIVA folder on the mounted drive the working directory.
# NOTE(review): the later `imputers.hotDeckImputer` import and the relative
# dataset path presumably rely on this working directory — confirm.
%cd '/content/drive/My Drive/Mini_DIVA'

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
from imputers.hotDeckImputer import hotDeckImputer

# Display every column when a DataFrame is rendered (no column truncation).
pd.set_option("display.max_columns", None)
# Suppress all Python warnings for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation warnings;
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

## Loading data

In [None]:
# Read the Automobile dataset into a DataFrame.
# The path uses forward slashes only: a backslash is not a path separator on
# the Linux Colab runtime, and "\A" is an invalid escape sequence in a normal
# string literal. Forward slashes also work on Windows.
file_dir = "../Mini_DIVA/datasets/Automobile.csv"

df = pd.read_csv(file_dir)
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,height,curb_weight,engine_type,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
1,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
2,1,158,audi,gas,std,four,sedan,fwd,front,105.8,192.7,71.4,55.7,2844,ohc,five,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710
3,1,158,audi,gas,turbo,four,sedan,fwd,front,105.8,192.7,71.4,55.9,3086,ohc,five,131,mpfi,3.13,3.4,8.3,140,5500,17,20,23875
4,2,192,bmw,gas,std,two,sedan,rwd,front,101.2,176.8,64.8,54.3,2395,ohc,four,108,mpfi,3.5,2.8,8.8,101,5800,23,29,16430


In [None]:
# function for determining a categorical variable vs not (from utils.py)
def iscategorical(x, threshold=0.12):
    """
    Decide whether x should be treated as a categorical variable.

    A variable is categorical when it holds non-numeric data (object,
    boolean or string values), or when the number of distinct non-missing
    values is smaller than ``threshold`` times the number of non-missing
    values.


    Inputs:
    ------------------------------------------------------------
    x: pd.Series, pd.DataFrame, np.ndarray or any array-like, a vector
    threshold: float, maximum ratio of distinct values to non-missing
        values for a numeric vector to count as categorical


    Outputs:
    ------------------------------------------------------------
    Bool value
    """
    # Normalise every supported input to a NumPy array so the dtype and NaN
    # handling below behave the same for Series, DataFrames, arrays and lists.
    if isinstance(x, (pd.DataFrame, pd.Series)):
        values = x.to_numpy()
    else:
        values = np.asarray(x)

    # Check the dtype *kind* rather than comparing against dtype names:
    # fixed-width string dtypes such as "<U5" do not compare equal to "str",
    # so they would otherwise reach np.isnan below and raise TypeError.
    # O = object, b = bool, U = unicode string, S = byte string.
    if values.dtype.kind in "ObUS":
        return True

    # Numeric data: compare cardinality against the share of observed values.
    observed = values[~np.isnan(values)]
    return bool(len(np.unique(observed)) < threshold * len(observed))

In [None]:
# Flag every column as categorical or not, then keep the names of the
# categorical ones.
categorical_mask = {}
for name in df.columns:
    categorical_mask[name] = iscategorical(df[name])
cat_vars = [name for name, flagged in categorical_mask.items() if flagged]

# Encode on a copy so the original dataframe stays unaltered.
df_le = df.copy()

# Each categorical column gets its own freshly fitted label encoder.
for name in cat_vars:
    df_le[name] = LabelEncoder().fit_transform(df[name])

df_le.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,height,curb_weight,engine_type,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,4,164,0,1,0,0,3,1,0,99.8,176.6,66.2,54.3,2337,2,2,109,4,3.19,3.4,10.0,102,5500,24,30,13950
1,4,164,0,1,0,0,3,0,0,99.4,176.6,66.4,54.3,2824,2,1,136,4,3.19,3.4,8.0,115,5500,18,22,17450
2,3,158,0,1,0,0,3,1,0,105.8,192.7,71.4,55.7,2844,2,1,136,4,3.19,3.4,8.5,110,5500,19,25,17710
3,3,158,0,1,1,0,3,1,0,105.8,192.7,71.4,55.9,3086,2,1,131,4,3.13,3.4,8.3,140,5500,17,20,23875
4,4,192,1,1,0,1,3,2,0,101.2,176.8,64.8,54.3,2395,2,2,108,4,3.5,2.8,8.8,101,5800,23,29,16430


## Label Encoding

In [None]:
# Copy of the label-encoded frame in which a fraction of every column is
# blanked out. The copy is cast to float up front so that writing NaN never
# depends on pandas silently upcasting integer columns.
df_frac = df_le.astype(float)
random_state = 20

for col in df_frac.columns:
    # Draw 8% of the rows of this column; a different seed per column keeps
    # the missing rows from lining up across columns.
    missing = df_frac[col].sample(frac=0.08, random_state=random_state, replace=False).index.to_list()
    # `missing` holds index *labels*, so select with label-based `.loc`
    # (positional `.iloc` is only equivalent on a default RangeIndex).
    df_frac.loc[missing, col] = np.nan
    random_state += 2

In [None]:
# Record, for every column, the row positions that currently hold NaN.
missing_idx = {}
# NOTE(review): complete_idx is created here but never filled in this cell.
complete_idx = {}

for col in df_frac.columns:
    nan_positions = np.flatnonzero(np.isnan(df_frac[col].to_numpy()))
    missing_idx[col] = nan_positions.tolist()

## One-hot encoding

Impute the data with mean and/or mode for it to work with the ohe encoder.

In [None]:
# dictionary to save the values to be imputed per column
imputed_value = {}

# obtaining the values to use
for col in df_le.columns:
    if col in cat_vars:
        imputed_value[col] = float(df_le[col].mode())
    else:
        imputed_value[col] = float(df_le[col].mean())

In [None]:
# Instantiate the one-hot encoder, dropping the first level of every feature.
# The `sparse=` keyword is not passed: it was removed in scikit-learn 1.4 and
# its replacement `sparse_output=` does not exist before 1.2. The default
# sparse result is densified below instead, which works on every version.
ohe = OneHotEncoder(drop="first")

# fitting to df_le because it is the last version of data that is complete
ohe.fit(df_le[cat_vars])

# one-hot encode the data and convert the sparse result to a dense ndarray
cat_transComp = ohe.transform(df_le[cat_vars]).toarray()
cat_transNames = ohe.get_feature_names_out()

# switch them back to dataframes: encoded categorical columns first, then the
# untouched non-categorical columns, aligned on the original index
cat_oheComp = pd.DataFrame(cat_transComp, columns=cat_transNames, index=df_le.index)
df_oheComp = cat_oheComp.join(df_le[[col for col in df_le.columns if col not in cat_vars]])

df_oheComp.sample(5)

Unnamed: 0,symboling_1,symboling_2,symboling_3,symboling_4,symboling_5,make_1,make_2,make_3,make_4,make_5,make_6,make_7,make_8,make_9,make_10,make_11,make_12,make_13,make_14,make_15,make_16,make_17,fuel_type_1,aspiration_1,num_of_doors_1,body_style_1,body_style_2,body_style_3,body_style_4,drive_wheels_1,drive_wheels_2,engine_type_1,engine_type_2,engine_type_3,engine_type_4,num_of_cylinders_1,num_of_cylinders_2,num_of_cylinders_3,num_of_cylinders_4,fuel_system_1,fuel_system_2,fuel_system_3,fuel_system_4,fuel_system_5,normalized_losses,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
142,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,94,97.3,171.7,65.5,55.7,2264,97,3.01,3.4,23.0,52,4800,37,46,7995
153,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,74,104.3,188.8,67.2,57.5,3157,130,3.62,3.15,7.5,162,5100,17,22,18950
51,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,161,93.7,157.3,64.4,50.8,2004,92,2.97,3.23,9.4,68,5500,31,38,6669
122,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,168,94.5,168.7,64.0,52.6,2169,98,3.19,3.03,9.0,70,4800,29,34,8058
35,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,104,93.1,159.1,64.2,54.1,1905,91,3.03,3.15,9.0,68,5000,31,38,6795


In [None]:
# Convert the previously-missing records back to NaN in the one-hot frame.
df_oheMiss = df_oheComp.copy()

for col, rows in missing_idx.items():
    if not rows:
        continue

    if col in cat_vars:
        # One-hot columns are named "<feature>_<category>". Match on the exact
        # feature name rather than a substring, so that one feature can never
        # blank out the columns of another whose name merely contains it.
        targets = [name for name in cat_transNames if name.rsplit("_", 1)[0] == col]
    else:
        targets = [col]

    # A categorical feature may have no one-hot columns left after
    # drop="first"; skip anything that is not present in the frame.
    targets = [name for name in targets if name in df_oheMiss.columns]
    if not targets:
        continue

    # Cast to float first so NaN can be stored without implicit upcasting.
    df_oheMiss[targets] = df_oheMiss[targets].astype(float)

    # `rows` are row positions, so write with positional `.iloc` in a single
    # assignment. Chained `df[col][idx] = ...` does not modify the frame when
    # pandas Copy-on-Write is active.
    df_oheMiss.iloc[rows, df_oheMiss.columns.get_indexer(targets)] = np.nan

The data is now ready to be fed to the imputer.

## Trying the hotDeck script implementation

New lists of numerical and categorical variables have to be created for the imputer: the data has been one-hot encoded since `cat_vars` was built, so the categorical column names have changed.

In [None]:
# Numeric columns: everything in the label-encoded frame that was not flagged
# as categorical.
categorical_names = set(cat_vars)
num_vars_ = [name for name in df_le.columns if name not in categorical_names]

# Categorical columns: every one-hot frame column that is not numeric.
numeric_names = set(num_vars_)
cat_vars_ = [name for name in df_oheComp.columns if name not in numeric_names]

In [None]:
# Instantiate the imputer with the numeric column names, the one-hot encoded
# categorical column names and a third positional argument.
# NOTE(review): the meaning of the literal 6 is defined by hotDeckImputer,
# which is not visible here — confirm against imputers/hotDeckImputer.py.
hot = hotDeckImputer(num_vars_, cat_vars_, 6)

# Fit on the frame that still contains NaNs; the value returned by `fit` is
# what the cells below inspect as the imputed data.
fitted_df = hot.fit(df_oheMiss)
# imputed_df = hot.transform(df_oheMiss)

In [None]:
# Average number of NaNs per column, before and after imputation.
avg_missing_before = df_oheMiss.isna().sum().mean()
avg_missing_after = fitted_df.isna().sum().mean()

print(f"Train data fed to the imputer had an average of {avg_missing_before} missing values per column.\n")
print(f"The resulting data after imputation has an average of {avg_missing_after} missing values per column")

Train data fed to the imputer had an average of 13.0 missing values per column.

The resulting data after imputation has an average of 0.0 missing values per column


In [None]:
# Check whether every imputed cell equals the corresponding cell of the
# complete data. The builtin all() cannot be used on a DataFrame: it iterates
# the column labels, which are non-empty strings and therefore always truthy,
# so it returns True whatever the values are. Reduce over the element-wise
# comparison instead.
bool(fitted_df.eq(df_oheComp).to_numpy().all())

True

In [None]:
# Display five randomly chosen rows of the frame returned by the imputer.
fitted_df.sample(5)

Unnamed: 0,symboling_1,symboling_2,symboling_3,symboling_4,symboling_5,make_1,make_2,make_3,make_4,make_5,make_6,make_7,make_8,make_9,make_10,make_11,make_12,make_13,make_14,make_15,make_16,make_17,fuel_type_1,aspiration_1,num_of_doors_1,body_style_1,body_style_2,body_style_3,body_style_4,drive_wheels_1,drive_wheels_2,engine_type_1,engine_type_2,engine_type_3,engine_type_4,num_of_cylinders_1,num_of_cylinders_2,num_of_cylinders_3,num_of_cylinders_4,fuel_system_1,fuel_system_2,fuel_system_3,fuel_system_4,fuel_system_5,normalized_losses,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
21,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,101.0,93.7,150.0,64.0,52.6,1837.0,79.0,2.91,3.07,10.1,60.0,5500.0,38.0,42.0,5399.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,83.0,93.7,157.9,64.369739,53.7,2120.0,108.0,3.3029,2.64,8.7,73.0,4400.0,26.0,31.0,7053.0
75,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,194.0,91.3,170.7,67.9,49.7,3139.0,181.0,3.43,3.27,7.8,200.0,5200.0,17.0,23.0,19699.0
51,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,161.0,93.7,157.3,63.768261,50.8,2004.0,92.0,2.97,3.23,9.4,68.0,5500.0,31.0,38.261814,6669.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,164.0,99.4,176.6,66.4,54.3,2824.0,136.0,3.19,3.4,8.0,115.0,5500.0,18.0,22.0,17450.0


The tests and trials of the hot deck imputer were successful.