In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, mean_squared_error
from math import sqrt

In [6]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this session.
pd.set_option("mode.chained_assignment", None)

# Load the training data; `df` stays untouched as the reference frame and
# `df_miss` is a separate deep copy that later cells are free to modify.
df = pd.read_csv("data/train.csv")
df_miss = df.copy(deep=True)

# Removing values
Create random missing values and save the true values to lists, so that the accuracy of the imputation methods can be tested.

In [7]:
# Seed NumPy's legacy global generator so the same rows are masked on every run.
np.random.seed(42)

row_count = df_miss.shape[0]

# Row positions to blank out in each column. The fractions are the share of
# rows that lose their value: 12% (Hepatomegaly), 3% (Cholesterol) and
# 23% (Copper). The three draws are independent of each other; their order is
# kept so the seeded random stream yields the same positions as before.
chosen_idx1 = list(np.random.choice(row_count, replace=False, size=round(row_count * 0.12)))
chosen_idx2 = list(np.random.choice(row_count, replace=False, size=round(row_count * 0.03)))
chosen_idx3 = list(np.random.choice(row_count, replace=False, size=round(row_count * 0.23)))

# Ground truth comes from the untouched frame `df`, captured before masking.
hep_true = df["Hepatomegaly"].iloc[chosen_idx1].tolist()
chol_true = df["Cholesterol"].iloc[chosen_idx2].tolist()
cop_true = df["Copper"].iloc[chosen_idx3].tolist()

# Mask with one positional `.iloc[rows, column]` write on the frame itself.
# The chained form `df_miss[col].iloc[rows] = None` assigns into an intermediate
# Series and leaves `df_miss` unchanged when pandas copy-on-write is active.
df_miss.iloc[chosen_idx1, df_miss.columns.get_loc("Hepatomegaly")] = None
df_miss.iloc[chosen_idx2, df_miss.columns.get_loc("Cholesterol")] = None
df_miss.iloc[chosen_idx3, df_miss.columns.get_loc("Copper")] = None

In [8]:
# Per-column count of missing entries in the masked frame.
missing_mask = df_miss.isna()
missing_mask.sum()

id                  0
N_Days              0
Drug                0
Age                 0
Sex                 0
Ascites             0
Hepatomegaly      949
Spiders             0
Edema               0
Bilirubin           0
Cholesterol       237
Albumin             0
Copper           1818
Alk_Phos            0
SGOT                0
Tryglicerides       0
Platelets           0
Prothrombin         0
Stage               0
Status              0
dtype: int64

# Imputing values

In [9]:
# Three independent working copies of the masked frame, so each imputation
# attempt starts from the same data.
df_imp1, df_imp2, df_imp3 = (df_miss.copy() for _ in range(3))

For the first imputation method, I simply filled the categorical column with its mode and the numerical columns with their means.

In [10]:
# Replacement value per column: the most frequent value for the categorical
# column, the arithmetic mean for the two numeric columns.
fill_values = {
    "Hepatomegaly": df_imp1["Hepatomegaly"].mode().iat[0],
    "Cholesterol": df_imp1["Cholesterol"].mean(),
    "Copper": df_imp1["Copper"].mean(),
}
for column, fill_value in fill_values.items():
    df_imp1[column] = df_imp1[column].fillna(fill_value)

# Imputed values at exactly the positions that were masked, for scoring.
hep_pred1 = df_imp1["Hepatomegaly"].iloc[chosen_idx1].tolist()
chol_pred1 = df_imp1["Cholesterol"].iloc[chosen_idx2].tolist()
cop_pred1 = df_imp1["Copper"].iloc[chosen_idx3].tolist()

# Displayed result: remaining missing values per column.
df_imp1.isna().sum()

id               0
N_Days           0
Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
Status           0
dtype: int64

In [20]:
# Score the first imputation against the held-out true values:
# F1 with "Y" as the positive class for the categorical column,
# root-mean-squared error for the two numeric columns.
hep_score1 = f1_score(hep_true, hep_pred1, pos_label="Y")
chol_score1 = sqrt(mean_squared_error(chol_true, chol_pred1))
cop_score1 = sqrt(mean_squared_error(cop_true, cop_pred1))

# Label spelling corrected from "Hepatomgegaly" so it matches the column name.
print(f"Hepatomegaly F1 Score: {hep_score1}")
print(f"Cholesterol RMSE Score: {chol_score1}")
print(f"Copper RMSE Score: {cop_score1}")

Hepatomgegaly F1 Score: 0.6586572438162545
Cholesterol RMSE Score: 162.25561604094258
Copper RMSE Score: 77.15622803998464
