In [19]:
import pandas as pd
import numpy as np
df = pd.read_excel("../@data/clinic_clean_use2.xlsx", index_col='Sample ID')
df

Unnamed: 0_level_0,day,Age,Stage,Genome Altered,Histologic Grade,MSIMANTIS Score,MSIsensor Score,Mutation Count,New Neoplasm Event,Radiation Therapy,Sex,Weight
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
TCGA-2V-A95S-01,1/1/2020,,STAGE II,0.2041,G3,0.3003,,85.0,,No,Male,78
TCGA-2Y-A9GS-01,1/2/2020,58.0,STAGE IV,0.3002,G2,0.3208,,65.0,Yes,No,Male,92
TCGA-2Y-A9GT-01,1/1/2021,51.0,STAGE I,0.0622,,0.3119,0.02,84.0,Yes,No,Male,122
TCGA-2Y-A9GU-01,1/2/2021,55.0,STAGE I,0.3157,,0.3194,0.09,138.0,No,No,Female,78
TCGA-2Y-A9GV-01,1/1/2022,54.0,STAGE I,0.0928,G1,0.2977,0.03,,Yes,No,Female,85
TCGA-2Y-A9GW-01,1/2/2022,64.0,STAGE I,0.1478,G2,0.3023,0.01,70.0,Yes,No,Male,139
TCGA-2Y-A9GX-01,1/1/2023,68.0,STAGE I,0.0122,G2,0.3126,0.0,55.0,Yes,No,Male,104
TCGA-2Y-A9GY-01,1/2/2023,64.0,STAGE II,,G3,,0.06,68.0,Yes,No,Female,92
TCGA-2Y-A9GZ-01,1/1/2024,82.0,STAGE II,0.4572,G2,0.3133,0.07,82.0,Yes,No,Female,51
TCGA-2Y-A9H0-01,1/2/2024,49.0,STAGE III,0.2037,,0.3054,0.15,73.0,No,No,Male,89


### Preprocessing
Since `IterativeImputer` and `KNNImputer` can only be applied to numeric data, here are the ways we can preprocess our data:
- First, drop columns: we can remove irrelevant columns from the dataframe.
- Second, convert categorical data into numeric.

In [20]:
# drop day from dataframe
df = df.drop(['day'],axis=1)

# convert Stage, Histologic Grade, New Neoplasm Event, Radiation Therapy, Sex into numeric
stage_mapping = {'STAGE I':1, 'STAGE II':2, 'STAGE III':3, 'STAGE IV':4}
df['Stage'] = df['Stage'].replace(stage_mapping)

neoevent_mapping = {'No':0,'Yes':1}
df['New Neoplasm Event'] = df['New Neoplasm Event'].replace(neoevent_mapping)

radiation_mapping = {'No':0, 'Yes':1}
df['Radiation Therapy'] = df['Radiation Therapy'].replace(radiation_mapping)

sex_mapping = {'Female':0, 'Male':1}
df['Sex'] = df['Sex'].replace(sex_mapping)

histologic_mapping = {'G1':1,'G2':2,'G3':3,'G4':4}
df['Histologic Grade'] = df['Histologic Grade'].replace(histologic_mapping)

df.head()

Unnamed: 0_level_0,Age,Stage,Genome Altered,Histologic Grade,MSIMANTIS Score,MSIsensor Score,Mutation Count,New Neoplasm Event,Radiation Therapy,Sex,Weight
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
TCGA-2V-A95S-01,,2,0.2041,3.0,0.3003,,85.0,,0.0,1,78
TCGA-2Y-A9GS-01,58.0,4,0.3002,2.0,0.3208,,65.0,1.0,0.0,1,92
TCGA-2Y-A9GT-01,51.0,1,0.0622,,0.3119,0.02,84.0,1.0,0.0,1,122
TCGA-2Y-A9GU-01,55.0,1,0.3157,,0.3194,0.09,138.0,0.0,0.0,0,78
TCGA-2Y-A9GV-01,54.0,1,0.0928,1.0,0.2977,0.03,,1.0,0.0,0,85


### 1. IterativeImputer:
- Complex Relationships: Use IterativeImputer when you believe that the missing values have complex relationships with other features in the dataset. It uses all other features to predict the missing values, which can be useful when the relationships are not simple.
- Non-Linear Relationships: If the relationships between features are non-linear, IterativeImputer can capture these relationships better than simple methods like mean or median imputation.

In [21]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [22]:
imputer = IterativeImputer()
df_imputer = imputer.fit_transform(df)
df_iter_imputed = pd.DataFrame(df_imputer, columns=df.columns)
df_iter_imputed

Unnamed: 0,Age,Stage,Genome Altered,Histologic Grade,MSIMANTIS Score,MSIsensor Score,Mutation Count,New Neoplasm Event,Radiation Therapy,Sex,Weight
0,65.053026,2.0,0.2041,3.0,0.3003,0.131361,85.0,0.661693,0.0,1.0,78.0
1,58.0,4.0,0.3002,2.0,0.3208,0.093247,65.0,1.0,0.0,1.0,92.0
2,51.0,1.0,0.0622,2.0121,0.3119,0.02,84.0,1.0,0.0,1.0,122.0
3,55.0,1.0,0.3157,1.893947,0.3194,0.09,138.0,0.0,0.0,0.0,78.0
4,54.0,1.0,0.0928,1.0,0.2977,0.03,87.567668,1.0,0.0,0.0,85.0
5,64.0,1.0,0.1478,2.0,0.3023,0.01,70.0,1.0,0.0,1.0,139.0
6,68.0,1.0,0.0122,2.0,0.3126,0.0,55.0,1.0,0.0,1.0,104.0
7,64.0,2.0,0.237981,3.0,0.309511,0.06,68.0,1.0,0.0,0.0,92.0
8,82.0,2.0,0.4572,2.0,0.3133,0.07,82.0,1.0,0.0,0.0,51.0
9,49.0,3.0,0.2037,2.149472,0.3054,0.15,73.0,0.0,0.0,1.0,89.0


### 2. KNNImputer:
- Local Relationships: Use KNNImputer when you believe that the missing values are related to nearby data points in the feature space. It imputes missing values based on the values of the nearest neighbors, which can be effective when the relationships are local.
- Small to Medium Sized Datasets: For small to medium sized datasets, KNNImputer can be computationally efficient and provide good imputations without the need for complex model fitting.

In [23]:
from sklearn.impute import KNNImputer

In [24]:
imp_knn = KNNImputer(n_neighbors=2)
df_imp_knn = imp_knn.fit_transform(df)
df_imp_knn = pd.DataFrame(df_imp_knn, columns=df.columns)
df_imp_knn

Unnamed: 0,Age,Stage,Genome Altered,Histologic Grade,MSIMANTIS Score,MSIsensor Score,Mutation Count,New Neoplasm Event,Radiation Therapy,Sex,Weight
0,56.5,2.0,0.2041,3.0,0.3003,0.09,85.0,1.0,0.0,1.0,78.0
1,58.0,4.0,0.3002,2.0,0.3208,0.045,65.0,1.0,0.0,1.0,92.0
2,51.0,1.0,0.0622,2.0,0.3119,0.02,84.0,1.0,0.0,1.0,122.0
3,55.0,1.0,0.3157,2.0,0.3194,0.09,138.0,0.0,0.0,0.0,78.0
4,54.0,1.0,0.0928,1.0,0.2977,0.03,96.0,1.0,0.0,0.0,85.0
5,64.0,1.0,0.1478,2.0,0.3023,0.01,70.0,1.0,0.0,1.0,139.0
6,68.0,1.0,0.0122,2.0,0.3126,0.0,55.0,1.0,0.0,1.0,104.0
7,64.0,2.0,0.1965,3.0,0.30925,0.06,68.0,1.0,0.0,0.0,92.0
8,82.0,2.0,0.4572,2.0,0.3133,0.07,82.0,1.0,0.0,0.0,51.0
9,49.0,3.0,0.2037,1.5,0.3054,0.15,73.0,0.0,0.0,1.0,89.0


### Let's see which one is more closer to the truth!

In [25]:
df = pd.read_excel("../@data/clinic_clean_use.xlsx", index_col='Sample ID')
df

Unnamed: 0_level_0,day,Age,Stage,Genome Altered,Histologic Grade,MSIMANTIS Score,MSIsensor Score,Mutation Count,New Neoplasm Event,Radiation Therapy,Sex,Weight,subtype
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
TCGA-2V-A95S-01,1/1/2020,,STAGE II,0.2041,G3,0.3003,0.0,85,,No,Male,78,
TCGA-2Y-A9GS-01,1/2/2020,58.0,STAGE IV,0.3002,G2,0.3208,0.05,65,Yes,No,Male,92,
TCGA-2Y-A9GT-01,1/1/2021,51.0,STAGE I,0.0622,G2,0.3119,0.02,84,Yes,No,Male,122,HBV-HCV
TCGA-2Y-A9GU-01,1/2/2021,55.0,STAGE I,0.3157,G2,0.3194,0.09,138,No,No,Female,78,
TCGA-2Y-A9GV-01,1/1/2022,54.0,STAGE I,0.0928,G1,0.2977,0.03,61,Yes,No,Female,85,
TCGA-2Y-A9GW-01,1/2/2022,64.0,STAGE I,0.1478,G2,0.3023,0.01,70,Yes,No,Male,139,
TCGA-2Y-A9GX-01,1/1/2023,68.0,STAGE I,0.0122,G2,0.3126,0.0,55,Yes,No,Male,104,
TCGA-2Y-A9GY-01,1/2/2023,64.0,STAGE II,0.4929,G3,0.3115,0.06,68,Yes,No,Female,92,
TCGA-2Y-A9GZ-01,1/1/2024,82.0,STAGE II,0.4572,G2,0.3133,0.07,82,Yes,No,Female,51,
TCGA-2Y-A9H0-01,1/2/2024,49.0,STAGE III,0.2037,G1,0.3054,0.15,73,No,No,Male,89,HBV-HCV


### Conclusion
From this case, we can see that KNNImputer may perform better because we are using a small dataset. Additionally, KNNImputer can correctly impute the information we previously converted from categorical to numeric, as it finds the most similar neighbors and averages the values. This prevents categorical information from being imputed as uninterpretable values. This highlights a weakness of IterativeImputer, which uses a regressor method and may impute categorical information as values that are harder to interpret.