In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import RobustScaler




from sklearn.metrics import mean_absolute_error, accuracy_score

from constants import CATEGORICAL_ATTRIBUTES, CONTINUOUS_ATTRIBUTES, IGF, PROLACTIN, VITAMINE_D, PCO


In [2]:
dataset_file_path = '../data/preprocessed_dataset.csv'
df = pd.read_csv(dataset_file_path)
df.sample(5)

Unnamed: 0,"PCO 0-healthy control, 1-PCOS, 2-FHA 3-POF, 4-High Andro",IGF-1 ng/ml (N: 100-311),proBNP,"AMH (ng/ml) *7,14=pmol/l",weight,height (cm),BMI,systolic BP (ciśnienie skurczowe),diastolic BP (ciśnienie rozskurczowe),Hypertension,...,FTI (free testosterone index),ACTH pg/ml,HbA1c %,vitamin 25-OH D ng/ml,Androstendione ng/ml,17-OH-progesterone ng/ml,Dihydrotestosterone pg/ml (N<368),Testosterone/DHT,T/A (testosterone/androstendione),age
318,1,,40.07,12.638655,67.0,166.0,24.314124,108.0,71.0,0.0,...,1.85,14.9,5.5,39.0,2.94,1.98,285.0,0.002807,0.272109,24.0
604,1,,,,90.0,165.0,33.057851,110.0,65.0,0.0,...,8.95,57.1,,17.0,6.95,2.87,637.0,0.004553,0.417266,
798,2,210.0,,,50.0,151.0,21.928863,,,,...,1.57,18.28,,,2.18,1.7,298.0,0.00302,0.412844,
387,1,,,,48.0,166.0,17.419074,123.0,87.0,0.0,...,4.55,33.9,,,5.27,6.47,349.0,0.00404,0.267552,19.0
718,1,,51.51,1.204482,49.0,159.0,19.382145,,,,...,,,,,,,,,,29.0


In [3]:
import numpy as np

def introduce_missingness(df, attributes, missing_rate=0.1):
    """
    Randomly introduces missing values into specified attributes of the dataframe.
    :param df: pandas DataFrame.
    :param attributes: List of column names where missing values should be introduced.
    :param missing_rate: Fraction of values to be made missing in each specified column.
    :return: DataFrame with missing values introduced.
    """
    df_missing = df.copy()
    for col in attributes:
        missing_indices = np.random.choice(df_missing.index, int(len(df_missing) * missing_rate), replace=False)
        df_missing.loc[missing_indices, col] = np.nan
    return df_missing


def summarize_imputers_categorical(scores_summary_df):
    #  MAximize accuracy
    scores_df = scores_summary_df.drop('Attribute', axis=1)

    # Use the idxmax() function to find the column name of the max value in each row, then count occurrences
    max_scores_count = scores_df.idxmax(axis=1).value_counts()
    max_scores_count_df = max_scores_count.reset_index()
    max_scores_count_df.columns = ['Imputer', 'Highest Score Count']
    print(max_scores_count_df)


def summarize_imputers_continuous(scores_summary_df):
    # Minimize MAE
    scores_df = scores_summary_df.drop('Attribute', axis=1)

    min_scores_count = scores_df.idxmin(axis=1).value_counts()
    min_scores_count_df = min_scores_count.reset_index()
    min_scores_count_df.columns = ['Imputer', 'Lowest Score Count']
    print(min_scores_count_df)


def reshape_results_df(results_df):
    agg_results_df = results_df.groupby(['Attribute', 'Imputer']).agg({'Score':'mean'}).reset_index()
    pivot_df = agg_results_df.pivot(index='Attribute', columns='Imputer', values='Score')
    pivot_df.reset_index(inplace=True)
    pivot_df.rename_axis(None, axis=1, inplace=True)

    sum_score = pivot_df.sum(numeric_only=True)
    print("Sum of each imputer:")
    print(sum_score)

    return pivot_df


In [7]:
continuous_preprocessings = {
    'SS_KNN_8': Pipeline([('scaler', StandardScaler()), ('imputer', KNNImputer(n_neighbors=7))]),
    'PT_KNN_8': Pipeline([('scaler', PowerTransformer()), ('imputer', KNNImputer(n_neighbors=7))]),
    'MM_KNN_7': Pipeline([('scaler', MinMaxScaler()), ('imputer', KNNImputer(n_neighbors=7))]),
    'RS_KNN_7': Pipeline([('scaler', RobustScaler()), ('imputer', KNNImputer(n_neighbors=7))]),


    # 'MM_iterative': Pipeline([('scaler', MinMaxScaler()), ('imputer', IterativeImputer(max_iter=30, initial_strategy='median'))]),
    # 'SS_iterative': Pipeline([('scaler', StandardScaler()), ('imputer', IterativeImputer(max_iter=30, initial_strategy='median'))]),
    # 'PT_iterative': Pipeline([('scaler', PowerTransformer()), ('imputer', IterativeImputer(max_iter=30, initial_strategy='median'))]),
    # 'RS_iterative': Pipeline([('scaler', RobustScaler()), ('imputer', IterativeImputer(max_iter=30, initial_strategy='median'))]),

}

categorical_preprocessings = {
    # 'iterative_knn_imputer_10_less_it': Pipeline([('imputer', IterativeImputer(estimator=KNeighborsClassifier(n_neighbors=10), max_iter=10, initial_strategy='most_frequent'))]),
    'iterative_knn_imputer_10': Pipeline([('imputer', IterativeImputer(estimator=KNeighborsClassifier(n_neighbors=10), max_iter=40, initial_strategy='most_frequent'))]),
    # 'impute_knn_1': Pipeline([('imputer', KNNImputer(n_neighbors=1))]),
    # 'simple_imputer': Pipeline([('imputer', SimpleImputer(strategy='most_frequent'))]),
}


categorical_without_pco = CATEGORICAL_ATTRIBUTES.copy()
categorical_without_pco.remove(PCO)

df_missing = introduce_missingness(df, CONTINUOUS_ATTRIBUTES + categorical_without_pco, missing_rate=0.1)

results_cat = []
results_cont = []

for cont_name, cont_imputer in continuous_preprocessings.items():
    for cat_name, cat_imputer in categorical_preprocessings.items():
        imputer = ColumnTransformer(
            verbose_feature_names_out=False,
            transformers=[
                ('cont_imputer', cont_imputer, CONTINUOUS_ATTRIBUTES),
                ('cat_imputer', cat_imputer, CATEGORICAL_ATTRIBUTES)
            ])

        imputer.set_output(transform='pandas')
        imputed_df = imputer.fit_transform(df_missing)

        for attr in CONTINUOUS_ATTRIBUTES:
            original = df[attr].dropna()
            imputed = imputed_df[attr].loc[df[attr].notna()]
            mae = mean_absolute_error(original, imputed)
            results_cont.append({'Attribute': attr, 'Imputer': cont_name, 'Score': mae})

        for attr in CATEGORICAL_ATTRIBUTES:
            accuracy = accuracy_score(df[attr].dropna(), imputed_df[attr].loc[df[attr].notna()])
            results_cat.append({'Attribute': attr, 'Imputer': cat_name, 'Score': accuracy})


results_cat_df = pd.DataFrame(results_cat)
results_cont_df = pd.DataFrame(results_cont)


print('\nFor categorical')
pivot_df_cat = reshape_results_df(results_cat_df)
summarize_imputers_categorical(pivot_df_cat)

print('\nFor continous')
pivot_df_cont = reshape_results_df(results_cont_df)
summarize_imputers_continuous(pivot_df_cont)




For categorical
Sum of each imputer:
iterative_knn_imputer_10    22.513258
dtype: float64
                    Imputer  Highest Score Count
0  iterative_knn_imputer_10                   23

For continous
Sum of each imputer:
MM_KNN_7    5522.431867
PT_KNN_8    5558.136547
RS_KNN_7    5533.945942
SS_KNN_8    5557.644382
dtype: float64
    Imputer  Lowest Score Count
0  MM_KNN_7                  77
1  RS_KNN_7                  33
