In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.model_selection import RepeatedKFold



from sklearn.metrics import mean_absolute_error, accuracy_score

from constants import CATEGORICAL_ATTRIBUTES, CONTINUOUS_ATTRIBUTES, IGF, PROLACTIN, VITAMINE_D, PCO


In [2]:
dataset_file_path = '../data/preprocessed_dataset.csv'
df = pd.read_csv(dataset_file_path)
df.sample(5)

Unnamed: 0,"PCO 0-healthy control, 1-PCOS, 2-FHA 3-POF, 4-High Andro",IGF-1 ng/ml (N: 100-311),proBNP,"AMH (ng/ml) *7,14=pmol/l",weight,height (cm),BMI,systolic BP (ciśnienie skurczowe),diastolic BP (ciśnienie rozskurczowe),Hypertension,...,HbA1c %,vitamin 25-OH D ng/ml,Androstendione ng/ml,17-OH-progesterone ng/ml,Dihydrotestosterone pg/ml (N<368),Testosterone/DHT,T/A (testosterone/androstendione),month of birth,quarter of the year,age
338,1,455.0,41.43,6.243697,79.0,162.0,30.102119,126.0,71.0,0.0,...,5.2,13.0,3.04,1.78,303.0,0.005611,0.559211,3.0,1.0,19.0
7,0,,,2.4,70.0,166.0,25.402816,115.0,75.0,0.0,...,,,2.71,1.57,184.0,0.004185,0.284133,4.0,2.0,27.0
462,1,177.0,20.99,21.540616,54.0,168.0,19.132653,105.0,60.0,0.0,...,4.7,32.0,4.68,3.77,420.0,0.007619,0.683761,1.0,1.0,21.0
293,1,204.0,,,67.0,161.0,25.847768,126.0,87.0,0.0,...,,,3.32,1.16,268.0,0.006231,0.503012,,,
567,1,350.0,81.09,2.97,67.0,171.0,22.913033,125.0,67.0,0.0,...,5.0,14.0,4.14,3.75,549.0,0.003279,0.434783,4.0,2.0,20.0


In [66]:
import numpy as np

def introduce_missingness(df, attributes, missing_rate=0.1):
    """
    Randomly introduces missing values into specified attributes of the dataframe.
    :param df: pandas DataFrame.
    :param attributes: List of column names where missing values should be introduced.
    :param missing_rate: Fraction of values to be made missing in each specified column.
    :return: DataFrame with missing values introduced.
    """
    df_missing = df.copy()
    for col in attributes:
        missing_indices = np.random.choice(df_missing.index, int(len(df_missing) * missing_rate), replace=False)
        df_missing.loc[missing_indices, col] = np.nan
    return df_missing


def summarize_imputers_categorical(scores_summary_df):
    #  MAximize accuracy
    scores_df = scores_summary_df.drop('Attribute', axis=1)

    # Use the idxmax() function to find the column name of the max value in each row, then count occurrences
    max_scores_count = scores_df.idxmax(axis=1).value_counts()
    max_scores_count_df = max_scores_count.reset_index()
    max_scores_count_df.columns = ['Imputer', 'Highest Score Count']
    print(max_scores_count_df)


def summarize_imputers_continuous(scores_summary_df):
    # Minimize MAE
    scores_df = scores_summary_df.drop('Attribute', axis=1)

    min_scores_count = scores_df.idxmin(axis=1).value_counts()
    min_scores_count_df = min_scores_count.reset_index()
    min_scores_count_df.columns = ['Imputer', 'Lowest Score Count']
    print(min_scores_count_df)


def reshape_results_df(results_df):
    agg_results_df = results_df.groupby(['Attribute', 'Imputer']).agg({'Score':'mean'}).reset_index()
    pivot_df = agg_results_df.pivot(index='Attribute', columns='Imputer', values='Score')
    pivot_df.reset_index(inplace=True)
    pivot_df.rename_axis(None, axis=1, inplace=True)

    sum_score = pivot_df.sum(numeric_only=True)
    print("Sum of each imputer:")
    print(sum_score)

    return pivot_df


In [72]:
continuous_preprocessings = {
    # 'SS_KNN_8': Pipeline([('scaler', StandardScaler()), ('imputer', KNNImputer(n_neighbors=8))]),
    # 'SS_iterative': Pipeline([('scaler', StandardScaler()), ('imputer', IterativeImputer(max_iter=20, initial_strategy='median'))]),
    # 'PT_KNN_8': Pipeline([('scaler', PowerTransformer()), ('imputer', KNNImputer(n_neighbors=8))]),
    'MM_KNN_7': Pipeline([('scaler', MinMaxScaler()), ('imputer', KNNImputer(n_neighbors=7))]),
    # 'MM_KNN_8': Pipeline([('scaler', MinMaxScaler()), ('imputer', KNNImputer(n_neighbors=8))]),
    # 'MM_KNN_9': Pipeline([('scaler', MinMaxScaler()), ('imputer', KNNImputer(n_neighbors=9))]),

    # 'MM_iterative': Pipeline([('scaler', MinMaxScaler()), ('imputer', IterativeImputer(max_iter=40, initial_strategy='median'))]),
    # 'MM_simple': Pipeline([('scaler', MinMaxScaler()), ('imputer', SimpleImputer(strategy='median'))]),
}

categorical_preprocessings = {
    # 'iterative_knn_imputer_10_less_it': Pipeline([('imputer', IterativeImputer(estimator=KNeighborsClassifier(n_neighbors=10), max_iter=10, initial_strategy='most_frequent'))]),
    'iterative_knn_imputer_10': Pipeline([('imputer', IterativeImputer(estimator=KNeighborsClassifier(n_neighbors=10), max_iter=40, initial_strategy='most_frequent'))]),
    # 'impute_knn_1': Pipeline([('imputer', KNNImputer(n_neighbors=1))]),
    # 'simple_imputer': Pipeline([('imputer', SimpleImputer(strategy='most_frequent'))]),
}


df_missing = introduce_missingness(df, CONTINUOUS_ATTRIBUTES + CATEGORICAL_ATTRIBUTES, missing_rate=0.1)

results_cat = []
results_cont = []

for cont_name, cont_imputer in continuous_preprocessings.items():
    for cat_name, cat_imputer in categorical_preprocessings.items():
        imputer = ColumnTransformer(
            verbose_feature_names_out=False,
            transformers=[
                ('cont_imputer', cont_imputer, CONTINUOUS_ATTRIBUTES),
                ('cat_imputer', cat_imputer, CATEGORICAL_ATTRIBUTES)
            ])

        imputer.set_output(transform='pandas')
        imputed_df = imputer.fit_transform(df_missing)

        for attr in CONTINUOUS_ATTRIBUTES:
            original = df[attr].dropna()
            imputed = imputed_df[attr].loc[df[attr].notna()]
            mae = mean_absolute_error(original, imputed)
            results_cont.append({'Attribute': attr, 'Imputer': cont_name, 'Score': mae})

        for attr in CATEGORICAL_ATTRIBUTES:
            accuracy = accuracy_score(df[attr].dropna(), imputed_df[attr].loc[df[attr].notna()])
            results_cat.append({'Attribute': attr, 'Imputer': cat_name, 'Score': accuracy})


results_cat_df = pd.DataFrame(results_cat)
results_cont_df = pd.DataFrame(results_cont)


print('\nFor categorical')
pivot_df_cat = reshape_results_df(results_cat_df)
summarize_imputers_categorical(pivot_df_cat)

print('\nFor continous')
pivot_df_cont = reshape_results_df(results_cont_df)
summarize_imputers_continuous(pivot_df_cont)




For categorical
Sum of each imputer:
iterative_knn_imputer_10    22.412901
dtype: float64
                    Imputer  Highest Score Count
0  iterative_knn_imputer_10                   23

For continous
Sum of each imputer:
MM_KNN_7    5945.480916
MM_KNN_8    5945.480194
MM_KNN_9    5945.479313
dtype: float64
    Imputer  Lowest Score Count
0  MM_KNN_7                  50
1  MM_KNN_9                  36
2  MM_KNN_8                  28


