In [1]:
from __init__ import DATASET_PATH

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import RobustScaler




from sklearn.metrics import mean_absolute_error, accuracy_score

from main.constants import CATEGORICAL_ATTRIBUTES, CONTINUOUS_ATTRIBUTES, IGF, PROLACTIN, VITAMINE_D, PCO


In [2]:
dataset_file_path = DATASET_PATH
df = pd.read_csv(dataset_file_path)
df.sample(5)

Unnamed: 0,"PCO 0-healthy control, 1-PCOS, 2-FHA 3-POF, 4-High Andro",IGF-1 ng/ml (N: 100-311),proBNP,"AMH (ng/ml) *7,14=pmol/l",weight,height (cm),BMI,systolic BP (ciśnienie skurczowe),diastolic BP (ciśnienie rozskurczowe),Hypertension,...,FTI (free testosterone index),ACTH pg/ml,HbA1c %,vitamin 25-OH D ng/ml,Androstendione ng/ml,17-OH-progesterone ng/ml,Dihydrotestosterone pg/ml (N<368),Testosterone/DHT,T/A (testosterone/androstendione),age
55,0,143.0,21.82,,97.0,167.0,34.780738,146.0,99.0,1.0,...,4.88,14.4,5.8,6.0,2.07,1.69,340.0,0.002941,0.483092,36.0
348,1,252.0,,,54.0,162.0,20.576132,103.0,72.0,0.0,...,1.82,31.46,,,3.78,3.02,318.0,0.005975,0.502646,21.0
651,1,159.0,,,95.0,165.0,34.894399,116.0,95.0,0.0,...,7.48,31.55,5.4,,2.56,1.74,852.0,0.001995,0.664062,29.0
865,2,174.0,53.96,4.067227,103.5,150.0,46.0,120.0,72.0,0.0,...,4.34,33.66,4.8,16.0,2.06,1.71,2500.0,0.00044,0.533981,24.0
954,4,,,,54.0,165.0,19.834711,120.0,80.0,0.0,...,,17.1,,,2.3,1.18,448.0,0.002098,0.408696,


In [3]:
import numpy as np

def introduce_missingness(df, attributes, missing_rate=0.1):
    """
    Randomly introduces missing values into specified attributes of the dataframe.
    :param df: pandas DataFrame.
    :param attributes: List of column names where missing values should be introduced.
    :param missing_rate: Fraction of values to be made missing in each specified column.
    :return: DataFrame with missing values introduced.
    """
    df_missing = df.copy()
    for col in attributes:
        missing_indices = np.random.choice(df_missing.index, int(len(df_missing) * missing_rate), replace=False)
        df_missing.loc[missing_indices, col] = np.nan
    return df_missing


def summarize_imputers_categorical(scores_summary_df):
    #  MAximize accuracy
    scores_df = scores_summary_df.drop('Attribute', axis=1)

    # Use the idxmax() function to find the column name of the max value in each row, then count occurrences
    max_scores_count = scores_df.idxmax(axis=1).value_counts()
    max_scores_count_df = max_scores_count.reset_index()
    max_scores_count_df.columns = ['Imputer', 'Highest Score Count']
    print(max_scores_count_df)


def summarize_imputers_continuous(scores_summary_df):
    # Minimize MAE
    scores_df = scores_summary_df.drop('Attribute', axis=1)

    min_scores_count = scores_df.idxmin(axis=1).value_counts()
    min_scores_count_df = min_scores_count.reset_index()
    min_scores_count_df.columns = ['Imputer', 'Lowest Score Count']
    print(min_scores_count_df)


def reshape_results_df(results_df):
    agg_results_df = results_df.groupby(['Attribute', 'Imputer']).agg({'Score':'mean'}).reset_index()
    pivot_df = agg_results_df.pivot(index='Attribute', columns='Imputer', values='Score')
    pivot_df.reset_index(inplace=True)
    pivot_df.rename_axis(None, axis=1, inplace=True)

    sum_score = pivot_df.sum(numeric_only=True)
    print("Sum of each imputer:")
    print(sum_score)

    return pivot_df


In [98]:
from sklearn.neighbors import KNeighborsRegressor


continuous_preprocessings = {
    'MM_KNN_5': Pipeline([('scaler', MinMaxScaler()), ('imputer', KNNImputer(n_neighbors=5))]),
    'MM_KNN_7': Pipeline([('scaler', MinMaxScaler()), ('imputer', KNNImputer(n_neighbors=7))]),
    'MM_KNN_10': Pipeline([('scaler', MinMaxScaler()), ('imputer', KNNImputer(n_neighbors=10))]),



    'MM_iterative': Pipeline([('scaler', MinMaxScaler()), ('imputer', IterativeImputer(max_iter=10, initial_strategy='median'))]),
    'MM_iterative_KNN_7': Pipeline([('scaler', MinMaxScaler()), ('imputer', IterativeImputer(KNeighborsRegressor(n_neighbors=7), max_iter=10, initial_strategy='median'))]),
    'MM_iterative_KNN_5': Pipeline([('scaler', MinMaxScaler()), ('imputer', IterativeImputer(KNeighborsRegressor(n_neighbors=5), max_iter=10, initial_strategy='median'))]),
    'MM_iterative_KNN_10': Pipeline([('scaler', MinMaxScaler()), ('imputer', IterativeImputer(KNeighborsRegressor(n_neighbors=10), max_iter=10, initial_strategy='median'))]),


}

categorical_preprocessings = {
    'iterative_knn_imputer_5': Pipeline([('imputer', IterativeImputer(estimator=KNeighborsClassifier(n_neighbors=5), max_iter=10, initial_strategy='most_frequent'))]),
    'iterative_knn_imputer_7': Pipeline([('imputer', IterativeImputer(estimator=KNeighborsClassifier(n_neighbors=7), max_iter=10, initial_strategy='most_frequent'))]),
    'iterative_knn_imputer_10': Pipeline([('imputer', IterativeImputer(estimator=KNeighborsClassifier(n_neighbors=10), max_iter=10, initial_strategy='most_frequent'))]),
    'impute_knn_1': Pipeline([('imputer', KNNImputer(n_neighbors=1))]),
    'simple_imputer': Pipeline([('imputer', SimpleImputer(strategy='most_frequent'))]),
}


categorical_without_pco = CATEGORICAL_ATTRIBUTES.copy()
categorical_without_pco.remove(PCO)

df_missing = introduce_missingness(df, CONTINUOUS_ATTRIBUTES + categorical_without_pco, missing_rate=0.1)

results_cat = []
results_cont = []

for cont_name, cont_imputer in continuous_preprocessings.items():
    for cat_name, cat_imputer in categorical_preprocessings.items():
        imputer = ColumnTransformer(
            verbose_feature_names_out=False,
            transformers=[
                ('cont_imputer', cont_imputer, CONTINUOUS_ATTRIBUTES),
                ('cat_imputer', cat_imputer, CATEGORICAL_ATTRIBUTES)
            ])

        imputer.set_output(transform='pandas')
        imputed_df = imputer.fit_transform(df_missing)


        imputed_cont_data = imputed_df[CONTINUOUS_ATTRIBUTES]

        imputed_cat_data = imputed_df[CATEGORICAL_ATTRIBUTES]
        # Przekształcanie danych ciągłych z powrotem do oryginalnej skali
        original_scaled_cont_data = imputer.named_transformers_['cont_imputer'].named_steps['scaler'].inverse_transform(imputed_cont_data)

        imputed_scaled_up_cont_data = pd.DataFrame(original_scaled_cont_data, columns=CONTINUOUS_ATTRIBUTES)

        final_imputed_df = pd.concat([imputed_scaled_up_cont_data, imputed_cat_data], axis=1)

        for attr in CONTINUOUS_ATTRIBUTES:
            indices_to_keep = df[attr].notna() & df_missing[attr].isna()

            original = df[attr].loc[indices_to_keep]
            imputed = final_imputed_df[attr].loc[indices_to_keep]

            mae = mean_absolute_error(original, imputed)
            results_cont.append({'Attribute': attr, 'Imputer': cont_name, 'Score': mae})

        for attr in CATEGORICAL_ATTRIBUTES:
            indices_to_keep = df[attr].notna() & df_missing[attr].isna()
            original = df[attr].loc[indices_to_keep]
            imputed = final_imputed_df[attr].loc[indices_to_keep]
            accuracy = accuracy_score(original, imputed)
            results_cat.append({'Attribute': attr, 'Imputer': cat_name, 'Score': accuracy})


results_cat_df = pd.DataFrame(results_cat)
results_cont_df = pd.DataFrame(results_cont)

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dty

In [99]:

def reshape_results_categorical_df(results_df):
    agg_results_df = results_df.groupby(['Attribute', 'Imputer']).agg({'Score':'mean'}).reset_index()
    pivot_df = agg_results_df.pivot(index='Attribute', columns='Imputer', values='Score')
    pivot_df.reset_index(inplace=True)
    pivot_df.rename_axis(None, axis=1, inplace=True)

    avg_score = pd.DataFrame(pivot_df.median(numeric_only=True), columns=['median accuracy'])

    return avg_score, pivot_df


print('\nFor categorical')
avg_score, pivot_df_cat = reshape_results_categorical_df(results_cat_df)
avg_score.sort_values(by='median accuracy', ascending=False)
# pivot_df_cat
# summarize_imputers_categorical(pivot_df_cat)


For categorical


Unnamed: 0,median accuracy
iterative_knn_imputer_10,0.807982
iterative_knn_imputer_7,0.801491
iterative_knn_imputer_5,0.781494
impute_knn_1,0.732569
simple_imputer,0.714021


In [100]:
def reshape_results_df(results_df):
    agg_results_df = results_df.groupby(['Attribute', 'Imputer']).agg({'Score':'mean'}).reset_index()
    pivot_df = agg_results_df.pivot(index='Attribute', columns='Imputer', values='Score')
    pivot_df.reset_index(inplace=True)
    pivot_df.rename_axis(None, axis=1, inplace=True)

    sum_score = pd.DataFrame(pivot_df.sum(numeric_only=True), columns=['sum mae'])
    # because for each imputer each action was repeated 5 times for each categorical imputer
    sum_score /= len(categorical_preprocessings)
    # print("Sum of each imputer:")
    # print(sum_score)

    return sum_score, pivot_df

print('\nFor continous')
sum_score, pivot_df_cont = reshape_results_df(results_cont_df)
sum_score.sort_values(by='sum mae', ascending=True)
# summarize_imputers_continuous(pivot_df_cont)


For continous


Unnamed: 0,sum mae
MM_iterative_KNN_10,297.94687
MM_iterative_KNN_7,299.758874
MM_KNN_10,301.925375
MM_iterative,305.817059
MM_iterative_KNN_5,307.033188
MM_KNN_7,307.535969
MM_KNN_5,318.897142


In [101]:
pivot_df_cont

Unnamed: 0,Attribute,MM_KNN_10,MM_KNN_5,MM_KNN_7,MM_iterative,MM_iterative_KNN_10,MM_iterative_KNN_5,MM_iterative_KNN_7
0,% lymphocytes,2.926034,2.944584,3.052295,0.906577,3.075169,3.584112,3.292247
1,% monocytes,1.369659,1.356591,1.383279,0.463075,1.394295,1.373727,1.405097
2,% neutrophil,3.840870,3.991522,3.862422,1.160987,4.316522,3.925000,4.069876
3,%basophils,0.231420,0.230795,0.240666,0.220896,0.249489,0.252273,0.252029
4,%eosinocytes,1.069773,1.154318,1.088961,0.260375,1.152875,1.172727,1.128782
...,...,...,...,...,...,...,...,...
105,systolic BP (ciśnienie skurczowe),8.782716,8.854321,8.804233,7.385299,8.630864,8.824691,8.673721
106,testosterone nmol/l,0.359352,0.363864,0.365552,0.011288,0.355989,0.318864,0.341591
107,thyroid volume,3.533332,3.659724,3.600419,0.654714,3.614768,3.751767,3.694358
108,vitamin 25-OH D ng/ml,8.429412,8.435294,8.126050,7.629751,8.009804,7.952941,8.190476
