### Imports

In [61]:
import polars as pl
import pandas as pd
from scipy import stats
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
)
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

### Constants

In [64]:
# Seed handed to train_test_split so the train/test partition is repeatable.
RANDOM_STATE = 1
# Directory that receives the saved train/test splits at the end of the notebook.
FILES_DIR = "../data"
# Create the directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(FILES_DIR, exist_ok=True)

### Read data

In [9]:
# Load the source workbook from FILES_DIR so the data location is configured in
# one place only (the path literal used to be repeated here).
# header=1 takes the column names from the second row of the sheet; the first
# column is read as the index and then discarded by reset_index(drop=True).
df = pd.read_excel(
    os.path.join(FILES_DIR, "БЛОК ГЕН ФИНАЛ.xlsx"),
    index_col=0,
    header=1,
).reset_index(drop=True)

In [10]:
# Maps the workbook's original (Russian) column headers to the English names used
# in the rest of the notebook. Only the columns listed here are kept downstream.
column_mapping = {
    "Возраст": "Age",
    "Индекс коморбидности по Charlson": "Comorbidity Index",
    "Поражение СТВОЛА ЛКА": "Left Main Coronary Artery Lesion",
    "Поражение коронарных артерий": "Coronary Artery Lesion",
    # NOTE(review): presumably this key mirrors the sheet header verbatim, spelling
    # included — check the workbook before "correcting" it.
    "P2Y12-интибитор": "P2Y12 inhibitor",
    # Target column: "combined endpoint".
    "Комбинированная конечная точка": "combined",
    "Покрытие стентов BMS-1 / DES-2": "BMS or DES",
    # NOTE(review): the source header reads "Any CYP2C19" while the English name
    # says "All Negative" — confirm which meaning is intended.
    "Любой CYP2C19": "All Negative CYP2C19",
}

In [11]:
# Keep only the mapped columns, then switch them to their English names.
selected_columns = list(column_mapping)
df = df[selected_columns].rename(columns=column_mapping)

In [12]:
# Number of missing values in each column.
df.isnull().sum()

Age                                  0
Comorbidity Index                    0
Left Main Coronary Artery Lesion     2
Coronary Artery Lesion               1
P2Y12 inhibitor                      0
combined                             0
BMS or DES                          11
All Negative CYP2C19                 0
dtype: int64

In [16]:
# List the columns of df that belong to the `categorical` group, in df's column order.
# NOTE(review): `categorical` appears to be assigned only in a later cell, so on a
# fresh kernel (Restart & Run All) this cell would raise NameError — confirm the
# intended cell order.
[col for col in df.columns if col in categorical]

['Comorbidity Index',
 'Left Main Coronary Artery Lesion',
 'Coronary Artery Lesion']

In [17]:
# Feature groups by variable type; sorted() keeps every list in alphabetical order.
numerical = sorted(["Age"])
binary = sorted(["BMS or DES", "All Negative CYP2C19", "P2Y12 inhibitor"])
categorical = sorted(
    ["Comorbidity Index", "Left Main Coronary Artery Lesion", "Coronary Artery Lesion"]
)
# The per-allele columns (CYP2C19_2, CYP2C19_3, CYP2C19_17) are currently not listed.

In [22]:
# Reorder the columns alphabetically.
df = df[sorted(df.columns)]
# Column(s) removed from the feature matrix and used as the target.
not_use = ["combined"]
features = df.drop(columns=not_use).copy()
target = df[not_use].copy()
# Hold out 25% of the rows; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.25, random_state=RANDOM_STATE
)

### Analyze outliers

In [31]:
def detect_numerical_outliers(df, cols, z_thresh=3):
    """Flag numerical outliers by absolute z-score.

    Missing values are replaced by the column median before the z-scores are
    computed. The replacement happens on a temporary Series, so ``df`` itself
    is never modified.

    Args:
        df (pandas.DataFrame): Data to inspect; left untouched.
        cols (iterable of str): Names of the numerical columns to check.
        z_thresh (float): A value is flagged when ``|z| > z_thresh``.

    Returns:
        pandas.DataFrame: Boolean frame sharing ``df``'s index, with one column
        per checked column. Columns in which half or more of the values are
        missing are skipped and do not appear in the result.
    """
    # Start from df's index so the flags stay aligned with the input rows.
    outliers = pd.DataFrame(index=df.index)
    for col in cols:
        values = df[col]
        # Skip columns where the missing share is not below 50% (an empty column
        # yields NaN here and is skipped as well).
        if not values.isnull().mean() < 0.5:
            continue
        filled = values.fillna(values.median())
        z_scores = np.abs(stats.zscore(filled))
        outliers[col] = z_scores > z_thresh
    return outliers

In [32]:
def detect_categorical_outliers(df, cols, threshold=0.01):
    """Flag rows whose category is rare within its column.

    Args:
        df (pandas.DataFrame): Data to inspect.
        cols (iterable): Column names to check, one at a time.
        threshold (float): A category is rare when its relative frequency in
            the column is strictly below this value.

    Returns:
        pandas.DataFrame: One boolean column per entry of ``cols``; True marks
        rows holding a rare category.
    """
    flags = pd.DataFrame()
    for name in cols:
        column = df[name]
        shares = column.value_counts(normalize=True)
        rare_values = shares.loc[shares < threshold].index
        flags[name] = column.isin(rare_values)
    return flags

In [36]:
# Run both detectors on copies of df.
numerical_outliers = detect_numerical_outliers(df.copy(), numerical)
# Pass a flat list of column names. Wrapping the list in another list made the
# detector treat all columns as one combined key instead of checking each
# column's categories separately.
categorical_outliers = detect_categorical_outliers(df.copy(), categorical + binary)

# Combine the outlier information
outliers = pd.concat([numerical_outliers, categorical_outliers], axis=1)

In [37]:
# Show the flagged values of every categorical column that has any.
for name in categorical:
    flagged = df.loc[categorical_outliers[name], name]
    if not flagged.empty:
        print(flagged)

### Imputation

In [39]:
# Every column named in a feature group must exist in X_test; print the ones
# that do not and stop the notebook if there are any.
absent_columns = [
    name for name in numerical + binary + categorical if name not in X_test.columns
]
for name in absent_columns:
    print(name)
if absent_columns:
    raise Exception("Not assigned column type")

In [40]:
# Mark missing categorical/binary values with the placeholder -1 and cast to int,
# applying the same step to the train and the test features.
discrete_cols = sorted(categorical + binary)
for frame in (X_train, X_test):
    frame.loc[:, discrete_cols] = frame[discrete_cols].fillna(-1).astype(int)

In [41]:
# Cast the categorical/binary columns of both feature frames to int.
int_cols = sorted(categorical + binary)
for frame in (X_train, X_test):
    frame.loc[:, int_cols] = frame.loc[:, int_cols].astype(int)

In [47]:
def mice_impute_data(
    data_imputers_train,
    data_imputers_test,
    numerical=numerical,
    categorical=categorical,
    binary=binary,
):
    """Fill missing values in a train/test pair with two IterativeImputer passes.

    Pass 1 uses a LinearRegression estimator with the imputer's default missing
    marker and writes back only the ``numerical`` columns. Pass 2 uses a
    LogisticRegression estimator with ``-1`` as the missing marker and writes
    back the ``categorical`` + ``binary`` columns. Each imputer is fitted on the
    training frame and then applied to the test frame.

    Both frames are modified in place, so pass copies to keep the originals.

    Args:
        data_imputers_train (pandas.DataFrame): Training features; the imputers
            are fitted on this frame.
        data_imputers_test (pandas.DataFrame): Test features; only transformed.
        numerical (list of str): Names of the numerical columns.
        categorical (list of str): Names of the categorical columns.
        binary (list of str): Names of the binary columns.

    Returns:
        tuple: ``(data_imputers_train, data_imputers_test)`` after imputation.
    """
    discrete_cols = sorted(categorical + binary)

    # Positions of each column group inside the arrays returned by the imputers
    # (looked up in the training frame's column order).
    train_columns = data_imputers_train.columns
    numeric_positions = [train_columns.get_loc(name) for name in numerical]
    discrete_positions = [train_columns.get_loc(name) for name in discrete_cols]

    # Pass 1: numerical columns, linear-regression estimator.
    linear_imputer = IterativeImputer(
        estimator=LinearRegression(), max_iter=100, skip_complete=True
    )
    train_pass1 = linear_imputer.fit_transform(data_imputers_train)
    data_imputers_train.loc[:, numerical] = train_pass1[:, numeric_positions]
    test_pass1 = linear_imputer.transform(data_imputers_test)
    data_imputers_test.loc[:, numerical] = test_pass1[:, numeric_positions]

    # Pass 2: categorical and binary columns, logistic-regression estimator;
    # -1 is the placeholder that marks a missing value.
    logistic_imputer = IterativeImputer(
        estimator=LogisticRegression(solver="liblinear"),
        max_iter=100,
        skip_complete=True,
        missing_values=-1,
    )
    train_pass2 = logistic_imputer.fit_transform(data_imputers_train)
    data_imputers_train.loc[:, discrete_cols] = train_pass2[:, discrete_positions]
    test_pass2 = logistic_imputer.transform(data_imputers_test)
    data_imputers_test.loc[:, discrete_cols] = test_pass2[:, discrete_positions]

    return data_imputers_train, data_imputers_test


# Impute copies of the splits, then report how many NaN values remain in the
# imputed training frame.
imputed_train, imputed_test = mice_impute_data(X_train.copy(), X_test.copy())
remaining_nan = imputed_train.isna().sum().sum()
print(f"Total number of nan values: {remaining_nan}")

Total number of nan values: 0


In [48]:
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from scipy.stats import chi2_contingency


def check_distribution_shift(df1, df2, numerical=None, alpha=0.01):
    """
    Check for distribution shift between two dataframes using the two-sample
    Kolmogorov-Smirnov test on every column the frames have in common.

    Args:
        df1 (pandas.DataFrame): First dataframe.
        df2 (pandas.DataFrame): Second dataframe.
        numerical: Unused; accepted only so existing calls keep working.
        alpha (float): Significance level. A shift is reported for a variable
            when the test's p-value is strictly below ``alpha``.

    Returns:
        dict: Variable names (in ``df1``'s column order) mapped to True when a
        distribution shift is detected and False otherwise.
    """
    # Walk df1's columns instead of a set so the result order is reproducible.
    shared_columns = [column for column in df1.columns if column in df2.columns]

    shift_results = {}
    for variable in shared_columns:
        _, p_value = ks_2samp(df1[variable].values, df2[variable].values)
        # bool() keeps plain Python booleans in the result.
        shift_results[variable] = bool(p_value < alpha)

    return shift_results


# Compare the imputed training features with the current X_train and report
# every variable for which a shift was flagged.
shift_results = check_distribution_shift(imputed_train.dropna(), X_train)

shifted_variables = [name for name, flagged in shift_results.items() if flagged]
for name in shifted_variables:
    print(f"Distribution shift detected for {name}: {shift_results[name]}")

if not shifted_variables:
    print("Distribution shift not detected.")

Distribution shift not detected.


In [56]:
# Put both feature frames in the same alphabetical column order (taken from the
# training frame). .copy() makes them independent frames, so the column
# assignments performed on them afterwards do not trigger pandas'
# SettingWithCopyWarning.
feature_order = sorted(imputed_train.columns)
X_train = imputed_train[feature_order].copy()
X_test = imputed_test[feature_order].copy()
# Reduce the single-column target frames to Series.
y_train = y_train["combined"]
y_test = y_test["combined"]

In [57]:
# Binary and categorical columns, alphabetically ordered; cast them to int in
# both feature frames.
cat_features = sorted(binary + categorical)
for frame in (X_train, X_test):
    frame[cat_features] = frame[cat_features].astype(int)

In [67]:
# Display the list of binary + categorical column names.
cat_features

['All Negative CYP2C19',
 'BMS or DES',
 'Comorbidity Index',
 'Coronary Artery Lesion',
 'Left Main Coronary Artery Lesion',
 'P2Y12 inhibitor']

In [58]:
# Sum of all cells equal to the -1 placeholder in X_train; 0 means none are left.
X_train.where(X_train == -1).sum().sum()

0.0

In [59]:
# Sum of all cells equal to the -1 placeholder in X_test; 0 means none are left.
X_test.where(X_test == -1).sum().sum()

0.0

### Save data

In [66]:
# Persist the train/test splits (features and targets) as one joblib file in FILES_DIR.
file_name = "xy_train_test.joblib"
file_path = os.path.join(FILES_DIR, file_name)
splits = {
    "X_test": X_test,
    "X_train": X_train,
    "y_train": y_train,
    "y_test": y_test,
}
joblib.dump(splits, file_path)

['../data/xy_train_test.joblib']