In [None]:
import pandas as pd

from source.expert_knowledge import check_expert_knowledge
from source.statistical_analysis import check_statistical_analysis

In [None]:
# Load the real COPD dataset; the first CSV column becomes the frame's index.
data_real_path = "data/copd.csv"
data_real = pd.read_csv(filepath_or_buffer=data_real_path, index_col=0)

In [None]:
# Run the project's expert-knowledge checks on the real dataset and show the
# result as the cell output.
expert_knowledge_results = check_expert_knowledge(data_real)
expert_knowledge_results

In [None]:
# Run the project's statistical-analysis checks on the real dataset; the helper
# receives both the loaded frame and the path of the CSV it was read from.
statistical_analysis_results = check_statistical_analysis(data_real, data_real_path)
statistical_analysis_results

In [None]:
# Load the synthetic COPD dataset; the first CSV column becomes the frame's index.
data_synthetic_path = "data/copd_synthetic.csv"
data_synthetic = pd.read_csv(filepath_or_buffer=data_synthetic_path, index_col=0)

In [None]:
# Same expert-knowledge checks, now applied to the synthetic dataset.
# NOTE(review): this rebinds `expert_knowledge_results`, so the real-data result
# computed in the earlier cell is no longer reachable by name once this cell
# has run. Consider a dataset-specific name if both results are needed later.
expert_knowledge_results = check_expert_knowledge(data_synthetic)
expert_knowledge_results

In [None]:
# Same statistical-analysis checks, now applied to the synthetic dataset.
# NOTE(review): this rebinds `statistical_analysis_results`, so the real-data
# result computed in the earlier cell is no longer reachable by name once this
# cell has run. Consider a dataset-specific name if both results are needed later.
statistical_analysis_results = check_statistical_analysis(data_synthetic, data_synthetic_path)
statistical_analysis_results

In [1]:
import joblib

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the previously fitted model from disk (presumably a CatBoost classifier,
# judging by the file name -- confirm).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model artifacts that come from a trusted source.
model = joblib.load('../models/catboost.joblib')

In [3]:
def check_extra_target_col(data):
    """Remove the extra "copd" column from ``data`` when it is present.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        A new frame without the "copd" column if the input had one; otherwise
        the very same object that was passed in (no copy is made).
    """
    has_extra_target = "copd" in list(data.columns)
    return data.drop(columns=["copd"]) if has_extra_target else data

In [4]:
def transform_features(X_train, X_test):
    """Impute, one-hot encode and min-max scale the feature splits.

    A two-step pipeline is fitted on ``X_train`` only and then applied to
    ``X_test``:

    1. ``SimpleImputer(strategy="mean")`` over every column.
    2. A ``ColumnTransformer`` that one-hot encodes column position 13,
       min-max scales positions 0-12 and 15, and passes any remaining column
       (e.g. position 14) through unchanged.

    Parameters
    ----------
    X_train, X_test
        Feature tables exposing at least column positions 0-15.

    Returns
    -------
    tuple
        ``(X_train_proc, X_test_proc)`` as produced by the fitted pipeline.

    Notes
    -----
    NOTE(review): the mean imputer also runs on the categorical column (13)
    before it is one-hot encoded -- confirm that column never has missing
    values, otherwise the imputed mean becomes its own category.
    """
    # Column positions are hard-coded and refer to the imputer's output.
    categorical_idx = [13]
    numeric_idx = [*range(13), 15]

    preprocessing = Pipeline(
        steps=[
            ("imp", SimpleImputer(strategy="mean")),
            (
                "preproc",
                ColumnTransformer(
                    transformers=[
                        ("one_hot", OneHotEncoder(), categorical_idx),
                        ("scale", MinMaxScaler(), numeric_idx),
                    ],
                    remainder="passthrough",
                ),
            ),
        ]
    )

    # Fit on the training split only so no test-set statistics leak in.
    train_matrix = preprocessing.fit_transform(X_train)
    test_matrix = preprocessing.transform(X_test)
    return train_matrix, test_matrix

In [5]:
def transform_targets(y_train, y_test):
    """Integer-encode the class labels of both splits.

    The ``LabelEncoder`` is fitted on ``y_train`` only and then applied to
    ``y_test``.

    Parameters
    ----------
    y_train, y_test
        Label collections for the training and test split.

    Returns
    -------
    tuple
        ``(y_train_proc, y_test_proc, label_mapping)`` where ``label_mapping``
        maps every label seen in ``y_train`` to its encoded value.
    """
    encoder = LabelEncoder()
    train_codes = encoder.fit_transform(y_train)
    test_codes = encoder.transform(y_test)

    # Pair each known class with the code the fitted encoder assigns to it.
    known_labels = encoder.classes_
    label_mapping = {
        label: code
        for label, code in zip(known_labels, encoder.transform(known_labels))
    }
    return train_codes, test_codes, label_mapping

In [6]:
def preprocess_copd(data_path, test_size=0.3):
    """Read a COPD CSV and return model-ready train/test splits.

    Steps: read the CSV (first column as index), drop the extra "copd" column
    if present, separate the "COPDSEVERITY" target from the features, make a
    stratified split with a fixed seed, then transform features and targets.

    Parameters
    ----------
    data_path
        Location of the CSV file, passed to ``pandas.read_csv``.
    test_size : float, default 0.3
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train_proc, X_test_proc, y_train_proc, y_test_proc, label_mapping)``.
    """
    target_col = "COPDSEVERITY"
    frame = check_extra_target_col(pd.read_csv(data_path, index_col=0))

    features = frame.drop(columns=target_col)
    target = frame[target_col]

    # Fixed random_state keeps the split reproducible; stratify preserves the
    # class proportions of the target in both splits.
    split = train_test_split(
        features, target, test_size=test_size, random_state=42, stratify=target
    )
    X_train, X_test, y_train, y_test = split

    X_train_proc, X_test_proc = transform_features(X_train, X_test)
    y_train_proc, y_test_proc, label_mapping = transform_targets(y_train, y_test)

    return X_train_proc, X_test_proc, y_train_proc, y_test_proc, label_mapping

In [7]:
# Build train/test splits for the real and the synthetic data with the same
# 30 % hold-out.
# NOTE(review): each call fits a fresh imputer/encoder/scaler on that dataset's
# own training split -- confirm this matches the preprocessing the saved model
# was trained with.
(
    X_train_real,
    X_test_real,
    y_train_real,
    y_test_real,
    label_mapping_real,
) = preprocess_copd("../data/copd.csv", test_size=0.3)
(
    X_train_synthetic,
    X_test_synthetic,
    y_train_synthetic,
    y_test_synthetic,
    label_mapping_synthetic,
) = preprocess_copd("../data/copd_synthetic.csv", test_size=0.3)

In [8]:
# Predict on both held-out splits with the loaded model; each prediction array
# is reshaped into a single column.
y_test_real_pred = model.predict(X_test_real).reshape(-1, 1)
y_test_synthetic_pred = model.predict(X_test_synthetic).reshape(-1, 1)

In [9]:
from sklearn.metrics import classification_report

In [10]:
# Per-class precision/recall/F1 of the loaded model on the real test split,
# returned as a nested dict.
# zero_division=0 keeps the value already reported for classes that are never
# predicted (0.0) but states it explicitly, so scikit-learn does not emit
# UndefinedMetricWarning.
cls_report_real = classification_report(
    y_test_real, y_test_real_pred, output_dict=True, zero_division=0
)

In [11]:
# Per-class precision/recall/F1 of the loaded model on the synthetic test
# split, returned as a nested dict.
# zero_division=0 keeps the value already reported for classes that are never
# predicted (0.0) but states it explicitly, so scikit-learn does not emit
# UndefinedMetricWarning.
cls_report_synthetic = classification_report(
    y_test_synthetic, y_test_synthetic_pred, output_dict=True, zero_division=0
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
# Re-key the per-class entries of the real report from the encoded class id
# ("0", "1", ...) to the original label name.
# The membership guard makes the cell safe to re-run: after the first pass the
# numeric keys are gone and an unguarded pop() would raise KeyError. It also
# skips any class that does not appear in the report.
for key, value in label_mapping_real.items():
    if str(value) in cls_report_real:
        cls_report_real[key] = cls_report_real.pop(str(value))

In [13]:
# Re-key the per-class entries of the synthetic report from the encoded class
# id ("0", "1", ...) to the original label name.
# The membership guard makes the cell safe to re-run: after the first pass the
# numeric keys are gone and an unguarded pop() would raise KeyError. It also
# skips any class that does not appear in the report.
for key, value in label_mapping_synthetic.items():
    if str(value) in cls_report_synthetic:
        cls_report_synthetic[key] = cls_report_synthetic.pop(str(value))

In [14]:
# Collect both classification reports under one dict, keyed by data origin.
adversarial_evaluation_results = {'real': cls_report_real, 'synthetic': cls_report_synthetic}

In [15]:
# Display the combined real-vs-synthetic classification reports as the cell output.
adversarial_evaluation_results

{'real': {'accuracy': 0.8709677419354839,
  'macro avg': {'precision': 0.675,
   'recall': 0.71875,
   'f1-score': 0.6944444444444444,
   'support': 31.0},
  'weighted avg': {'precision': 0.8258064516129032,
   'recall': 0.8709677419354839,
   'f1-score': 0.8458781362007168,
   'support': 31.0},
  'MILD': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 7.0},
  'MODERATE': {'precision': 1.0,
   'recall': 1.0,
   'f1-score': 1.0,
   'support': 13.0},
  'SEVERE': {'precision': 0.7,
   'recall': 0.875,
   'f1-score': 0.7777777777777778,
   'support': 8.0},
  'VERY SEVERE': {'precision': 0.0,
   'recall': 0.0,
   'f1-score': 0.0,
   'support': 3.0}},
 'synthetic': {'accuracy': 0.5806451612903226,
  'macro avg': {'precision': 0.5281862745098039,
   'recall': 0.5096153846153846,
   'f1-score': 0.47974358974358977,
   'support': 31.0},
  'weighted avg': {'precision': 0.5273561037318153,
   'recall': 0.5806451612903226,
   'f1-score': 0.521852770885029,
   'support': 31.0},
  'MIL