In [None]:
import os

import env_apps

import pandas as pd

from constants import *
from src.data.datasets import Mask, TableDataset
from src.data.processing.sampling import extract_masks

In [None]:
# FEATURES
# Each Feature is bound to one column of the learning table. AGE and PSA are
# built without a transform argument.
AGE = Feature(column="AGE")

# Two alternative encodings of the same CLINICAL_STAGE column:
#   - a binary one that only knows the "T1-T2" and "T3a" labels,
#   - a finer-grained one (named MSKCC) spreading the stage labels over [0, 1].
CLINICAL_STAGE = Feature(
    column="CLINICAL_STAGE",
    transform=MappingEncoding({"T1-T2": 0, "T3a": 1}),
)
CLINICAL_STAGE_MSKCC = Feature(
    column="CLINICAL_STAGE",
    transform=MappingEncoding(
        {
            "T1c": 0,
            "T2": 0.2,
            "T2a": 0.2,  # same value as "T2"
            "T2b": 0.4,
            "T2c": 0.6,
            "T3": 0.8,
            "T3a": 0.8,  # same value as "T3"
            "T3b": 1,
        }
    ),
)

# The three Gleason features only differ by the column they read.
GLEASON_GLOBAL, GLEASON_PRIMARY, GLEASON_SECONDARY = (
    Feature(column=column_name)
    for column_name in ("GLEASON_GLOBAL", "GLEASON_PRIMARY", "GLEASON_SECONDARY")
)
PSA = Feature(column="PSA")

# Feature groups handed to the dataset constructor.
CONTINUOUS_FEATURES = [AGE, PSA]
CATEGORICAL_FEATURES = [
    CLINICAL_STAGE,
    GLEASON_GLOBAL,
    GLEASON_PRIMARY,
    GLEASON_SECONDARY,
]
# Same as CATEGORICAL_FEATURES, with the clinical stage swapped for its MSKCC encoding.
MSKCC_CATEGORICAL_FEATURES = [CLINICAL_STAGE_MSKCC, *CATEGORICAL_FEATURES[1:]]


In [None]:
# Read the learning table from disk. LEARNING_TABLE_PATH, ID and TABLE_TASKS are
# not defined in this notebook; presumably they come from the `constants` star-import.
learning_df = pd.read_csv(filepath_or_buffer=LEARNING_TABLE_PATH)

# Wrap the raw table together with the feature groups declared in the previous cell.
table_dataset = TableDataset(
    cat_features=CATEGORICAL_FEATURES,
    cont_features=CONTINUOUS_FEATURES,
    df=learning_df,
    ids_col=ID,
    tasks=TABLE_TASKS,
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, auc
from sklearn.metrics import balanced_accuracy_score

# Columns of the imputed table used as model inputs, and the column used as target.
FEATURE_COLUMNS = ["AGE", "CLINICAL_STAGE", "GLEASON_GLOBAL", "GLEASON_PRIMARY", "GLEASON_SECONDARY", "PSA"]
TARGET_COLUMN = "PN"


def _build_classifiers():
    """Return a fresh, unfitted instance of every baseline classifier, keyed by display name."""
    return {
        "LogisticRegression": LogisticRegression(random_state=0),
        "MLPClassifier": MLPClassifier(random_state=0, max_iter=400),
        "RandomForestClassifier": RandomForestClassifier(random_state=0),
    }


def _evaluate_classifier(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Parameters
    ----------
    clf
        Unfitted estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train
        Training inputs and labels.
    X_test, y_test
        Held-out inputs and labels.

    Returns
    -------
    tuple of (float, float)
        ROC AUC computed from the probability of the second class
        (``predict_proba(...)[:, 1]``), and balanced accuracy computed from
        the hard predictions.
    """
    clf.fit(X_train, y_train)
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    y_pred = clf.predict(X_test)
    return roc_auc_score(y_test, y_pred_proba), balanced_accuracy_score(y_test, y_pred)


masks = extract_masks(os.path.join(MASKS_PATH, "masks.json"), k=5, l=5)

results = []
for k, v in masks.items():
    train_mask, valid_mask, test_mask = v[Mask.TRAIN], v[Mask.VALID], v[Mask.TEST]
    table_dataset.update_masks(train_mask=train_mask, valid_mask=valid_mask, test_mask=test_mask)
    dataframe = table_dataset.imputed_df.copy()

    # The baselines have no use for a separate validation split, so it is merged
    # into the training rows. Masks are converted to lists first so that `+`
    # always concatenates positions (it would add element-wise on numpy arrays).
    fit_indices = list(train_mask) + ([] if valid_mask is None else list(valid_mask))
    train_dataframe = dataframe.iloc[fit_indices]
    test_dataframe = dataframe.iloc[test_mask]

    X_train = train_dataframe[FEATURE_COLUMNS].to_numpy()
    y_train = train_dataframe[TARGET_COLUMN].to_numpy().ravel()
    X_test = test_dataframe[FEATURE_COLUMNS].to_numpy()
    y_test = test_dataframe[TARGET_COLUMN].to_numpy().ravel()

    # New estimator instances are created for every fold so no fitted state is reused.
    for model_name, clf in _build_classifiers().items():
        roc_auc, balanced_accuracy = _evaluate_classifier(clf, X_train, y_train, X_test, y_test)
        print(f"[fold {k}] {model_name}: ROC AUC = {roc_auc:.4f}, balanced accuracy = {balanced_accuracy:.4f}")
        results.append(
            {"fold": k, "model": model_name, "roc_auc": roc_auc, "balanced_accuracy": balanced_accuracy}
        )

# One row per (fold, model); the cell output is the per-model mean over folds.
results_df = pd.DataFrame(results)
results_df.groupby("model")[["roc_auc", "balanced_accuracy"]].mean()
