# DLMI - Lymphocytosis classification
## Decisions Trees and Random Forest

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings

from sklearn import (
    tree,
    ensemble,
    metrics,
    model_selection
)

warnings.filterwarnings(action="ignore")
plt.style.use("seaborn-whitegrid")
np.random.seed(42)

In [2]:
data_dir = "../../data/dlmi-lymphocytosis-classification/"
compute_age = lambda x: 2024 - int(x.replace("-", "/").split("/")[-1])
gender_to_int = lambda x: 1 if x.lower() == "m" else 0

In [3]:
trainset_data_df = pd.read_csv(data_dir + "trainset/trainset_true.csv")
trainset_data_df["AGE"] = trainset_data_df["DOB"].apply(compute_age)
trainset_data_df["GENDER"] = trainset_data_df["GENDER"].apply(gender_to_int)
trainset_data_df.head(2)

Unnamed: 0,ID,LABEL,GENDER,DOB,LYMPH_COUNT,AGE
0,P26,1,1,11/3/1933,11.2,91
1,P183,1,1,5/15/1942,12.8,82


In [4]:
testset_data_df = pd.read_csv(data_dir + "testset/testset_data.csv")
testset_data_df["AGE"] = testset_data_df["DOB"].apply(compute_age)
testset_data_df["GENDER"] = testset_data_df["GENDER"].apply(gender_to_int)
testset_data_df.head(2)

Unnamed: 0,ID,LABEL,GENDER,DOB,LYMPH_COUNT,AGE
0,P71,-1,1,1/17/1946,5.76,78
1,P16,-1,1,3/5/1940,32.0,84


In [5]:
selected_columns = ["GENDER", "LYMPH_COUNT", "AGE"]
X_train_val = trainset_data_df[selected_columns].to_numpy(dtype=np.float32)
X_test = testset_data_df[selected_columns].to_numpy(dtype=np.float32)
y_train_val = trainset_data_df["LABEL"].to_numpy(dtype=int)
X_train_val.shape, X_test.shape

((163, 3), (42, 3))

In [6]:
X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train_val, y_train_val, train_size=0.75)
np.unique(y_train, return_counts=True), np.unique(y_val, return_counts=True)

((array([0, 1]), array([41, 81])), (array([0, 1]), array([ 9, 32])))

In [7]:
def fit_evaluate(clf):
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_train)
    train_acc = metrics.accuracy_score(y_train, y_pred)
    train_balanced_acc = metrics.balanced_accuracy_score(y_train, y_pred)
    train_f1_score = metrics.f1_score(y_train, y_pred)
    print(f"Train : acc = {train_acc:.3f} bal. acc = {train_balanced_acc:.3f} f1 = {train_f1_score:3f}")

    y_pred = clf.predict(X_val)
    val_acc = metrics.accuracy_score(y_val, y_pred)
    val_balanced_acc = metrics.balanced_accuracy_score(y_val, y_pred)
    val_f1_score = metrics.f1_score(y_val, y_pred)
    print(f"Val : acc = {val_acc:.3f} bal. acc = {val_balanced_acc:.3f} f1 = {val_f1_score:3f}")

    return {
        "train": {"acc": train_acc, "bacc": train_balanced_acc, "f1": train_f1_score},
        "val": {"acc": val_acc, "bacc": val_balanced_acc, "f1": val_f1_score},
    }

### Decision Tree

In [8]:
dt_params = {
    "criterion": ["gini", "entropy", "log_loss"],
    "splitter": ["best", "random"]
}

In [9]:
best_params = {}
best_val_balanced_acc = 0

for criterion in dt_params["criterion"]:
    for splitter in dt_params["splitter"]:
        params = dict(criterion=criterion, splitter=splitter)
        dt = tree.DecisionTreeClassifier(
            class_weight="balanced", **params
        )
        print(params)
        scores = fit_evaluate(dt)
        if scores["val"]["bacc"] > best_val_balanced_acc:
            best_val_balanced_acc = scores["val"]["bacc"] 
            best_params = params
        print("-" * 80)

            

{'criterion': 'gini', 'splitter': 'best'}
Train : acc = 1.000 bal. acc = 1.000 f1 = 1.000000
Val : acc = 0.829 bal. acc = 0.771 f1 = 0.888889
--------------------------------------------------------------------------------
{'criterion': 'gini', 'splitter': 'random'}
Train : acc = 1.000 bal. acc = 1.000 f1 = 1.000000
Val : acc = 0.805 bal. acc = 0.715 f1 = 0.875000
--------------------------------------------------------------------------------
{'criterion': 'entropy', 'splitter': 'best'}
Train : acc = 1.000 bal. acc = 1.000 f1 = 1.000000
Val : acc = 0.780 bal. acc = 0.780 f1 = 0.847458
--------------------------------------------------------------------------------
{'criterion': 'entropy', 'splitter': 'random'}
Train : acc = 1.000 bal. acc = 1.000 f1 = 1.000000
Val : acc = 0.805 bal. acc = 0.715 f1 = 0.875000
--------------------------------------------------------------------------------
{'criterion': 'log_loss', 'splitter': 'best'}
Train : acc = 1.000 bal. acc = 1.000 f1 = 1.000000
V

In [10]:
print("Best params:", best_params)
print("Best balanced acc:", best_val_balanced_acc)

Best params: {'criterion': 'log_loss', 'splitter': 'best'}
Best balanced acc: 0.7951388888888888


In [11]:
clf = tree.DecisionTreeClassifier(class_weight="balanced", **best_params)
clf.fit(X_train_val, y_train_val)
y_pred = clf.predict(X_train_val)
acc = metrics.accuracy_score(y_train_val, y_pred)
balanced_acc = metrics.balanced_accuracy_score(y_train_val, y_pred)
print(f"Train : acc = {acc:.2f} bal. acc = {balanced_acc:.2f}")

Train : acc = 1.00 bal. acc = 1.00


In [12]:
y_test_pred = clf.predict(X_test)
submission_df = testset_data_df[["ID"]]
submission_df["LABEL"] = y_test_pred
submission_df = submission_df.rename({"ID": "Id", "LABEL": "Predicted"}, axis=1)
submission_df.head(3)

Unnamed: 0,Id,Predicted
0,P71,0
1,P16,1
2,P114,1


In [13]:
submission_df.to_csv("../../submissions/decision_tree.csv", index=False)

### Random Forest

In [14]:
rf_params = {
    "criterion": ["gini", "entropy", "log_loss"],
    "n_estimators": [5, 10, 20, 30, 50, 100, 200, 500, 1000]
}

In [15]:
best_params = {}
best_val_balanced_acc = 0

for criterion in rf_params["criterion"]:
    for n_estimators in rf_params["n_estimators"]:
        params = dict(criterion=criterion, n_estimators=n_estimators)
        rf = ensemble.RandomForestClassifier(
            class_weight="balanced", **params
        )
        print(params)
        scores = fit_evaluate(rf)
        if scores["val"]["bacc"] > best_val_balanced_acc:
            best_val_balanced_acc = scores["val"]["bacc"] 
            best_params = params
        print("-" * 80)
            

{'criterion': 'gini', 'n_estimators': 5}
Train : acc = 0.984 bal. acc = 0.982 f1 = 0.987654
Val : acc = 0.829 bal. acc = 0.811 f1 = 0.885246
--------------------------------------------------------------------------------
{'criterion': 'gini', 'n_estimators': 10}
Train : acc = 0.967 bal. acc = 0.969 f1 = 0.975000
Val : acc = 0.780 bal. acc = 0.700 f1 = 0.857143
--------------------------------------------------------------------------------
{'criterion': 'gini', 'n_estimators': 20}
Train : acc = 1.000 bal. acc = 1.000 f1 = 1.000000
Val : acc = 0.878 bal. acc = 0.882 f1 = 0.918033
--------------------------------------------------------------------------------
{'criterion': 'gini', 'n_estimators': 30}
Train : acc = 1.000 bal. acc = 1.000 f1 = 1.000000
Val : acc = 0.829 bal. acc = 0.771 f1 = 0.888889
--------------------------------------------------------------------------------
{'criterion': 'gini', 'n_estimators': 50}
Train : acc = 1.000 bal. acc = 1.000 f1 = 1.000000
Val : acc = 0.85

In [16]:
print("Best params:", best_params)
print("Best balanced acc:", best_val_balanced_acc)

Best params: {'criterion': 'gini', 'n_estimators': 20}
Best balanced acc: 0.8819444444444444


In [17]:
clf = ensemble.RandomForestClassifier(class_weight="balanced", **best_params)
clf.fit(X_train_val, y_train_val)
y_pred = clf.predict(X_train_val)
acc = metrics.accuracy_score(y_train_val, y_pred)
balanced_acc = metrics.balanced_accuracy_score(y_train_val, y_pred)
print(f"Train : acc = {acc:.2f} bal. acc = {balanced_acc:.2f}")

Train : acc = 1.00 bal. acc = 1.00


In [18]:
y_test_pred = clf.predict(X_test)
submission_df = testset_data_df[["ID"]]
submission_df["LABEL"] = y_test_pred
submission_df = submission_df.rename({"ID": "Id", "LABEL": "Predicted"}, axis=1)
submission_df.head(3)

Unnamed: 0,Id,Predicted
0,P71,0
1,P16,1
2,P114,1


In [19]:
submission_df.to_csv("../../submissions/random_forest.csv", index=False)