# DLMI - Lymphocytosis classification
## Linears models

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings

from sklearn import (
    linear_model,
    metrics,
    model_selection,
    svm,
)

warnings.filterwarnings(action="ignore")
plt.style.use("seaborn-whitegrid")
np.random.seed(42)

In [2]:
data_dir = "../../data/dlmi-lymphocytosis-classification/"
compute_age = lambda x: 2024 - int(x.replace("-", "/").split("/")[-1])
gender_to_int = lambda x: 1 if x.lower() == "m" else 0

In [3]:
trainset_data_df = pd.read_csv(data_dir + "trainset/trainset_true.csv")
trainset_data_df["AGE"] = trainset_data_df["DOB"].apply(compute_age)
trainset_data_df["GENDER"] = trainset_data_df["GENDER"].apply(gender_to_int)
trainset_data_df.head(2)

Unnamed: 0,ID,LABEL,GENDER,DOB,LYMPH_COUNT,AGE
0,P26,1,1,11/3/1933,11.2,91
1,P183,1,1,5/15/1942,12.8,82


In [4]:
testset_data_df = pd.read_csv(data_dir + "testset/testset_data.csv")
testset_data_df["AGE"] = testset_data_df["DOB"].apply(compute_age)
testset_data_df["GENDER"] = testset_data_df["GENDER"].apply(gender_to_int)
testset_data_df.head(2)

Unnamed: 0,ID,LABEL,GENDER,DOB,LYMPH_COUNT,AGE
0,P71,-1,1,1/17/1946,5.76,78
1,P16,-1,1,3/5/1940,32.0,84


In [5]:
selected_columns = ["GENDER", "LYMPH_COUNT", "AGE"]
X_train_val = trainset_data_df[selected_columns].to_numpy(dtype=np.float32)
X_test = testset_data_df[selected_columns].to_numpy(dtype=np.float32)
y_train_val = trainset_data_df["LABEL"].to_numpy(dtype=int)
X_train_val.shape, X_test.shape

((163, 3), (42, 3))

In [6]:
X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train_val, y_train_val, train_size=0.75)
np.unique(y_train, return_counts=True), np.unique(y_val, return_counts=True)

((array([0, 1]), array([41, 81])), (array([0, 1]), array([ 9, 32])))

In [7]:
def fit_evaluate(clf):
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_train)
    train_acc = metrics.accuracy_score(y_train, y_pred)
    train_balanced_acc = metrics.balanced_accuracy_score(y_train, y_pred)
    train_f1_score = metrics.f1_score(y_train, y_pred)
    print(f"Train : acc = {train_acc:.3f} bal. acc = {train_balanced_acc:.3f} f1 = {train_f1_score:3f}")

    y_pred = clf.predict(X_val)
    val_acc = metrics.accuracy_score(y_val, y_pred)
    val_balanced_acc = metrics.balanced_accuracy_score(y_val, y_pred)
    val_f1_score = metrics.f1_score(y_val, y_pred)
    print(f"Val : acc = {val_acc:.3f} bal. acc = {val_balanced_acc:.3f} f1 = {val_f1_score:3f}")

    return {
        "train": {"acc": train_acc, "bacc": train_balanced_acc, "f1": train_f1_score},
        "val": {"acc": val_acc, "bacc": val_balanced_acc, "f1": val_f1_score},
    }

### Logistic Regression

In [8]:
lr_params = {
    "C": [1e-6, 1e-3, 1e-2, 1e-1, 1, 10, 100, 100],
    "solver": {
        "lbfgs": ["l2", None], 
        "liblinear": ["l2", "l1"], 
        "newton-cg": ["l2", None],
        "sag": ["l2", None], 
        "saga": ["elasticnet", "l1", "l2", None]
    },
    "l1_ratio": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
}

In [9]:
best_params = {}
best_val_balanced_acc = 0

for C in lr_params["C"]:
    for solver in lr_params["solver"]:
        for penalty in lr_params["solver"][solver]:
            for l1_ratio in lr_params["l1_ratio"]:
                params = dict(C=C, solver=solver, penalty=penalty, l1_ratio=l1_ratio)
                lr = linear_model.LogisticRegression(
                    class_weight="balanced", **params
                )
                print(params)
                scores = fit_evaluate(lr)
                if scores["val"]["bacc"] > best_val_balanced_acc:
                    best_val_balanced_acc = scores["val"]["bacc"] 
                    best_params = params
                print("-" * 80)
                if penalty != "elasticnet":
                    break
            

{'C': 1e-06, 'solver': 'lbfgs', 'penalty': 'l2', 'l1_ratio': 1e-06}
Train : acc = 0.713 bal. acc = 0.784 f1 = 0.724409
Val : acc = 0.512 bal. acc = 0.688 f1 = 0.545455
--------------------------------------------------------------------------------
{'C': 1e-06, 'solver': 'lbfgs', 'penalty': None, 'l1_ratio': 1e-06}
Train : acc = 0.852 bal. acc = 0.871 f1 = 0.880000
Val : acc = 0.829 bal. acc = 0.891 f1 = 0.877193
--------------------------------------------------------------------------------
{'C': 1e-06, 'solver': 'liblinear', 'penalty': 'l2', 'l1_ratio': 1e-06}
Train : acc = 0.664 bal. acc = 0.500 f1 = 0.798030
Val : acc = 0.780 bal. acc = 0.500 f1 = 0.876712
--------------------------------------------------------------------------------
{'C': 1e-06, 'solver': 'liblinear', 'penalty': 'l1', 'l1_ratio': 1e-06}
Train : acc = 0.336 bal. acc = 0.500 f1 = 0.000000
Val : acc = 0.220 bal. acc = 0.500 f1 = 0.000000
-----------------------------------------------------------------------------

In [10]:
print("Best params:", best_params)
print("Best balanced acc:", best_val_balanced_acc)

Best params: {'C': 1e-06, 'solver': 'lbfgs', 'penalty': None, 'l1_ratio': 1e-06}
Best balanced acc: 0.890625


In [11]:
clf = linear_model.LogisticRegression(class_weight="balanced", **best_params)
clf.fit(X_train_val, y_train_val)
y_pred = clf.predict(X_train_val)
acc = metrics.accuracy_score(y_train_val, y_pred)
balanced_acc = metrics.balanced_accuracy_score(y_train_val, y_pred)
print(f"Train : acc = {acc:.2f} bal. acc = {balanced_acc:.2f}")

Train : acc = 0.85 bal. acc = 0.86


In [12]:
y_test_pred = clf.predict(X_test)
submission_df = testset_data_df[["ID"]]
submission_df["LABEL"] = y_test_pred
submission_df = submission_df.rename({"ID": "Id", "LABEL": "Predicted"}, axis=1)
submission_df.head(3)

Unnamed: 0,Id,Predicted
0,P71,0
1,P16,1
2,P114,1


In [13]:
submission_df.to_csv("../../submissions/logistic_regression_2.csv", index=False)

### SVM

In [14]:
svm_params = {
    "C": [1e-6, 1e-3, 1e-2, 1e-1, 1, 10, 100, 100],
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "degree": [2, 3],
    "gamma": ["scale", "auto"],
}

In [15]:
best_params = {}
best_val_balanced_acc = 0

for C in svm_params["C"]:
    for kernel in svm_params["kernel"]:
        for gamma in svm_params["gamma"]:
            for degree in svm_params["degree"]:
                params = dict(C=C, kernel=kernel, gamma=gamma, degree=degree)
                svc = svm.SVC(
                    class_weight="balanced", **params
                )
                print(params)
                scores = fit_evaluate(svc)
                if scores["val"]["bacc"] > best_val_balanced_acc:
                    best_val_balanced_acc = scores["val"]["bacc"] 
                    best_params = params
                print("-" * 80)
                if kernel != "poly":
                    break
            

{'C': 1e-06, 'kernel': 'linear', 'gamma': 'scale', 'degree': 2}
Train : acc = 0.336 bal. acc = 0.500 f1 = 0.000000
Val : acc = 0.220 bal. acc = 0.500 f1 = 0.000000
--------------------------------------------------------------------------------
{'C': 1e-06, 'kernel': 'linear', 'gamma': 'auto', 'degree': 2}
Train : acc = 0.336 bal. acc = 0.500 f1 = 0.000000
Val : acc = 0.220 bal. acc = 0.500 f1 = 0.000000
--------------------------------------------------------------------------------
{'C': 1e-06, 'kernel': 'poly', 'gamma': 'scale', 'degree': 2}
Train : acc = 0.336 bal. acc = 0.500 f1 = 0.000000
Val : acc = 0.220 bal. acc = 0.500 f1 = 0.000000
--------------------------------------------------------------------------------
{'C': 1e-06, 'kernel': 'poly', 'gamma': 'scale', 'degree': 3}
Train : acc = 0.336 bal. acc = 0.500 f1 = 0.000000
Val : acc = 0.220 bal. acc = 0.500 f1 = 0.000000
--------------------------------------------------------------------------------
{'C': 1e-06, 'kernel': 'p

In [16]:
print("Best params:", best_params)
print("Best balanced acc:", best_val_balanced_acc)

Best params: {'C': 1, 'kernel': 'rbf', 'gamma': 'scale', 'degree': 2}
Best balanced acc: 0.890625


In [17]:
clf = svm.SVC(class_weight="balanced", **best_params)
clf.fit(X_train_val, y_train_val)
y_pred = clf.predict(X_train_val)
acc = metrics.accuracy_score(y_train_val, y_pred)
balanced_acc = metrics.balanced_accuracy_score(y_train_val, y_pred)
print(f"Train : acc = {acc:.2f} bal. acc = {balanced_acc:.2f}")

Train : acc = 0.83 bal. acc = 0.80


In [18]:
y_test_pred = clf.predict(X_test)
submission_df = testset_data_df[["ID"]]
submission_df["LABEL"] = y_test_pred
submission_df = submission_df.rename({"ID": "Id", "LABEL": "Predicted"}, axis=1)
submission_df.head(3)

Unnamed: 0,Id,Predicted
0,P71,1
1,P16,1
2,P114,1


In [19]:
submission_df.to_csv("../../submissions/svm.csv", index=False)