In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score

# Single seed for the whole notebook: it seeds NumPy's global RNG here and is
# passed as `random_state` to the data splits and the classifier below.
seed = 1234
np.random.seed(seed)

In [22]:
# Load the variant table. `chrom` is forced to str so that numeric chromosome
# labels are never parsed as integers (pandas may infer types per chunk on
# large files); integer labels would miss the string-keyed lookup in
# `chrom_mapping` below and silently become NaN.
df = pd.read_csv('dataset/clinvar_variants_all.csv', dtype={"chrom": str})

# Represent the "ref" and "alt" columns as a single "mutation" column, then
# one-hot encode it. The ">" separator keeps multi-base alleles unambiguous:
# without it, ref="A", alt="TG" and ref="AT", alt="G" would both become "ATG"
# and share one dummy column.
df['mutation'] = df['ref'] + ">" + df['alt']
df = pd.get_dummies(df, columns=["mutation"], drop_first=False)

# Map chromosome labels to numbers: "1"–"22" keep their value, "X" -> 23 and
# "Y" -> 24. Any label not listed here is mapped to NaN by `Series.map`.
chrom_mapping = {
    **{str(i): i for i in range(1, 23)},  # "1"–"22"
    "X": 23,
    "Y": 24
}
df["chrom"] = df["chrom"].map(chrom_mapping)

# Features: chromosome number, start position, and every one-hot mutation
# column produced by `get_dummies` above (they all share the "mutation_" prefix).
mutation_cols = [c for c in df.columns if c.startswith("mutation_")]
feature_cols = ["chrom", "start_pos"] + mutation_cols

X = df[feature_cols]
Y = df["clinical_significance"]

# Preview the encoded frame. This must be the last expression in the cell,
# otherwise the notebook does not display it.
df.head()


In [26]:
# Both splits use the same settings: hold out 15% of the rows passed in,
# shuffling with the notebook-wide seed so the partition is reproducible.
split_options = {"test_size": 0.15, "random_state": seed, "shuffle": True}

# Stage 1: set aside the final test set (15% of all rows).
X_pre, Xte, Y_pre, Yte = train_test_split(X, Y, **split_options)

# Stage 2: split the remaining 85% into training and validation sets
# (validation is 15% of the remainder, i.e. about 12.75% of all rows).
Xtr, Xva, Ytr, Yva = train_test_split(X_pre, Y_pre, **split_options)

## Basic Decision Tree

A simple model that only predicts the two majority classes, "benign" and "likely benign", would be **97% accurate**. Here we seek to train a decision tree to be more than 97% accurate by varying values for `criterion`, `max_depth`, `min_samples_leaf`, and `class_weight`.

We will report the macro-averaged F1 score alongside accuracy to measure how well our models avoid false negatives (which lower recall) and false positives (which lower precision).

In [31]:
# Baseline: a decision tree with scikit-learn's default hyperparameters,
# seeded for reproducibility and fitted on the training split only.
DT = DecisionTreeClassifier(random_state=seed).fit(Xtr, Ytr)

# Predictions on the data the tree was fitted on and on the held-out test set.
ytr_pred, yte_pred = DT.predict(Xtr), DT.predict(Xte)

# Report accuracy and macro-averaged F1 for each split, separated by a blank line.
report_rows = (("Train", Ytr, ytr_pred), ("Test", Yte, yte_pred))
for position, (split_name, truth, guess) in enumerate(report_rows):
    if position:
        print()
    print(f"{split_name} Accuracy:", accuracy_score(truth, guess))
    print(f"{split_name} F1 Score:", f1_score(truth, guess, average="macro"))

Train Accuracy: 0.997773411918795
Train F1 Score: 0.9988421432130744

Test Accuracy: 0.7364438839848676
Test F1 Score: 0.6063313692480359


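scikit-learn's default decision tree overfits badly: it scores about 99.8% accuracy on the training set but only about 73.6% accuracy and a macro F1 score of about 0.61 on the test set. Let's see if we can do better!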

## Training on Max Depth

## Training on Min Samples Leaf

## Varying Class Weight

## Conclusion