# Baseline: Majority Class Classifier

In [1]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Single seed for the notebook; it is also passed as `random_state` to
# train_test_split further down so the split is reproducible.
seed = 1234
# Seed NumPy's legacy global random number generator.
np.random.seed(seed)

In [2]:
## Preprocessing: load the annotated variant table and turn it into a
## numeric feature matrix X and a binary target Y.

# Chromosome names -> integers: "1".."22" keep their number, "X" -> 23, "Y" -> 24.
# Any value not listed here becomes NaN when passed through Series.map.
autosomes = range(1, 23)
chrom_mapping = {str(n): n for n in autosomes}
chrom_mapping.update(X=23, Y=24)

# Target encoding: Pathogenic is the positive class (1), Benign is 0.
label_mapping = {'Pathogenic': 1, 'Benign': 0}

# Columns dropped before modelling. 'ref' and 'alt' are only needed to build
# the combined 'mutation' column, which is one-hot encoded below.
drop_cols = ['Gene', 'Entry', 'FTId', 'AA_change', 'Category',
             'dbSNP', 'Disease', 'rs_dbSNP', 'ref', 'alt']

df = (
    pd.read_csv('data/humsavar_dbnsfp53_complete.csv')
    # Concatenate reference and alternate values into one column (e.g. "AG") ...
    .assign(mutation=lambda frame: frame['ref'] + frame['alt'])
    # ... and one-hot encode it into mutation_* indicator columns.
    .pipe(pd.get_dummies, columns=["mutation"], drop_first=False)
    # Replace chromosome names and label strings with their integer codes.
    .assign(
        chr=lambda frame: frame["chr"].map(chrom_mapping),
        Label=lambda frame: frame['Label'].map(label_mapping),
    )
    # One-hot encode the categorical SIFT / PolyPhen-2 prediction columns.
    .pipe(pd.get_dummies, columns=['SIFT_pred', 'Polyphen2_HDIV_pred'], drop_first=False)
    .drop(columns=drop_cols)
)

# Separate the target from the features.
Y = df["Label"]
X = df.drop(columns=["Label"])

df.head()

Unnamed: 0,Label,chr,pos,SIFT_score,Polyphen2_HDIV_score,CADD_raw,CADD_phred,REVEL_score,mutation_AC,mutation_AG,...,mutation_GC,mutation_GT,mutation_TA,mutation_TC,mutation_TG,SIFT_pred_D,SIFT_pred_T,Polyphen2_HDIV_pred_B,Polyphen2_HDIV_pred_D,Polyphen2_HDIV_pred_P
0,1,1,93998027,0.049,0.765,4.435338,25.3,0.86,False,True,...,False,False,False,False,False,True,False,False,False,True
1,0,1,93998061,0.053,0.975,2.412747,18.61,0.503,False,False,...,False,False,False,False,False,False,True,False,True,False
2,0,1,93998061,0.268,0.061,1.492491,14.14,0.313,False,False,...,False,False,False,False,False,False,True,True,False,False
3,1,1,94000836,1.0,0.051,2.740497,20.1,0.577,False,False,...,False,False,False,True,False,False,True,True,False,False
4,1,1,94000866,0.0,1.0,4.731917,26.5,0.937,False,False,...,False,False,False,False,False,True,False,False,True,False


In [3]:
# Hold out 15% of the rows for evaluation. The shared seed makes the split
# reproducible. No `stratify` argument is passed, so class proportions in the
# train and test parts are whatever the random shuffle produces.
split_options = {"test_size": 0.15, "random_state": seed}
Xtr, Xte, Ytr, Yte = train_test_split(X, Y, **split_options)

In [4]:
# Majority-class baseline: always predicts the most frequent label seen in
# the training targets, ignoring the feature values.
dummy = DummyClassifier(strategy="most_frequent").fit(Xtr, Ytr)
y_pred = dummy.predict(Xte)

# Report each metric on the held-out split. f1_score is called with its
# defaults, so it scores the class encoded as 1.
baseline_metrics = (
    ("Baseline Accuracy:", accuracy_score),
    ("Baseline F1 Score:", f1_score),
)
for label, metric in baseline_metrics:
    print(label, metric(Yte, y_pred))

Baseline Accuracy: 0.6892138939670932
Baseline F1 Score: 0.0
