In [1]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate

# 1. Read the data

In [2]:
# data_version_1_path = "../data_for_modeling/raw_data/v1/Original Data - v1 - Merge.xlsx"
all_data_path = "../../data_for_modeling/filter_data/all_data/HDAC2_all_final_data.xlsx"
dataset = pd.read_excel(all_data_path, sheet_name='filter_data')
dataset_length = len(dataset)
dataset_length

2138

In [3]:
dataset.head()

Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
0,2746,CC(=O)NC1=CC=C(C=C1)C(=O)NC2=CC=CC=C2N,2.23225,Active,Inactive,12
1,3812,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCCCC(=O)NO,1.308333,Active,Inactive,3
2,3994,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCCC(=O)NO,1.255333,Active,Inactive,3
3,4261,C1=CC=C(C(=C1)N)NC(=O)C2=CC=C(C=C2)CNC(=O)OCC3...,1.526116,Active,Inactive,19
4,5173,C(CCCC(=O)NO)CCC(=O)NO,8.23,Active,Inactive,1


# 2. Profile of the data

## 2.1. Group by original activity

In [56]:
def check_activity_distribution(dataset, col_name, encode):
    dataset_length = len(dataset)
    if not encode:
        active_rows = dataset.loc[dataset[col_name] == "Active"]
        inactive_rows = dataset.loc[dataset[col_name] == "Inactive"]
        inconclusive_rows = dataset.loc[dataset[col_name] == "Inconclusive"]
        unspecified_rows = dataset.loc[dataset[col_name] == "Unspecified"]
        
        print("Total dataset")
        table = [['', 'Active', 'Inactive', 'Inconclusive', 'Unspecified'], 
                ['Number', len(active_rows), len(inactive_rows), len(inconclusive_rows), len(unspecified_rows)],
                ['Percentage (%)', len(active_rows)/dataset_length*100, len(inactive_rows)/dataset_length*100,
                len(inconclusive_rows)/dataset_length*100, len(unspecified_rows)/dataset_length*100]]
        print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))
    else:
        active_rows = dataset.loc[dataset[col_name] == 0]
        inactive_rows = dataset.loc[dataset[col_name] == 1]
        unspecified_rows = dataset.loc[dataset[col_name] == 2]
        print("Total dataset")
        table = [['', 'Active', 'Inactive'], 
                ['Number', len(active_rows), len(inactive_rows), len(unspecified_rows)],
                ['Percentage (%)', len(active_rows)/dataset_length*100, len(inactive_rows)/dataset_length*100,
                 len(unspecified_rows)/dataset_length*100]]
        print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))
    

In [5]:
check_activity_distribution(dataset=dataset, col_name='FIRST_LABEL', encode=False)

Total dataset
╒════════════════╤═══════════╤════════════╤════════════════╤═══════════════╕
│                │    Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪═══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 1527      │  13        │              0 │      598      │
├────────────────┼───────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │   71.4219 │   0.608045 │              0 │       27.9701 │
╘════════════════╧═══════════╧════════════╧════════════════╧═══════════════╛


# 3. Filtering the data

## 3.1. Cannonical Smiles and remove duplicates

In [6]:
def make_canonical_smiles(smiles):
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]
    smiles = [Chem.MolToSmiles(mol) for mol in mols]
    return smiles

In [7]:
cannon_smiles = make_canonical_smiles(dataset.SMILES)
dataset['SMILES'] = cannon_smiles
duplicates_smiles = dataset[dataset['SMILES'].duplicated()]['SMILES'].values
duplicate_index = dataset[dataset['SMILES'].duplicated()]['SMILES'].index
duplicates_smiles

array([], dtype=object)

In [8]:
duplicate_index

Int64Index([], dtype='int64')

In [9]:
dup_smiles_idxs = []
for smiles in duplicates_smiles:
    sub_dataset_dup_smiles = dataset[dataset.SMILES == smiles].copy()
    for idx, _ in sub_dataset_dup_smiles.iterrows():
        dup_smiles_idxs.append(idx)
print(dup_smiles_idxs)
print(len(dup_smiles_idxs))

[]
0


In [10]:
dataset = dataset.drop(dup_smiles_idxs)

In [11]:
#check again
duplicates_smiles = dataset[dataset['SMILES'].duplicated()]['SMILES'].values
duplicates_smiles

array([], dtype=object)

In [12]:
dataset = dataset.reset_index(drop=True)
len(dataset)

2138

In [13]:
check_activity_distribution(dataset, 'FIRST_LABEL', encode=False)

Total dataset
╒════════════════╤═══════════╤════════════╤════════════════╤═══════════════╕
│                │    Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪═══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 1527      │  13        │              0 │      598      │
├────────────────┼───────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │   71.4219 │   0.608045 │              0 │       27.9701 │
╘════════════════╧═══════════╧════════════╧════════════════╧═══════════════╛


## 3.2. Delete unspecified data

In [14]:
# unspecified_rows_idx = dataset.loc[dataset['FIRST_LABEL'] == "Unspecified"].index
# dataset = dataset.drop(unspecified_rows_idx)
# dataset = dataset.reset_index(drop=True)
# len(dataset)

In [15]:
# check_activity_distribution(dataset=dataset, col_name='FIRST_LABEL', encode=False)

# 3. Find labels errors on the data

In [16]:
from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

#Encoding labels
dataset_c = dataset.copy()
# Transform letter grades and notes to categorical numbers.
# Necessary for XGBoost.
dataset['FIRST_LABEL'] = preprocessing.LabelEncoder().fit_transform(dataset['FIRST_LABEL'])
dataset.head()

Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
0,2746,CC(=O)Nc1ccc(C(=O)Nc2ccccc2N)cc1,2.23225,0,Inactive,12
1,3812,CN(C)c1ccc(C(=O)NCCCCCCCC(=O)NO)cc1,1.308333,0,Inactive,3
2,3994,CN(C)c1ccc(C(=O)NCCCCCCC(=O)NO)cc1,1.255333,0,Inactive,3
3,4261,Nc1ccccc1NC(=O)c1ccc(CNC(=O)OCc2cccnc2)cc1,1.526116,0,Inactive,19
4,5173,O=C(CCCCCCC(=O)NO)NO,8.23,0,Inactive,1


In [17]:
dataset_c.head()

Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
0,2746,CC(=O)Nc1ccc(C(=O)Nc2ccccc2N)cc1,2.23225,Active,Inactive,12
1,3812,CN(C)c1ccc(C(=O)NCCCCCCCC(=O)NO)cc1,1.308333,Active,Inactive,3
2,3994,CN(C)c1ccc(C(=O)NCCCCCCC(=O)NO)cc1,1.255333,Active,Inactive,3
3,4261,Nc1ccccc1NC(=O)c1ccc(CNC(=O)OCc2cccnc2)cc1,1.526116,Active,Inactive,19
4,5173,O=C(CCCCCCC(=O)NO)NO,8.23,Active,Inactive,1


In [18]:
# Check for unspec labels
unspec_subset = dataset_c[dataset_c['FIRST_LABEL'] == 'Unspecified'][0:5]
dataset.loc[unspec_subset.index].head()

Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
1540,264,CCCC(=O)O,,2,Inactive,1
1541,3121,CCCC(CCC)C(=O)O,,2,Inactive,2
1542,3810,CN(C)c1ccc(C(=O)NCCCCC(=O)NO)cc1,,2,Inactive,2
1543,4775,O=C(O)CCCc1ccccc1,,2,Inactive,1
1544,53232,CC[C@H](C)C(=O)O[C@H]1C[C@@H](C)C=C2C=C[C@H](C...,,2,Inactive,1


In [19]:
inac_subset = dataset_c[dataset_c['FIRST_LABEL'] == 'Inactive'][0:5]
dataset.loc[inac_subset.index].head()

Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
1527,6917365,COC(=O)[C@H](Cc1ccccc1)n1nnc(-c2ccccc2)c1C#Cc1...,0.0,1,Active,1
1528,60198344,Cn1cc(/C=C/C(=O)Nc2ccccc2N)cn1,15.0,1,Inactive,2
1529,60198412,Cc1cc(/C=C/C(=O)Nc2ccccc2N)on1,15.0,1,Inactive,2
1530,137224531,O=Nc1c(O)n(Cc2ccc(C(=O)NO)cc2)c2ccccc12,30.0,1,Inactive,1
1531,155512415,O=C(NO)[C@H](Cc1ccccc1)n1cc(-c2ccccc2)nn1,0.0,1,Active,1


**We will do this in the MACCS keys**

In [20]:
def maccs_fpts(data):
    Maccs_fpts = []
    for i in data:
        mol = Chem.MolFromSmiles(i)
        fpts = MACCSkeys.GenMACCSKeys(mol)
        mfpts = np.array(fpts)
        Maccs_fpts.append(mfpts)
    return np.array(Maccs_fpts)

In [21]:
smiles = dataset.SMILES
data = maccs_fpts(smiles)
data = pd.DataFrame(data=data)
labels = dataset['FIRST_LABEL']
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,0,0,1,0,0


### 3.1. Getting out-of-sample predicted probabilities

In [22]:
model = XGBClassifier(tree_method="hist", enable_categorical=True)
pred_probs = cross_val_predict(model, data, labels, method='predict_proba')
print(len(pred_probs))
print(pred_probs)

2138
[[9.2634624e-01 4.5108137e-04 7.3202707e-02]
 [9.8972452e-01 1.2777693e-04 1.0147680e-02]
 [9.8972452e-01 1.2777693e-04 1.0147680e-02]
 ...
 [9.3059826e-01 5.8444845e-04 6.8817332e-02]
 [2.1256916e-02 1.1142341e-04 9.7863162e-01]
 [2.8322062e-02 7.7018733e-05 9.7160089e-01]]


### 3.2. Checking model accuracy on original data

Now that we have out-of-sample predicted probabilities, we can also check the model's (cross-val) accuracy on the original (noisy) data, so we'll have a baseline to compare our final results.

In [23]:
preds = np.argmax(pred_probs, axis=1)
acc_original = accuracy_score(preds, labels)
print(f"Accuracy with original data using predict_proba(): {round(acc_original*100,1)}%")

Accuracy with original data using predict_proba(): 73.6%


In [24]:
model.fit(data, labels)
preds_by_predict = model.predict(data)
acc_pred_by_predict = accuracy_score(preds_by_predict, labels)
print(f"Accuracy with original data using predict(): {round(acc_pred_by_predict*100,1)}%")

Accuracy with original data using predict(): 96.7%


### 3.3. Finding the class threshold

In [25]:
def compute_class_thresholds(pred_probs: np.ndarray, labels: np.ndarray) -> np.ndarray:
    n_examples, n_classes = pred_probs.shape
    thresholds = np.zeros(n_classes)
    for k in range(n_classes):
        count = 0
        p_sum = 0
        for i in range(n_examples):
            if labels[i] == k:
                count += 1
                p_sum += pred_probs[i, k]
        thresholds[k] = p_sum / count
    return thresholds

<b>Check the data and its label was right</b>

In [26]:
# should be a numpy array of length 5
thresholds = compute_class_thresholds(pred_probs, labels.to_numpy())
thresholds

array([0.82269753, 0.40187772, 0.4471    ])

### 3.4. Constructing the confident joint

In [27]:
def compute_confident_joint(pred_probs: np.ndarray, labels: np.ndarray, thresholds: np.ndarray) -> np.ndarray:
    n_examples, n_classes = pred_probs.shape
    confident_joint = np.zeros((n_classes, n_classes), dtype=np.int64)
    positions = np.array([[-1, -1]])
    for data_idx in range(n_examples):
        i = labels[data_idx]    #y_noise
        j = None                #y_true -> to find
        #Lưu ý điểm mình bị sai: vị trí của chúng không ứng với label
        p_j = -1
        for candidate_j in range(n_classes):
            p = pred_probs[data_idx, candidate_j]
            if p >= thresholds[candidate_j] and p > p_j:
                j = candidate_j
                p_j = p
        if j is not None:
            confident_joint[i][j] += 1
        positions = np.append(positions, np.array([[i, j]]), axis=0)
    return confident_joint, positions

In [28]:
C, _ = compute_confident_joint(pred_probs, labels.to_numpy(), thresholds)
print(C)
# print(positions)

[[1106   16  229]
 [   4    5    3]
 [ 243    2  280]]


In [29]:
# positions = pd.DataFrame(positions, columns=["i", "j"]).to_excel("../data_for_modeling/filter_data/v1/before_clean_data/positions.xlsx")

### 3.5 Count the number of label issues

In [30]:
num_label_issues = C.sum() - C.trace()
num_label_issues

497

In [31]:
print('Estimated noise rate: {:.1f}%'.format(100*num_label_issues / pred_probs.shape[0]))

Estimated noise rate: 23.2%


### 3.6. Filter out label issues

In [32]:
pred_probs.shape

(2138, 3)

In [33]:
self_confidences = []
for i in range(pred_probs.shape[0]):
    self_confidences.append(pred_probs[i, labels[i]])
self_confidences = np.array(self_confidences)

In [34]:
ranked_indices = np.argsort(self_confidences)
issue_idx = ranked_indices[:num_label_issues]
len(issue_idx)

497

In [35]:
dataset_c.iloc[ranked_indices[:5]]

Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
1529,60198412,Cc1cc(/C=C/C(=O)Nc2ccccc2N)on1,15.0,Inactive,Inactive,2
1530,137224531,O=Nc1c(O)n(Cc2ccc(C(=O)NO)cc2)c2ccccc12,30.0,Inactive,Inactive,1
1535,155565698,COC(=O)c1ccc2ccccc2c1OCCCCCCCCS,0.0,Inactive,Active,1
1933,136985286,CCCc1nn(C)c2c(=O)[nH]c(-c3cc(Cc4ccc(CC(=O)NO)c...,,Unspecified,Inactive,2
1930,136645444,CCCc1nn(C)c2c(=O)[nH]c(-c3cc(-c4ccc(CC(=O)NO)c...,,Unspecified,Inactive,2


### 3.7. Clean the data

In [36]:
clean_dataset = dataset.drop(issue_idx)
clean_dataset = clean_dataset.reset_index()
len(clean_dataset)

1641

In [37]:
clean_dataset.head()

Unnamed: 0,index,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
0,0,2746,CC(=O)Nc1ccc(C(=O)Nc2ccccc2N)cc1,2.23225,0,Inactive,12
1,1,3812,CN(C)c1ccc(C(=O)NCCCCCCCC(=O)NO)cc1,1.308333,0,Inactive,3
2,2,3994,CN(C)c1ccc(C(=O)NCCCCCCC(=O)NO)cc1,1.255333,0,Inactive,3
3,3,4261,Nc1ccccc1NC(=O)c1ccc(CNC(=O)OCc2cccnc2)cc1,1.526116,0,Inactive,19
4,4,5173,O=C(CCCCCCC(=O)NO)NO,8.23,0,Inactive,1


In [38]:
print(len(clean_dataset.loc[clean_dataset['FIRST_LABEL'] == 0])) #Active
print(len(clean_dataset.loc[clean_dataset['FIRST_LABEL'] == 1])) #Inactive
print(len(clean_dataset.loc[clean_dataset['FIRST_LABEL'] == 2])) #Unspecified

1346
5
290


In [39]:
clean_labels = clean_dataset['FIRST_LABEL']
clean_data = maccs_fpts(clean_dataset.SMILES)
clean_data = pd.DataFrame(data=clean_data)

In [40]:
print(clean_data.shape)
print(clean_labels.shape)

(1641, 167)
(1641,)


# 4. Training with clean data

## 4.1. Train-test split

In [63]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(clean_data, clean_labels, test_size=0.3, random_state=1)

In [64]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1148, 167)
(493, 167)
(1148,)
(493,)


In [65]:
print(len(y_train.loc[y_train == 0]))
print(len(y_train.loc[y_train == 1]))
print(len(y_train.loc[y_train == 2]))

947
3
198


In [66]:
print(len(y_test.loc[y_test == 0]))
print(len(y_test.loc[y_test == 1]))
print(len(y_test.loc[y_test == 2]))

399
2
92


# 4.2. Balance the data

In [60]:
print(len(clean_dataset.loc[clean_dataset['FIRST_LABEL'] == 0])) #Active
print(len(clean_dataset.loc[clean_dataset['FIRST_LABEL'] == 1])) #Inactive
print(len(clean_dataset.loc[clean_dataset['FIRST_LABEL'] == 2])) #Unspecified

1346
5
290


In [69]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1, k_neighbors=2)
X_balanced, y_balanced = smote.fit_resample(X_train, y_train)

In [72]:
print(len(y_balanced[y_balanced == 0])) #Active
print(len(y_balanced[y_balanced == 1])) #Inactive
print(len(y_balanced[y_balanced == 2])) #Unspecified

947
947
947


In [73]:
print(X_balanced.shape)
print(y_balanced.shape)

(2841, 167)
(2841,)


## 4.3. Training models

In [45]:
from sklearn.neighbors import KNeighborsClassifier
knn_maccs = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn_maccs.fit(X_train, y_train)

In [46]:
from sklearn.ensemble import RandomForestClassifier
rf_maccs = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
rf_maccs.fit(X_train, y_train)

In [47]:
from sklearn.svm import SVC
svm_maccs = SVC(kernel='rbf', probability=True, random_state=0)
svm_maccs.fit(X_train, y_train)

In [48]:
from xgboost import XGBClassifier
bst_maccs = XGBClassifier(n_estimators=100, objective='binary:logistic')
bst_maccs.fit(X_train, y_train)

## 4.3. Model evaluation

### 4.3.1. Accuracy, Sensitivity, Specificity

In [49]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [50]:
X_Total = np.concatenate((X_train, X_test), axis=0)
y_Total = np.concatenate((y_train, y_test), axis=0)

#KNN
cv = KFold(n_splits=10, random_state=1, shuffle=True)
knn_scores = cross_val_score(knn_maccs, X_Total, y_Total, scoring='accuracy', cv=cv, n_jobs=-1)
print('Độ chính xác của 10-fold cross validation KNN: %.3f (%.3f)' % (knn_scores.mean(), knn_scores.std()))

#Random forest
cv = KFold(n_splits=10, random_state=1, shuffle=True)
rf_scores = cross_val_score(rf_maccs, X_Total, y_Total, scoring='accuracy', cv=cv, n_jobs=-1)
print('Độ chính xác của 10-fold cross validation RF: %.3f (%.3f)' % (rf_scores.mean(), rf_scores.std()))

#SVM
cv = KFold(n_splits=10, random_state=1, shuffle=True)
svm_scores = cross_val_score(svm_maccs, X_Total, y_Total, scoring='accuracy', cv=cv, n_jobs=-1)
print('Độ chính xác của 10-fold cross validation SVM: %.3f (%.3f)' % (svm_scores.mean(), svm_scores.std()))

#xg_boost
cv = KFold(n_splits=10, random_state=1, shuffle=True)
bst_scores = cross_val_score(bst_maccs, X_Total, y_Total, scoring='accuracy', cv=cv, n_jobs=-1)
print('Độ chính xác của 10-fold cross validation XG_Boost: %.3f (%.3f)' % (bst_scores.mean(), bst_scores.std()))

Độ chính xác của 10-fold cross validation KNN: 0.939 (0.023)
Độ chính xác của 10-fold cross validation RF: 0.949 (0.024)
Độ chính xác của 10-fold cross validation SVM: 0.931 (0.017)
Độ chính xác của 10-fold cross validation XG_Boost: 0.965 (0.018)


# Binary classification evaluation

In [73]:
from sklearn.metrics import confusion_matrix, accuracy_score
from tabulate import tabulate
import math

def model_evaluation_calculation(cm):
    tp = cm[0][0]; tn = cm[1][1]; fp = cm[0][1]; fn = cm[1][0]
    ac = (tp+tn)/(tp+tn+fp+fn)
    se = tp/(tp+fn)
    sp = tn/(tn+fp)
    mcc = (tp*tn - fp*fn) / math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    return ac, se, sp, mcc

def me_result(cm, model_name):
    cm_string = "Confusion matrix of " + model_name
    print(cm_string)
    print(cm)
    ac, se, sp, mcc = model_evaluation_calculation(cm)
    print("Comparision:")
    table = [[' ' 'AC', 'SE', 'SP', 'MCC'], [model_name, ac, se, sp, mcc]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [74]:
#KNN
y_knn_pred = knn_maccs.predict(X_test)
knn_cm = confusion_matrix(y_test, y_knn_pred)
me_result(knn_cm, model_name="KNN")

#Random Forest
y_rf_pred = rf_maccs.predict(X_test)
rf_cm = confusion_matrix(y_test, y_rf_pred)
me_result(rf_cm, model_name="Random forest")

#SVM
y_svm_pred = svm_maccs.predict(X_test)
svm_cm = confusion_matrix(y_test, y_svm_pred)
me_result(svm_cm, model_name="SVM")

#XG Boost
y_bst_pred = bst_maccs.predict(X_test)
bst_cm = confusion_matrix(y_test, y_bst_pred)
me_result(bst_cm, model_name="XG Boost")

Confusion matrix of KNN
[[190   0]
 [  0   1]]
Comparision:
╒═════╤═══════╤══════╤══════╤═══════╕
│     │    AC │   SE │   SP │   MCC │
╞═════╪═══════╪══════╪══════╪═══════╡
│ KNN │     1 │    1 │    1 │     1 │
╘═════╧═══════╧══════╧══════╧═══════╛
Confusion matrix of Random forest
[[190   0]
 [  0   1]]
Comparision:
╒═══════════════╤═══════╤══════╤══════╤═══════╕
│               │    AC │   SE │   SP │   MCC │
╞═══════════════╪═══════╪══════╪══════╪═══════╡
│ Random forest │     1 │    1 │    1 │     1 │
╘═══════════════╧═══════╧══════╧══════╧═══════╛
Confusion matrix of SVM
[[190   0]
 [  1   0]]
Comparision:
╒═════╤══════════╤══════════╤══════╤═══════╕
│     │       AC │       SE │   SP │   MCC │
╞═════╪══════════╪══════════╪══════╪═══════╡
│ SVM │ 0.994764 │ 0.994764 │  nan │   nan │
╘═════╧══════════╧══════════╧══════╧═══════╛
Confusion matrix of XG Boost
[[190   0]
 [  0   1]]
Comparision:
╒══════════╤═══════╤══════╤══════╤═══════╕
│          │    AC │   SE │   SP │   MCC │
╞═══

  sp = tn/(tn+fp)
  mcc = (tp*tn - fp*fn) / math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))


### 4.3.1. AUC

In [75]:
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_auc_score

knn_y_proba = knn_maccs.predict_proba(X_test)[:, 1]
rf_y_proba = rf_maccs.predict_proba(X_test)[:, 1]
svm_y_proba = svm_maccs.predict_proba(X_test)[:, 1]
bst_y_proba = bst_maccs.predict_proba(X_test)[:, 1]

knn_auc_score = roc_auc_score(y_test, knn_y_proba)
rf_auc_score = roc_auc_score(y_test, rf_y_proba)
svm_auc_score = roc_auc_score(y_test, svm_y_proba)
bst_auc_score = roc_auc_score(y_test, bst_y_proba)
print(knn_auc_score, rf_auc_score, svm_auc_score, bst_auc_score)

1.0 1.0 1.0 1.0


# Multiple class classification evaluation

In [52]:
from sklearn.metrics import confusion_matrix, accuracy_score
from tabulate import tabulate
import math

In [53]:
#KNN
y_knn_pred = knn_maccs.predict(X_test)
knn_cm = confusion_matrix(y_test, y_knn_pred)

#Random Forest
y_rf_pred = rf_maccs.predict(X_test)
rf_cm = confusion_matrix(y_test, y_rf_pred)

#SVM
y_svm_pred = svm_maccs.predict(X_test)
svm_cm = confusion_matrix(y_test, y_svm_pred)

#XG Boost
y_bst_pred = bst_maccs.predict(X_test)
bst_cm = confusion_matrix(y_test, y_bst_pred)

__Confusion matrix__

In [54]:
print("KNN confusion matrix: ")
print(knn_cm)
print("RF confusion matrix: ")
print(rf_cm)
print("SVM confusion matrix: ")
print(svm_cm)
print("XgBoost confusion matrix: ")
print(bst_cm)

KNN confusion matrix: 
[[391   0   8]
 [  2   0   0]
 [ 25   0  67]]
RF confusion matrix: 
[[396   0   3]
 [  1   1   0]
 [ 14   0  78]]
SVM confusion matrix: 
[[398   0   1]
 [  1   0   1]
 [ 39   0  53]]
XgBoost confusion matrix: 
[[397   0   2]
 [  0   1   1]
 [  9   0  83]]


__Precision__

In [74]:
from sklearn.metrics import precision_score
knn_precision = precision_score(y_true=y_test, y_pred=y_knn_pred, average='macro')
rf_precision = precision_score(y_true=y_test, y_pred=y_rf_pred, average='macro')
svm_precision = precision_score(y_true=y_test, y_pred=y_svm_pred, average='macro')
bst_precision = precision_score(y_true=y_test, y_pred=y_bst_pred, average='macro')
table = [['KNN precision', 'RF precision', 'SVM precision', 'XgBoost precision'], [knn_precision, rf_precision, svm_precision, bst_precision]]
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

╒═════════════════╤════════════════╤═════════════════╤═════════════════════╕
│   KNN precision │   RF precision │   SVM precision │   XgBoost precision │
╞═════════════════╪════════════════╪═════════════════╪═════════════════════╡
│         0.60958 │       0.975489 │        0.624104 │            0.980983 │
╘═════════════════╧════════════════╧═════════════════╧═════════════════════╛


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


__Recall score__

In [75]:
from sklearn.metrics import recall_score
knn_recall = recall_score(y_true=y_test, y_pred=y_knn_pred, average='macro')
rf_recall = recall_score(y_true=y_test, y_pred=y_rf_pred, average='macro')
svm_recall = recall_score(y_true=y_test, y_pred=y_svm_pred, average='macro')
bst_recall = recall_score(y_true=y_test, y_pred=y_bst_pred, average='macro')
table = [['KNN recall', 'RF recall', 'SVM recall', 'XgBoost recall'], [knn_recall, rf_recall, svm_recall, bst_recall]]
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

╒══════════════╤═════════════╤══════════════╤══════════════════╕
│   KNN recall │   RF recall │   SVM recall │   XgBoost recall │
╞══════════════╪═════════════╪══════════════╪══════════════════╡
│     0.569404 │    0.780102 │     0.524527 │         0.799054 │
╘══════════════╧═════════════╧══════════════╧══════════════════╛


__F1 Score__

In [76]:
from sklearn.metrics import f1_score
knn_f1 = f1_score(y_true=y_test, y_pred=y_knn_pred, average='macro')
rf_f1 = f1_score(y_true=y_test, y_pred=y_rf_pred, average='macro')
svm_f1 = f1_score(y_true=y_test, y_pred=y_svm_pred, average='macro')
bst_f1 = f1_score(y_true=y_test, y_pred=y_bst_pred, average='macro')
table = [['KNN f1', 'RF f1', 'SVM f1', 'XgBoost f1'], [knn_f1, rf_f1, svm_f1, bst_f1]]
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

╒══════════╤══════════╤══════════╤══════════════╕
│   KNN f1 │    RF f1 │   SVM f1 │   XgBoost f1 │
╞══════════╪══════════╪══════════╪══════════════╡
│ 0.586519 │ 0.848726 │ 0.557368 │     0.861862 │
╘══════════╧══════════╧══════════╧══════════════╛


__AUC__

In [86]:
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

knn_y_pred_proba = knn_maccs.predict_proba(X_test)
rf_y_pred_proba = rf_maccs.predict_proba(X_test)
svm_y_pred_proba = svm_maccs.predict_proba(X_test)
bst_y_pred_proba = bst_maccs.predict_proba(X_test)

# Assuming your target variable has more than two classes
# Binarize the target variable
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])  # Update classes accordingly

# Calculate the ROC AUC score for each class
knn_roc_auc_scores = []
rf_roc_auc_scores = []
svm_roc_auc_scores = []
bst_roc_auc_scores = []

for i in range(y_test_bin.shape[1]):
    knn_roc_auc = roc_auc_score(y_test_bin[:, i], knn_y_pred_proba[:, i])
    rf_roc_auc = roc_auc_score(y_test_bin[:, i], rf_y_pred_proba[:, i])
    svm_roc_auc = roc_auc_score(y_test_bin[:, i], svm_y_pred_proba[:, i])
    bst_roc_auc = roc_auc_score(y_test_bin[:, i], bst_y_pred_proba[:, i])

    knn_roc_auc_scores.append(knn_roc_auc)
    rf_roc_auc_scores.append(rf_roc_auc)
    svm_roc_auc_scores.append(svm_roc_auc)
    bst_roc_auc_scores.append(bst_roc_auc)

auc_table = [['KNN AUC score', 'RF AUC score', 'SVM AUC score', 'XgBoost AUC score'], [knn_roc_auc_scores, rf_roc_auc_scores, svm_roc_auc_scores, bst_roc_auc_scores]]
print(tabulate(auc_table, headers='firstrow', tablefmt='fancy_grid'))

╒═══════════════════════════════════════════════╤═══════════════════════════════════════════════╤══════════════════════════════════════════════════════════════╤═══════════════════════════════════════════════╕
│ KNN AUC score                                 │ RF AUC score                                  │ SVM AUC score                                                │ XgBoost AUC score                             │
╞═══════════════════════════════════════════════╪═══════════════════════════════════════════════╪══════════════════════════════════════════════════════════════╪═══════════════════════════════════════════════╡
│ [0.9548738868447715, 1.0, 0.9545700964978857] │ [0.9845358076041166, 1.0, 0.9843868589396074] │ [0.9545139444355569, 0.8665987780040734, 0.9514799956630163] │ [0.9872820348744201, 1.0, 0.9867721999349451] │
╘═══════════════════════════════════════════════╧═══════════════════════════════════════════════╧══════════════════════════════════════════════════════════════╧════