In [1]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate

# 1. Read the data

In [2]:
data_version_1_path = "../data_for_modeling/raw_data/v1/Original Data - v1 - Merge.xlsx"
dataset = pd.read_excel(data_version_1_path, sheet_name='original_data')
dataset_length = len(dataset)
dataset_length

993

In [3]:
dataset.head()

Unnamed: 0,STT,SMILES,CID,IC50 (uM),FIRST_LABEL,ACTIVITY,DUPLICATE_COUNTS
0,0,CSC1=CC2=C(C=C1)SC3=CC=CC=C3N2CC4=CC=C(C=C4)C(...,164629157,0.68,Active,Active,1
1,1,CC1=C(C2=CC=CC=C2N1)CCNCC3=CC=C(C=C3)C=CC(=O)N...,155525662,4.214,Active,Inactive,1
2,2,C1=CC=C2C(=C1)N(C3=C(S2=O)C=CC(=C3)C(F)(F)F)CC...,164627475,2.12,Active,Inactive,1
3,3,CC(C)(C)OC(=O)NC1=CC=C(C=C1)C2=CC(=NO2)NC(=O)C...,164627446,0.252,Active,Active,1
4,4,CCCC[C@@H](C1=NC=C(N1)C2=CC3=CC=CC=C3N=C2OC)NC...,164627330,2.00525,Active,Inactive,4


# 2. Profile of the data

## 2.1. Group by original activity

In [4]:
def check_activity_distribution(dataset, col_name, encode):
    dataset_length = len(dataset)
    if not encode:
        active_rows = dataset.loc[dataset[col_name] == "Active"]
        inactive_rows = dataset.loc[dataset[col_name] == "Inactive"]
        inconclusive_rows = dataset.loc[dataset[col_name] == "Inconclusive"]
        unspecified_rows = dataset.loc[dataset[col_name] == "Unspecified"]
        
        print("Total dataset")
        table = [['', 'Active', 'Inactive', 'Inconclusive', 'Unspecified'], 
                ['Number', len(active_rows), len(inactive_rows), len(inconclusive_rows), len(unspecified_rows)],
                ['Percentage (%)', len(active_rows)/dataset_length*100, len(inactive_rows)/dataset_length*100,
                len(inconclusive_rows)/dataset_length*100, len(unspecified_rows)/dataset_length*100]]
        print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))
    else:
        active_rows = dataset.loc[dataset[col_name] == 0]
        inactive_rows = dataset.loc[dataset[col_name] == 1]
        print("Total dataset")
        table = [['', 'Active', 'Inactive'], 
                ['Number', len(active_rows), len(inactive_rows)],
                ['Percentage (%)', len(active_rows)/dataset_length*100, len(inactive_rows)/dataset_length*100]]
        print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))
    

In [5]:
check_activity_distribution(dataset=dataset, col_name='FIRST_LABEL', encode=False)

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 769      │   10       │              0 │      214      │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  77.4421 │    1.00705 │              0 │       21.5509 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


# 3. Filtering the data

## 3.1. Cannonical Smiles and remove duplicates

In [6]:
def make_canonical_smiles(smiles):
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]
    smiles = [Chem.MolToSmiles(mol) for mol in mols]
    return smiles

In [7]:
cannon_smiles = make_canonical_smiles(dataset.SMILES)
dataset['SMILES'] = cannon_smiles
duplicates_smiles = dataset[dataset['SMILES'].duplicated()]['SMILES'].values
duplicate_index = dataset[dataset['SMILES'].duplicated()]['SMILES'].index
duplicates_smiles

array(['CC(Nc1ccccc1)c1ccc(C(=O)Nc2ccccc2N)cc1',
       'CC(Nc1ccccc1)c1ccc(C(=O)Nc2ccccc2N)cc1',
       'O=C1CCC=CCCC(=O)N(Cc2cccc(C=NNC(=O)c3cccc(C(=O)NO)c3)c2)CC(c2ccccc2)O1',
       'CC(C)C1NC(=O)C2(C)CSC(=N2)c2csc(n2)CNC(=O)CC(C(F)=CCCS)OC1=O',
       'O=C(CCC(CCCC(=O)Nc1ccccc1)Cc1ccccc1)NO',
       'O=C(CCC(CCCC(=O)Nc1ccccc1)Cc1ccccc1)NO',
       'CCCCCCC(CCCCCC(=O)Nc1ccccc1)C(=O)NO',
       'COc1ccc(COC(CCCCCC(=O)NO)C(=O)Nc2ccccc2)cc1',
       'COc1ccc(COC(CCCCCC(=O)NO)C(=O)Nc2ccccc2)cc1'], dtype=object)

In [9]:
duplicate_index

Int64Index([252, 270, 290, 706, 912, 913, 917, 975, 977], dtype='int64')

In [11]:
dup_smiles_idxs = []
for smiles in duplicates_smiles:
    sub_dataset_dup_smiles = dataset[dataset.SMILES == smiles].copy()
    for idx, _ in sub_dataset_dup_smiles.iterrows():
        dup_smiles_idxs.append(idx)
print(dup_smiles_idxs)
print(len(dup_smiles_idxs))

[249, 252, 270, 249, 252, 270, 260, 290, 704, 706, 910, 912, 913, 910, 912, 913, 915, 917, 974, 975, 977, 974, 975, 977]
24


In [12]:
dataset = dataset.drop(dup_smiles_idxs)

In [13]:
#check again
duplicates_smiles = dataset[dataset['SMILES'].duplicated()]['SMILES'].values
duplicates_smiles

array([], dtype=object)

In [14]:
dataset = dataset.reset_index(drop=True)
len(dataset)

978

In [15]:
check_activity_distribution(dataset, 'FIRST_LABEL', encode=False)

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 762      │   10       │              0 │      206      │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  77.9141 │    1.02249 │              0 │       21.0634 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


## 3.2. Delete unspecified data

In [16]:
unspecified_rows_idx = dataset.loc[dataset['FIRST_LABEL'] == "Unspecified"].index
dataset = dataset.drop(unspecified_rows_idx)
dataset = dataset.reset_index(drop=True)
len(dataset)

In [19]:
check_activity_distribution(dataset=dataset, col_name='FIRST_LABEL', encode=False)

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 762      │   10       │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  98.7047 │    1.29534 │              0 │             0 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


# 3. Find labels errors on the data

In [21]:
from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

#Encoding labels
dataset_c = dataset.copy()
# Transform letter grades and notes to categorical numbers.
# Necessary for XGBoost.
dataset['FIRST_LABEL'] = preprocessing.LabelEncoder().fit_transform(dataset['FIRST_LABEL'])
dataset.head()

Unnamed: 0,STT,SMILES,CID,IC50 (uM),FIRST_LABEL,ACTIVITY,DUPLICATE_COUNTS
0,0,CSc1ccc2c(c1)N(Cc1ccc(C(=O)NO)cc1)c1ccccc1S2,164629157,0.68,0,Active,1
1,1,COC(=O)CCNNC(=O)C=Cc1ccc(CNCCc2c(C)[nH]c3ccccc...,155525662,4.214,0,Inactive,1
2,2,O=C(NO)c1ccc(CN2c3ccccc3S(=O)c3ccc(C(F)(F)F)cc...,164627475,2.12,0,Inactive,1
3,3,CC(C)(C)OC(=O)Nc1ccc(-c2cc(NC(=O)CCCCCCC(=O)NO...,164627446,0.252,0,Active,1
4,4,CCCC[C@H](NC(=O)[C@H](CN)c1c(C)[nH]c2ccc(OC)cc...,164627330,2.00525,0,Inactive,4


In [22]:
dataset_c.head()

Unnamed: 0,STT,SMILES,CID,IC50 (uM),FIRST_LABEL,ACTIVITY,DUPLICATE_COUNTS
0,0,CSc1ccc2c(c1)N(Cc1ccc(C(=O)NO)cc1)c1ccccc1S2,164629157,0.68,Active,Active,1
1,1,COC(=O)CCNNC(=O)C=Cc1ccc(CNCCc2c(C)[nH]c3ccccc...,155525662,4.214,Active,Inactive,1
2,2,O=C(NO)c1ccc(CN2c3ccccc3S(=O)c3ccc(C(F)(F)F)cc...,164627475,2.12,Active,Inactive,1
3,3,CC(C)(C)OC(=O)Nc1ccc(-c2cc(NC(=O)CCCCCCC(=O)NO...,164627446,0.252,Active,Active,1
4,4,CCCC[C@H](NC(=O)[C@H](CN)c1c(C)[nH]c2ccc(OC)cc...,164627330,2.00525,Active,Inactive,4


**We will do this in the MACCS keys**

In [23]:
def maccs_fpts(data):
    Maccs_fpts = []
    for i in data:
        mol = Chem.MolFromSmiles(i)
        fpts = MACCSkeys.GenMACCSKeys(mol)
        mfpts = np.array(fpts)
        Maccs_fpts.append(mfpts)
    return np.array(Maccs_fpts)

In [24]:
smiles = dataset.SMILES
data = maccs_fpts(smiles)
data = pd.DataFrame(data=data)
labels = dataset['FIRST_LABEL']
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
2,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0


### 3.1. Getting out-of-sample predicted probabilities

In [25]:
model = XGBClassifier(tree_method="hist", enable_categorical=True)
pred_probs = cross_val_predict(model, data, labels, method='predict_proba')
print(len(pred_probs))
print(pred_probs)

772
[[9.98952389e-01 1.04763755e-03]
 [9.98971462e-01 1.02851458e-03]
 [9.99838233e-01 1.61791220e-04]
 ...
 [1.08249784e-01 8.91750216e-01]
 [2.53913224e-01 7.46086776e-01]
 [1.12553835e-01 8.87446165e-01]]


### 3.2. Checking model accuracy on original data

Now that we have out-of-sample predicted probabilities, we can also check the model's (cross-val) accuracy on the original (noisy) data, so we'll have a baseline to compare our final results.

In [26]:
preds = np.argmax(pred_probs, axis=1)
acc_original = accuracy_score(preds, labels)
print(f"Accuracy with original data: {round(acc_original*100,1)}%")

Accuracy with original data: 98.4%


In [27]:
model.fit(data, labels)
preds_by_predict = model.predict(data)
acc_pred_by_predict = accuracy_score(preds_by_predict, labels)
print(f"Accuracy with original data: {round(acc_pred_by_predict*100,1)}%")

Accuracy with original data: 99.6%


### 3.3. Finding the class threshold

In [28]:
def compute_class_thresholds(pred_probs: np.ndarray, labels: np.ndarray) -> np.ndarray:
    n_examples, n_classes = pred_probs.shape
    thresholds = np.zeros(n_classes)
    for k in range(n_classes):
        count = 0
        p_sum = 0
        for i in range(n_examples):
            if labels[i] == k:
                count += 1
                p_sum += pred_probs[i, k]
        thresholds[k] = p_sum / count
    return thresholds

<b>Check the data and its label was right</b>

In [29]:
print(dataset_c.loc[249]['FIRST_LABEL'])
print("label: " + str(labels.to_numpy()[249]))

Active
label: 0


In [30]:
# should be a numpy array of length 5
thresholds = compute_class_thresholds(pred_probs, labels.to_numpy())
thresholds

array([0.98759236, 0.59898184])

### 3.4. Constructing the confident joint

In [37]:
def compute_confident_joint(pred_probs: np.ndarray, labels: np.ndarray, thresholds: np.ndarray) -> np.ndarray:
    n_examples, n_classes = pred_probs.shape
    confident_joint = np.zeros((n_classes, n_classes), dtype=np.int64)
    positions = np.array([[-1, -1]])
    for data_idx in range(n_examples):
        i = labels[data_idx]    #y_noise
        j = None                #y_true -> to find
        #Lưu ý điểm mình bị sai: vị trí của chúng không ứng với label
        p_j = -1
        for candidate_j in range(n_classes):
            p = pred_probs[data_idx, candidate_j]
            if p >= thresholds[candidate_j] and p > p_j:
                j = candidate_j
                p_j = p
        if j is not None:
            confident_joint[i][j] += 1
        positions = np.append(positions, np.array([[i, j]]), axis=0)
    return confident_joint, positions

In [38]:
C, _ = compute_confident_joint(pred_probs, labels.to_numpy(), thresholds)
print(C)
# print(positions)

[[696   8]
 [  1   7]]


In [31]:
# positions = pd.DataFrame(positions, columns=["i", "j"]).to_excel("../data_for_modeling/filter_data/v1/before_clean_data/positions.xlsx")

### 3.5 Count the number of label issues

In [39]:
num_label_issues = C.sum() - C.trace()
num_label_issues

9

In [40]:
print('Estimated noise rate: {:.1f}%'.format(100*num_label_issues / pred_probs.shape[0]))

Estimated noise rate: 1.2%


### 3.6. Filter out label issues

In [41]:
pred_probs.shape

(772, 2)

In [42]:
self_confidences = []
for i in range(pred_probs.shape[0]):
    self_confidences.append(pred_probs[i, labels[i]])
self_confidences = np.array(self_confidences)

In [43]:
ranked_indices = np.argsort(self_confidences)
issue_idx = ranked_indices[:num_label_issues]
len(issue_idx)

9

In [44]:
issue_idx

array([766, 762, 763, 192, 219, 214, 221, 224, 211])

In [45]:
dataset_c.iloc[ranked_indices[:5]]

Unnamed: 0,STT,SMILES,CID,IC50 (uM),FIRST_LABEL,ACTIVITY,DUPLICATE_COUNTS
766,773,COC(=O)c1ccc2ccccc2c1OCCCCCCCCS,155565698,0.0,Inactive,Inactive,1
762,769,O=C(/C=C/c1ccc(S(=O)(=O)NCCCN2c3ccccc3CCc3ccc(...,156021102,0.0,Inactive,Inactive,1
763,770,O=C(CCC(=O)NCCCN1c2ccccc2CCc2ccc(Cl)cc21)NO,156018870,0.0,Inactive,Inactive,1
192,192,O=C(/C=C/c1cccc(S(=O)(=O)NCCCN2c3ccccc3CCc3ccc...,156022065,8.48,Active,Inactive,1
219,219,O=C(/C=C/c1ccc(CNCCCN2c3ccccc3CCc3ccc(Cl)cc32)...,156014407,6.62,Active,Inactive,1


In [47]:
dataset_c.loc[issue_idx[0]]

STT                                             773
SMILES              COC(=O)c1ccc2ccccc2c1OCCCCCCCCS
CID                                       155565698
IC50 (uM)                                       0.0
FIRST_LABEL                                Inactive
ACTIVITY                                   Inactive
DUPLICATE_COUNTS                                  1
Name: 766, dtype: object

In [48]:
dataset.loc[249]

STT                                                       250
SMILES              O=C(NO)c1ccc(Cn2cc(CN3CCCC3)c3ccccc32)cc1
CID                                                 155564870
IC50 (uM)                                             6.30957
FIRST_LABEL                                                 0
ACTIVITY                                             Inactive
DUPLICATE_COUNTS                                            1
Name: 249, dtype: object

### 3.7. Clean the data

In [60]:
clean_dataset = dataset.drop(issue_idx)
clean_dataset = clean_dataset.reset_index()
len(clean_dataset)

763

In [61]:
clean_dataset.head()

Unnamed: 0,index,STT,SMILES,CID,IC50 (uM),FIRST_LABEL,ACTIVITY,DUPLICATE_COUNTS
0,0,0,CSc1ccc2c(c1)N(Cc1ccc(C(=O)NO)cc1)c1ccccc1S2,164629157,0.68,0,Active,1
1,1,1,COC(=O)CCNNC(=O)C=Cc1ccc(CNCCc2c(C)[nH]c3ccccc...,155525662,4.214,0,Inactive,1
2,2,2,O=C(NO)c1ccc(CN2c3ccccc3S(=O)c3ccc(C(F)(F)F)cc...,164627475,2.12,0,Inactive,1
3,3,3,CC(C)(C)OC(=O)Nc1ccc(-c2cc(NC(=O)CCCCCCC(=O)NO...,164627446,0.252,0,Active,1
4,4,4,CCCC[C@H](NC(=O)[C@H](CN)c1c(C)[nH]c2ccc(OC)cc...,164627330,2.00525,0,Inactive,4


In [59]:
print(len(clean_dataset.loc[clean_dataset['FIRST_LABEL'] == 0]))
print(len(clean_dataset.loc[clean_dataset['FIRST_LABEL'] == 1]))

756
7


In [62]:
clean_labels = clean_dataset['FIRST_LABEL']
clean_data = maccs_fpts(clean_dataset.SMILES)
clean_data = pd.DataFrame(data=clean_data)

In [64]:
print(clean_data.shape)
print(clean_labels.shape)

(763, 167)
(763,)


# 4. Training with clean data

## 4.1. Train-test split

In [76]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(clean_data, clean_labels, test_size=0.3, random_state=1)

In [77]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(534, 167)
(229, 167)
(534,)
(229,)


In [85]:
print(len(y_train.loc[y_train == 1]))
print(len(y_train.loc[y_train == 0]))

5
529


In [84]:
print(len(y_test.loc[y_test == 1]))
print(len(y_test.loc[y_test == 0]))

2
227


## 4.2. Training models

In [67]:
from sklearn.neighbors import KNeighborsClassifier
knn_maccs = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn_maccs.fit(X_train, y_train)

In [68]:
from sklearn.ensemble import RandomForestClassifier
rf_maccs = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
rf_maccs.fit(X_train, y_train)

In [69]:
from sklearn.svm import SVC
svm_maccs = SVC(kernel='rbf', probability=True, random_state=0)
svm_maccs.fit(X_train, y_train)

In [70]:
from xgboost import XGBClassifier
bst_maccs = XGBClassifier(n_estimators=100, objective='binary:logistic')
bst_maccs.fit(X_train, y_train)

## 4.3. Model evaluation

### 4.3.1. Accuracy, Sensitivity, Specificity

In [71]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [72]:
X_Total = np.concatenate((X_train, X_test), axis=0)
y_Total = np.concatenate((y_train, y_test), axis=0)

#KNN
cv = KFold(n_splits=10, random_state=1, shuffle=True)
knn_scores = cross_val_score(knn_maccs, X_Total, y_Total, scoring='accuracy', cv=cv, n_jobs=-1)
print('Độ chính xác của 10-fold cross validation KNN: %.3f (%.3f)' % (knn_scores.mean(), knn_scores.std()))

#Random forest
cv = KFold(n_splits=10, random_state=1, shuffle=True)
rf_scores = cross_val_score(rf_maccs, X_Total, y_Total, scoring='accuracy', cv=cv, n_jobs=-1)
print('Độ chính xác của 10-fold cross validation RF: %.3f (%.3f)' % (rf_scores.mean(), rf_scores.std()))

#SVM
cv = KFold(n_splits=10, random_state=1, shuffle=True)
svm_scores = cross_val_score(svm_maccs, X_Total, y_Total, scoring='accuracy', cv=cv, n_jobs=-1)
print('Độ chính xác của 10-fold cross validation SVM: %.3f (%.3f)' % (svm_scores.mean(), svm_scores.std()))

#xg_boost
cv = KFold(n_splits=10, random_state=1, shuffle=True)
bst_scores = cross_val_score(bst_maccs, X_Total, y_Total, scoring='accuracy', cv=cv, n_jobs=-1)
print('Độ chính xác của 10-fold cross validation XG_Boost: %.3f (%.3f)' % (bst_scores.mean(), bst_scores.std()))

Độ chính xác của 10-fold cross validation KNN: 0.996 (0.006)
Độ chính xác của 10-fold cross validation RF: 0.996 (0.006)
Độ chính xác của 10-fold cross validation SVM: 0.991 (0.008)
Độ chính xác của 10-fold cross validation XG_Boost: 0.996 (0.006)


In [73]:
from sklearn.metrics import confusion_matrix, accuracy_score
from tabulate import tabulate
import math

def model_evaluation_calculation(cm):
    tp = cm[0][0]; tn = cm[1][1]; fp = cm[0][1]; fn = cm[1][0]
    ac = (tp+tn)/(tp+tn+fp+fn)
    se = tp/(tp+fn)
    sp = tn/(tn+fp)
    mcc = (tp*tn - fp*fn) / math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    return ac, se, sp, mcc

def me_result(cm, model_name):
    cm_string = "Confusion matrix of " + model_name
    print(cm_string)
    print(cm)
    ac, se, sp, mcc = model_evaluation_calculation(cm)
    print("Comparision:")
    table = [[' ' 'AC', 'SE', 'SP', 'MCC'], [model_name, ac, se, sp, mcc]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [74]:
#KNN
y_knn_pred = knn_maccs.predict(X_test)
knn_cm = confusion_matrix(y_test, y_knn_pred)
me_result(knn_cm, model_name="KNN")

#Random Forest
y_rf_pred = rf_maccs.predict(X_test)
rf_cm = confusion_matrix(y_test, y_rf_pred)
me_result(rf_cm, model_name="Random forest")

#SVM
y_svm_pred = svm_maccs.predict(X_test)
svm_cm = confusion_matrix(y_test, y_svm_pred)
me_result(svm_cm, model_name="SVM")

#XG Boost
y_bst_pred = bst_maccs.predict(X_test)
bst_cm = confusion_matrix(y_test, y_bst_pred)
me_result(bst_cm, model_name="XG Boost")

Confusion matrix of KNN
[[190   0]
 [  0   1]]
Comparision:
╒═════╤═══════╤══════╤══════╤═══════╕
│     │    AC │   SE │   SP │   MCC │
╞═════╪═══════╪══════╪══════╪═══════╡
│ KNN │     1 │    1 │    1 │     1 │
╘═════╧═══════╧══════╧══════╧═══════╛
Confusion matrix of Random forest
[[190   0]
 [  0   1]]
Comparision:
╒═══════════════╤═══════╤══════╤══════╤═══════╕
│               │    AC │   SE │   SP │   MCC │
╞═══════════════╪═══════╪══════╪══════╪═══════╡
│ Random forest │     1 │    1 │    1 │     1 │
╘═══════════════╧═══════╧══════╧══════╧═══════╛
Confusion matrix of SVM
[[190   0]
 [  1   0]]
Comparision:
╒═════╤══════════╤══════════╤══════╤═══════╕
│     │       AC │       SE │   SP │   MCC │
╞═════╪══════════╪══════════╪══════╪═══════╡
│ SVM │ 0.994764 │ 0.994764 │  nan │   nan │
╘═════╧══════════╧══════════╧══════╧═══════╛
Confusion matrix of XG Boost
[[190   0]
 [  0   1]]
Comparision:
╒══════════╤═══════╤══════╤══════╤═══════╕
│          │    AC │   SE │   SP │   MCC │
╞═══

  sp = tn/(tn+fp)
  mcc = (tp*tn - fp*fn) / math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))


### 4.3.1. AUC

In [75]:
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_auc_score

knn_y_proba = knn_maccs.predict_proba(X_test)[:, 1]
rf_y_proba = rf_maccs.predict_proba(X_test)[:, 1]
svm_y_proba = svm_maccs.predict_proba(X_test)[:, 1]
bst_y_proba = bst_maccs.predict_proba(X_test)[:, 1]

knn_auc_score = roc_auc_score(y_test, knn_y_proba)
rf_auc_score = roc_auc_score(y_test, rf_y_proba)
svm_auc_score = roc_auc_score(y_test, svm_y_proba)
bst_auc_score = roc_auc_score(y_test, bst_y_proba)
print(knn_auc_score, rf_auc_score, svm_auc_score, bst_auc_score)

1.0 1.0 1.0 1.0
