In [91]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate

# Read the data

In [92]:
data_version_1_path = "../data_for_modeling/raw_data/v1/Original Data - v1 - Merge.xlsx"
dataset = pd.read_excel(data_version_1_path, sheet_name='original_data')
dataset_length = len(dataset)
dataset_length

993

# Profile of the data

## 1. Group by original activity

In [44]:
def draw_table_for_activity_counts(dataset, col_name):
    active_rows = dataset.loc[dataset[col_name] == "Active"]
    inactive_rows = dataset.loc[dataset[col_name] == "Inactive"]
    inconclusive_rows = dataset.loc[dataset[col_name] == "Inconclusive"]
    unspecified_rows = dataset.loc[dataset[col_name] == "Unspecified"]

    print("Total dataset")
    table = [['', 'Active', 'Inactive', 'Inconclusive', 'Unspecified'], 
            ['Number', len(active_rows), len(inactive_rows), len(inconclusive_rows), len(unspecified_rows)],
            ['Percentage (%)', len(active_rows)/dataset_length*100, len(inactive_rows)/dataset_length*100,
            len(inconclusive_rows)/dataset_length*100, len(unspecified_rows)/dataset_length*100]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [45]:
draw_table_for_activity_counts(dataset=dataset, col_name='FIRST_LABEL')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 769      │   10       │              0 │      214      │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  77.4421 │    1.00705 │              0 │       21.5509 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


## 2. Group by the new activity

In [46]:
draw_table_for_activity_counts(dataset=dataset, col_name='ACTIVITY')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 501      │   492      │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  50.4532 │    49.5468 │              0 │             0 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


# Filtering the data

## 1. Cannonical Smiles and remove duplicates

In [47]:
def make_canonical_smiles(smiles):
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]
    smiles = [Chem.MolToSmiles(mol) for mol in mols]
    return smiles

In [48]:
cannon_smiles = make_canonical_smiles(dataset.SMILES)
dataset['SMILES'] = cannon_smiles
duplicates_smiles = dataset[dataset['SMILES'].duplicated()]['SMILES'].values
duplicates_smiles

array(['CC(Nc1ccccc1)c1ccc(C(=O)Nc2ccccc2N)cc1',
       'CC(Nc1ccccc1)c1ccc(C(=O)Nc2ccccc2N)cc1',
       'O=C1CCC=CCCC(=O)N(Cc2cccc(C=NNC(=O)c3cccc(C(=O)NO)c3)c2)CC(c2ccccc2)O1',
       'CC(C)C1NC(=O)C2(C)CSC(=N2)c2csc(n2)CNC(=O)CC(C(F)=CCCS)OC1=O',
       'O=C(CCC(CCCC(=O)Nc1ccccc1)Cc1ccccc1)NO',
       'O=C(CCC(CCCC(=O)Nc1ccccc1)Cc1ccccc1)NO',
       'CCCCCCC(CCCCCC(=O)Nc1ccccc1)C(=O)NO',
       'COc1ccc(COC(CCCCCC(=O)NO)C(=O)Nc2ccccc2)cc1',
       'COc1ccc(COC(CCCCCC(=O)NO)C(=O)Nc2ccccc2)cc1'], dtype=object)

In [49]:
dup_smiles_idxs = []
for smiles in duplicates_smiles:
    sub_dataset_dup_smiles = dataset[dataset.SMILES == smiles].copy()
    for idx, _ in sub_dataset_dup_smiles.iterrows():
        dup_smiles_idxs.append(idx)
print(dup_smiles_idxs)
print(len(dup_smiles_idxs))

[249, 252, 270, 249, 252, 270, 260, 290, 704, 706, 910, 912, 913, 910, 912, 913, 915, 917, 974, 975, 977, 974, 975, 977]
24


In [50]:
dataset = dataset.drop(dup_smiles_idxs)

In [51]:
#check again
duplicates_smiles = dataset[dataset['SMILES'].duplicated()]['SMILES'].values
duplicates_smiles

array([], dtype=object)

In [52]:
len(dataset)

978

## 2. Check for label intersection

In [53]:
active_rows = dataset.loc[dataset['FIRST_LABEL'] == "Active"]
inactive_rows = dataset.loc[dataset['FIRST_LABEL'] == "Inactive"]
inconclusive_rows = dataset.loc[dataset['FIRST_LABEL'] == "Inconclusive"]
unspecified_rows = dataset.loc[dataset['FIRST_LABEL'] == "Unspecified"]
    
cid_active = active_rows.loc[:, 'CID']
cid_inactive = inactive_rows.loc[:, 'CID']
cid_incon = inconclusive_rows.loc[:, 'CID']
cid_unspec = unspecified_rows.loc[:, 'CID']

ac_inac_cid = np.intersect1d(cid_active, cid_inactive)
ac_incon_cid = np.intersect1d(cid_active, cid_incon)
ac_unspec_cid = np.intersect1d(cid_active, cid_unspec)

inac_incon_cid = np.intersect1d(cid_inactive, cid_incon)
incon_unspec_cid = np.intersect1d(cid_incon, cid_unspec)
inac_unspec_cid = np.intersect1d(cid_inactive, cid_unspec)
print(len(ac_inac_cid), 
      len(ac_incon_cid), 
      len(ac_unspec_cid), 
      len(inac_incon_cid), 
      len(incon_unspec_cid), 
      len(inac_unspec_cid))

0 0 0 0 0 0


## 3. Labels errors

In [54]:
#Encoding labels
dataset.loc[dataset.ACTIVITY == 'Active', 'ACTIVITY'] = 1
dataset.loc[dataset.ACTIVITY == 'Inactive', 'ACTIVITY'] = 0
dataset.head()

Unnamed: 0,STT,SMILES,CID,IC50 (uM),FIRST_LABEL,ACTIVITY,DUPLICATE_COUNTS
0,0,COC(=O)CCNNC(=O)C=Cc1ccc(CNCCc2c(C)[nH]c3ccccc...,155525662,4.214,Active,0,1
1,1,CSc1ccc2c(c1)N(Cc1ccc(C(=O)NO)cc1)c1ccccc1S2,164629157,0.68,Active,1,1
2,2,O=C(NO)c1ccc(CN2c3ccccc3S(=O)c3ccc(C(F)(F)F)cc...,164627475,2.12,Active,0,1
3,3,CC(C)(C)OC(=O)Nc1ccc(-c2cc(NC(=O)CCCCCCC(=O)NO...,164627446,0.252,Active,1,1
4,4,CCCC[C@H](NC(=O)[C@H](CN)c1c(C)[nH]c2ccc(OC)cc...,164627330,2.00525,Active,0,4


**We will do this in the MACCS keys**

In [55]:
def maccs_fpts(data):
    Maccs_fpts = []
    for i in data:
        mol = Chem.MolFromSmiles(i)
        fpts = MACCSkeys.GenMACCSKeys(mol)
        mfpts = np.array(fpts)
        Maccs_fpts.append(mfpts)
    return np.array(Maccs_fpts)

In [56]:
smiles = dataset.SMILES
data = maccs_fpts(smiles)
data = pd.DataFrame(data=data)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
1,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0


In [77]:
labels = dataset['ACTIVITY']
labels = np.array(labels, dtype=int)
labels

array([0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,

### 3.1. Getting out-of-sample predicted probabilities

In [59]:
from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

In [62]:
# XGBoost(experimental) supports categorical data.
# Here we use default hyperparameters for simplicity.
# Get out-of-sample predicted probabilities and check model accuracy.
model = XGBClassifier(tree_method="hist", enable_categorical=True)

In [78]:
pred_probs = cross_val_predict(model, data, labels, method='predict_proba')
print(len(pred_probs))
print(pred_probs)

978
[[0.09393412 0.9060659 ]
 [0.9435409  0.05645914]
 [0.01030552 0.9896945 ]
 ...
 [0.9601505  0.03984953]
 [0.8757034  0.12429662]
 [0.9674798  0.03252017]]


In [79]:
model.fit(data, labels)
preds = model.predict(data)
len(preds)

978

In [81]:
accuracy_score(labels, preds) 

0.9703476482617587

### 3.2. Checking model accuracy on original data

Now that we have out-of-sample predicted probabilities, we can also check the model's (cross-val) accuracy on the original (noisy) data, so we'll have a baseline to compare our final results.

In [85]:
print(labels)

[0 1 0 1 0 1 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 1 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1
 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 0
 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0
 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 0 1 0 1 0 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 0 1 1 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 0 1 0 0 1 1 0 0 1 1 0 1 1 0 0 0
 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 0 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0
 1 0 1 1 0 1 1 0 1 0 1 0 1 0 0 1 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0
 0 0 1 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1
 0 1 1 1 1 1 0 1 0 0 1 1 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1 1
 1 1 0 0 0 1 1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 0 0 

In [86]:
preds = np.argmax(pred_probs, axis=1)
preds = np.array(preds, dtype=int)
print(preds)

[1 0 1 1 1 1 0 1 1 0 1 1 0 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 0 1 0 1 0 1 0 0 0
 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1
 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 0 0 1 1 0 1 0
 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1
 1 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 0 1 0 0 1 0 1
 0 0 0 1 1 1 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0
 1 0 0 0 0 0 0 1 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1
 0 0 0 0 1 0 1 1 1 1 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 1
 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1 0 1 1 1 1 0 0 1 1 1
 0 0 0 0 0 1 1 1 1 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 

In [87]:
acc_original = accuracy_score(preds, labels)
print(f"Accuracy with original data: {round(acc_original*100,1)}%")

Accuracy with original data: 63.5%


## 3.3. Finding label issues automatically

In [24]:
n_examples, n_classes = pred_probs.shape
print(n_examples, n_classes)

978 2


In [25]:
def compute_class_thresholds(pred_probs: np.ndarray, labels: np.ndarray) -> np.ndarray:
    n_examples, n_classes = pred_probs.shape
    thresholds = np.zeros(n_classes)
    for k in range(n_classes):
        count = 0
        p_sum = 0
        for i in range(n_examples):
            if labels[i] == k: #this explain the p^(y~=j;x,theta), the noisy label is equal class k
                count += 1
                p_sum += pred_probs[i, k]
        thresholds[k] = p_sum / count
    return thresholds

In [88]:
# should be a numpy array of length 5
thresholds = compute_class_thresholds(pred_probs, labels)
thresholds

array([0.61931538, 0.61794919])

## 3.4. Constructing the confident joint

In [89]:
def compute_confident_joint(pred_probs: np.ndarray, labels: np.ndarray, thresholds: np.ndarray) -> np.ndarray:
    n_examples, n_classes = pred_probs.shape
    confident_joint = np.zeros((n_classes, n_classes), dtype=np.int64)
    for data_idx in range(n_examples):
        i = labels[data_idx]    #y_noise
        j = None                #y_true -> to find
        #Lưu ý điểm mình bị sai: vị trí của chúng không ứng với label
        p_j = -1
        for candidate_j in range(n_classes):
            p = pred_probs[data_idx, candidate_j]
            if p >= thresholds[candidate_j] and p > p_j:
                j = candidate_j
                p_j = p
        if j is not None:
            confident_joint[i][j] += 1
    return confident_joint

In [90]:
C = compute_confident_joint(pred_probs, labels.to_numpy(), thresholds)
C

AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'