In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn.preprocessing import LabelEncoder

from confident_learning import MyConfidentLearning
from sklearn.model_selection import train_test_split

# Import data

In [2]:
train_dataset = pd.read_excel("../../data/train_test_data/NoCL/20240207_pan_HDAC_train_test_data.xlsx", sheet_name="train_dataset")
test_dataset = pd.read_excel("../../data/train_test_data/NoCL/20240207_pan_HDAC_train_test_data.xlsx", sheet_name='test_dataset')
validation_dataset = pd.read_excel("../../data/train_test_data/NoCL/20240207_pan_HDAC_train_test_data.xlsx", sheet_name='validation_dataset')

In [3]:
print(train_dataset.shape, test_dataset.shape, validation_dataset.shape)

(1115, 6) (239, 6) (240, 6)


In [4]:
train_dataset.head()

Unnamed: 0,MOL_ID,SMILES,IC50,IC50 of reference (vorinostat),Bioactivity,Ref_DOI
0,1162325,ONC(c1cnc(N(C[C@H]23)C[C@@H]2[C@H]3Nc2nc3ccccc...,490.4,60.2,inactive,10.1016/j.ejmech.2021.113799
1,136030779,C1=CC=C(C=C1)CN2C=C(N=N2)C3=CC(=CC=C3)C(=O)NO,58.0,107.0,active,10.1021/jm101605z
2,71520630,CC(C)C1=CC=C(C=C1)C(=O)NOCCCCCC(=O)NO,75.1,50.1,inactive,10.1021/acs.jmedchem.1c00821
3,603695,COc1cc2ncnc(Nc3cc(C#C)ccc3)c2cc1OCCCCCC(NO)=O,15.0,83.0,active,10.1021/jm900125m
4,11723098,C1=CC=C(C=C1)CCN2C=CC(=N2)C3=CC=C(S3)C(=O)NO,5000.0,120.0,inactive,10.1016/s0960-894x(02)00622-4


In [5]:
#MACCS
from tqdm import tqdm

def maccs_fpts(data):
    Maccs_fpts = []
    count = 0
    with tqdm(total=len(data), desc='Progress') as pbar:
        for i in data:
            try:
                mol = Chem.MolFromSmiles(i)
                fpts = MACCSkeys.GenMACCSKeys(mol)
            except:
                print("An exception occurred with " + str(count))
                continue
            mfpts = np.array(fpts)
            Maccs_fpts.append(mfpts)
            count += 1
            pbar.update(1)  # Update the progress bar
    return np.array(Maccs_fpts)

#maccs
def morgan_fpts(data):
    Morgan_fpts = []
    count = 0
    with tqdm(total=len(data), desc='Progress') as pbar:
        for i in data:
            try:
                mol = Chem.MolFromSmiles(i)
                fpts = AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024)
            except:
                print("An exception occurred with " + str(count))
                continue
            mfpts = np.array(fpts)
            Morgan_fpts.append(mfpts)
            count += 1
            pbar.update(1)  # Update the progress bar
    return np.array(Morgan_fpts)

In [6]:
#X data
X_Train = morgan_fpts(train_dataset['SMILES'])
X_Test = morgan_fpts(test_dataset['SMILES'])
X_Validation = morgan_fpts(validation_dataset['SMILES'])

Progress:   0%|          | 0/1115 [00:00<?, ?it/s][16:34:15] Conflicting single bond directions around double bond at index 26.
[16:34:15]   BondStereo set to STEREONONE and single bond directions set to NONE.
[16:34:15] Conflicting single bond directions around double bond at index 17.
[16:34:15]   BondStereo set to STEREONONE and single bond directions set to NONE.
[16:34:15] Conflicting single bond directions around double bond at index 26.
[16:34:15]   BondStereo set to STEREONONE and single bond directions set to NONE.
[16:34:15] Conflicting single bond directions around double bond at index 16.
[16:34:15]   BondStereo set to STEREONONE and single bond directions set to NONE.
[16:34:15] Conflicting single bond directions around double bond at index 25.
[16:34:15]   BondStereo set to STEREONONE and single bond directions set to NONE.
[16:34:15] Conflicting single bond directions around double bond at index 18.
[16:34:15]   BondStereo set to STEREONONE and single bond directions set

In [7]:
print(X_Train.shape, X_Test.shape, X_Validation.shape)

(1115, 1024) (239, 1024) (240, 1024)


In [8]:
#y data
y_Train = np.array(train_dataset['Bioactivity'])
y_Test = np.array(test_dataset['Bioactivity'])
y_Validation = np.array(validation_dataset['Bioactivity'])

#Original data
print("Original data:")
print(y_Train[0:5])
print(y_Test[0:5])
print(y_Validation[0:5])

#One-hot encoder
label_encoder = LabelEncoder()
y_Train = label_encoder.fit_transform(y_Train)
y_Test = label_encoder.transform(y_Test)
y_Validation = label_encoder.transform(y_Validation)
#Class encoded
print("Class encoded:")
print(list(label_encoder.classes_))
print(label_encoder.transform(label_encoder.classes_))
print("Encoded data:")
print(y_Train[0:5])
print(y_Test[0:5])
print(y_Validation[0:5])

Original data:
['inactive' 'active' 'inactive' 'active' 'inactive']
['inactive' 'active' 'inactive' 'inactive' 'inactive']
['active' 'inactive' 'active' 'inactive' 'inactive']
Class encoded:
['active', 'inactive']
[0 1]
Encoded data:
[1 0 1 0 1]
[1 0 1 1 1]
[0 1 0 1 1]


# Confident learning to remove label errors

## Check the models's accuracy with the data

In [9]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
model = XGBClassifier(tree_method="gpu_hist", enable_categorical=True)
original_train_acc = cross_val_score(model, X_Train, y_Train, scoring='accuracy', cv=5)
original_validation_acc = cross_val_score(model, X_Validation, y_Validation, scoring='accuracy', cv=5)
original_test_acc = cross_val_score(model, X_Test, y_Test, scoring='accuracy', cv=5)

In [10]:
print(np.mean(original_train_acc), np.mean(original_validation_acc), np.mean(original_test_acc))

0.5551569506726457 0.5583333333333333 0.5023049645390072


## Approach 1 - Method 2: Estimate the label errors

### Find and pruning label errors using vanilla CL

In [11]:
train_cl = MyConfidentLearning(X=X_Train, y=y_Train)
train_pred_probs = train_cl.get_out_of_sample_proba()

validation_cl = MyConfidentLearning(X=X_Validation, y=y_Validation)
validation_pred_probs = validation_cl.get_out_of_sample_proba()

test_cl = MyConfidentLearning(X=X_Test, y=y_Test)
test_pred_probs = test_cl.get_out_of_sample_proba()

[+] Getting out of sample probality
[-] Finished getting out of sample probality with shape: (1115, 2)
[+] Getting out of sample probality
[-] Finished getting out of sample probality with shape: (240, 2)
[+] Getting out of sample probality
[-] Finished getting out of sample probality with shape: (239, 2)


__Class threshold__

In [12]:
# should be a numpy array of length 5
train_thresholds = train_cl.compute_class_thresholds()
validation_thresholds = validation_cl.compute_class_thresholds()
test_thresholds = test_cl.compute_class_thresholds()

[+] Computing thresholds
[-] Finished compute thresholds: [0.35634137 0.65191514]
[+] Computing thresholds
[-] Finished compute thresholds: [0.40895423 0.6395467 ]
[+] Computing thresholds
[-] Finished compute thresholds: [0.30361629 0.62587541]


__Confident joint__

In [13]:
C_train = train_cl.compute_confident_joint()
C_validation = validation_cl.compute_confident_joint()
C_test = test_cl.compute_confident_joint()

[+] Computing confident joint
[-] Finished compute confident joint:
[[179 223]
 [288 418]]
[+] Computing confident joint
[-] Finished compute confident joint:
[[42 49]
 [54 86]]
[+] Computing confident joint
[-] Finished compute confident joint:
[[33 49]
 [67 90]]


__Estimate label errors__

In [14]:
def caculate_noise_rate(C, no_of_samples):
    num_label_issues = np.sum(C - np.diag(np.diag(C)))
    print(f"Number of label issues: {num_label_issues}")
    print('Estimated noise rate: {:.1f}%'.format(100*num_label_issues / no_of_samples))

In [15]:
print("Train dataset:")
caculate_noise_rate(C_train, train_cl.X.shape[0])
print("Validation dataset:")
caculate_noise_rate(C_validation, validation_cl.X.shape[0])
print("Test dataset")
caculate_noise_rate(C_test, test_cl.X.shape[0])

Train dataset:
Number of label issues: 511
Estimated noise rate: 45.8%
Validation dataset:
Number of label issues: 103
Estimated noise rate: 42.9%
Test dataset
Number of label issues: 116
Estimated noise rate: 48.5%


__Pruning label issues__

In [17]:
train_issue_indices = train_cl.find_label_issues()
validation_issue_indices = validation_cl.find_label_issues()
test_issue_indices = test_cl.find_label_issues()

[+] Finding labels issue indeces:
Issue indices: 511
[+] Finding labels issue indeces:
Issue indices: 103
[+] Finding labels issue indeces:
Issue indices: 116


In [18]:
# Remove the label errors found by Confident Learning
clean_X_train = np.delete(X_Train, train_issue_indices, axis=0) 
clean_y_train = np.delete(y_Train, train_issue_indices)
clean_train_pred_probs = np.delete(train_cl.pred_probs, train_issue_indices, axis=0)

clean_X_validation = np.delete(X_Validation, validation_issue_indices, axis=0) 
clean_y_validation = np.delete(y_Validation, validation_issue_indices)
clean_validation_pred_probs = np.delete(validation_cl.pred_probs, validation_issue_indices, axis=0)

clean_X_test = np.delete(X_Test, test_issue_indices, axis=0) 
clean_y_test = np.delete(y_Test, test_issue_indices)
clean_test_pred_probs = np.delete(test_cl.pred_probs, test_issue_indices, axis=0)

In [19]:
clean_train = train_dataset.drop(train_issue_indices)
clean_val = validation_dataset.drop(validation_issue_indices)
clean_test = test_dataset.drop(test_issue_indices)

__Check Confident joint again__

In [20]:
def compute_confident_joint(pred_probs: np.ndarray, thresholds: np.ndarray, labels: np.ndarray) -> np.ndarray:
    print("[+] Computing confident joint")
    n_examples, n_classes = pred_probs.shape
    confident_joint = np.zeros((n_classes, n_classes), dtype=np.int64)
    for data_idx in range(n_examples):
        i = labels[data_idx]    #y_noise
        j = None                #y_true -> to find
        #Lưu ý điểm mình bị sai: vị trí của chúng không ứng với label
        p_j = -1
        for candidate_j in range(n_classes):
            p = pred_probs[data_idx, candidate_j]
            if p >= thresholds[candidate_j] and p > p_j:
                j = candidate_j
                p_j = p
        if j is not None:
            confident_joint[i][j] += 1
    print("[-] Finished compute confident joint:")
    print(confident_joint)
    return confident_joint

In [21]:
print("Training dataset:")
clean_C_train = compute_confident_joint(pred_probs=clean_train_pred_probs, thresholds=train_thresholds, labels=clean_y_train)

print("Validation dataset")
clean_C_validation = compute_confident_joint(pred_probs=clean_validation_pred_probs, thresholds=validation_thresholds, labels=clean_y_validation)

print("Test dataset")
clean_C_test = compute_confident_joint(pred_probs=clean_test_pred_probs, thresholds=test_thresholds, labels=clean_y_test)

Training dataset:
[+] Computing confident joint
[-] Finished compute confident joint:
[[179   0]
 [  0 418]]
Validation dataset
[+] Computing confident joint
[-] Finished compute confident joint:
[[42  0]
 [ 0 86]]
Test dataset
[+] Computing confident joint
[-] Finished compute confident joint:
[[33  0]
 [ 0 90]]


### Comparing models with clean data

In [22]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
model = XGBClassifier(tree_method="gpu_hist", enable_categorical=True)
clean_train_acc = cross_val_score(model, clean_X_train, clean_y_train, scoring='accuracy', cv=5)
clean_validation_acc = cross_val_score(model, clean_X_validation, clean_y_validation, scoring='accuracy', cv=5)
clean_test_acc = cross_val_score(model, clean_X_test, clean_y_test, scoring='accuracy', cv=5)

print(np.mean(clean_train_acc), np.mean(clean_validation_acc), np.mean(clean_test_acc))

0.8824380165289256 0.8687830687830687 0.8220000000000001


In [23]:
from tabulate import tabulate

def show_activity_distribution(dataset):
    #Rows for specific labels
    active_rows = dataset.loc[dataset["Bioactivity"] == "active"]
    inactive_rows = dataset.loc[dataset["Bioactivity"] == "inactive"]
    dataset_length = len(dataset)
    print(f"Total dataset: {dataset_length}")
    table = [['', 'Active', 'Inactive'], 
            ['Number', len(active_rows), len(inactive_rows)],
            ['Percentage (%)', len(active_rows)/dataset_length, len(inactive_rows)/dataset_length]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [24]:
show_activity_distribution(clean_train)
show_activity_distribution(clean_test)
show_activity_distribution(clean_val)

Total dataset: 604
╒════════════════╤════════════╤════════════╕
│                │     Active │   Inactive │
╞════════════════╪════════════╪════════════╡
│ Number         │ 182        │ 422        │
├────────────────┼────────────┼────────────┤
│ Percentage (%) │   0.301325 │   0.698675 │
╘════════════════╧════════════╧════════════╛
Total dataset: 123
╒════════════════╤═══════════╤════════════╕
│                │    Active │   Inactive │
╞════════════════╪═══════════╪════════════╡
│ Number         │ 33        │  90        │
├────────────────┼───────────┼────────────┤
│ Percentage (%) │  0.268293 │   0.731707 │
╘════════════════╧═══════════╧════════════╛
Total dataset: 137
╒════════════════╤═══════════╤════════════╕
│                │    Active │   Inactive │
╞════════════════╪═══════════╪════════════╡
│ Number         │ 46        │  91        │
├────────────────┼───────────┼────────────┤
│ Percentage (%) │  0.335766 │   0.664234 │
╘════════════════╧═══════════╧════════════╛


### Write to file

In [None]:
with pd.ExcelWriter("../../data/train_test_data/CL_then_balance/20240216_clean_data_approach1_method2.xlsx") as writer:
    # use to_excel function and specify the sheet_name and index 
    # to store the dataframe in specified sheet
    clean_train.to_excel(writer, sheet_name="train_dataset", index=False)
    clean_val.to_excel(writer, sheet_name="validation_dataset", index=False)
    clean_test.to_excel(writer, sheet_name="test_dataset", index=False)

## Use cleanlab to clean the data

### Use clean lab to find and pruning label errors

In [None]:
import cleanlab
from cleanlab.classification import CleanLearning
from cleanlab.benchmarking import noise_generation

In [None]:
cl_train = cleanlab.classification.CleanLearning(clf=XGBClassifier(), seed=42)
_ = cl_train.fit(X_Train, y_Train)

cl_test = cleanlab.classification.CleanLearning(clf=XGBClassifier(), seed=42)
_ = cl_test.fit(X_Test, y_Test)

cl_val = cleanlab.classification.CleanLearning(clf=XGBClassifier(), seed=42)
_ = cl_val.fit(X_Validation, y_Validation)

In [None]:
train_label_issues_df = cl_train.get_label_issues()
print(len(train_label_issues_df))
train_label_issues_df.head()

In [None]:
# Getting issue labels
train_label_issue = np.array(cl_train.get_label_issues()["is_label_issue"].values)
test_label_issue = np.array(cl_test.get_label_issues()["is_label_issue"].values)
validation_label_issue = np.array(cl_val.get_label_issues()["is_label_issue"].values)
#Get the issue index
train_issue_idx = (train_label_issue > 0).nonzero()[0]
test_issue_idx = (test_label_issue > 0).nonzero()[0]
valiadation_issue_idx = (validation_label_issue > 0).nonzero()[0]

In [None]:
# Pruning label errors
clean_X_train = np.delete(X_Train, train_issue_idx, axis=0) 
clean_y_train = np.delete(y_Train, train_issue_idx)

clean_X_validation = np.delete(X_Validation, valiadation_issue_idx, axis=0) 
clean_y_validation = np.delete(y_Validation, valiadation_issue_idx)

clean_X_test = np.delete(X_Test, test_issue_idx, axis=0) 
clean_y_test = np.delete(y_Test, test_issue_idx)

In [None]:
#Pruning label errors
clean_train = train_dataset.drop(train_issue_idx)
clean_val = validation_dataset.drop(valiadation_issue_idx)
clean_test = test_dataset.drop(test_issue_idx)

### Check the models' accuracy on clean data

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
model = XGBClassifier(tree_method="gpu_hist", enable_categorical=True)
clean_train_acc = cross_val_score(model, clean_X_train, clean_y_train, scoring='accuracy', cv=5)
clean_validation_acc = cross_val_score(model, clean_X_validation, clean_y_validation, scoring='accuracy', cv=5)
clean_test_acc = cross_val_score(model, clean_X_test, clean_y_test, scoring='accuracy', cv=5)
print(np.mean(clean_train_acc), np.mean(clean_validation_acc), np.mean(clean_test_acc))

In [None]:
from tabulate import tabulate

def show_activity_distribution(dataset):
    #Rows for specific labels
    active_rows = dataset.loc[dataset["Bioactivity"] == "active"]
    inactive_rows = dataset.loc[dataset["Bioactivity"] == "inactive"]
    dataset_length = len(dataset)
    print(f"Total dataset: {dataset_length}")
    table = [['', 'Active', 'Inactive'], 
            ['Number', len(active_rows), len(inactive_rows)],
            ['Percentage (%)', len(active_rows)/dataset_length, len(inactive_rows)/dataset_length]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [None]:
show_activity_distribution(clean_train)
show_activity_distribution(clean_test)
show_activity_distribution(clean_val)

### Write to file

In [None]:
with pd.ExcelWriter("../../data/train_test_data/CL/20240207_clean_data_with_cleanlab.xlsx") as writer:
    # use to_excel function and specify the sheet_name and index 
    # to store the dataframe in specified sheet
    clean_train.to_excel(writer, sheet_name="train_dataset", index=False)
    clean_val.to_excel(writer, sheet_name="validation_dataset", index=False)
    clean_test.to_excel(writer, sheet_name="test_dataset", index=False)