In [1]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate

# Read the data

In [2]:
data_version_1_path = "../data_for_modeling/raw_data/v1/Original Data - v1 - Merge.xlsx"
dataset = pd.read_excel(data_version_1_path, sheet_name='original_data')
dataset_length = len(dataset)
dataset_length

993

## For training with first label

In [3]:
# test_dataset_path = "../data_for_modeling/filter_data/v1/old_data/HDAC2_test_unspec_removed.csv"
# train_dataset_path = "../data_for_modeling/filter_data/v1/old_data/HDAC2_train_unspec_removed.csv"
# test_dataset = pd.read_csv(test_dataset_path)
# train_dataset = pd.read_csv(train_dataset_path)
# dataset = pd.concat([train_dataset, test_dataset], ignore_index=True)

In [4]:
dataset.head()

Unnamed: 0,STT,SMILES,CID,IC50 (uM),FIRST_LABEL,ACTIVITY,DUPLICATE_COUNTS
0,0,CSC1=CC2=C(C=C1)SC3=CC=CC=C3N2CC4=CC=C(C=C4)C(...,164629157,0.68,Active,Active,1
1,1,CC1=C(C2=CC=CC=C2N1)CCNCC3=CC=C(C=C3)C=CC(=O)N...,155525662,4.214,Active,Inactive,1
2,2,C1=CC=C2C(=C1)N(C3=C(S2=O)C=CC(=C3)C(F)(F)F)CC...,164627475,2.12,Active,Inactive,1
3,3,CC(C)(C)OC(=O)NC1=CC=C(C=C1)C2=CC(=NO2)NC(=O)C...,164627446,0.252,Active,Active,1
4,4,CCCC[C@@H](C1=NC=C(N1)C2=CC3=CC=CC=C3N=C2OC)NC...,164627330,2.00525,Active,Inactive,4


# 1. Profile of the data

## 1.1. Group by original activity

In [3]:
def check_activity_distribution(dataset, col_name):
    active_rows = dataset.loc[dataset[col_name] == "Active"]
    inactive_rows = dataset.loc[dataset[col_name] == "Inactive"]
    inconclusive_rows = dataset.loc[dataset[col_name] == "Inconclusive"]
    unspecified_rows = dataset.loc[dataset[col_name] == "Unspecified"]

    dataset_length = len(dataset)

    print("Total dataset")
    table = [['', 'Active', 'Inactive', 'Inconclusive', 'Unspecified'], 
            ['Number', len(active_rows), len(inactive_rows), len(inconclusive_rows), len(unspecified_rows)],
            ['Percentage (%)', len(active_rows)/dataset_length*100, len(inactive_rows)/dataset_length*100,
            len(inconclusive_rows)/dataset_length*100, len(unspecified_rows)/dataset_length*100]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [4]:
check_activity_distribution(dataset=dataset, col_name='FIRST_LABEL')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 769      │   10       │              0 │      214      │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  77.4421 │    1.00705 │              0 │       21.5509 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


## 1.2. Group by the new activity

In [5]:
check_activity_distribution(dataset=dataset, col_name='ACTIVITY')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 501      │   492      │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  50.4532 │    49.5468 │              0 │             0 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


# 2. Filtering the data

## 2.1. Cannonical Smiles and remove duplicates

In [9]:
def make_canonical_smiles(smiles):
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]
    smiles = [Chem.MolToSmiles(mol) for mol in mols]
    return smiles

In [10]:
cannon_smiles = make_canonical_smiles(dataset.SMILES)
dataset['SMILES'] = cannon_smiles
duplicates_smiles = dataset[dataset['SMILES'].duplicated()]['SMILES'].values
duplicate_index = dataset[dataset['SMILES'].duplicated()]['SMILES'].index
duplicates_smiles

array(['CC(Nc1ccccc1)c1ccc(C(=O)Nc2ccccc2N)cc1',
       'CC(Nc1ccccc1)c1ccc(C(=O)Nc2ccccc2N)cc1',
       'O=C1CCC=CCCC(=O)N(Cc2cccc(C=NNC(=O)c3cccc(C(=O)NO)c3)c2)CC(c2ccccc2)O1',
       'CC(C)C1NC(=O)C2(C)CSC(=N2)c2csc(n2)CNC(=O)CC(C(F)=CCCS)OC1=O',
       'O=C(CCC(CCCC(=O)Nc1ccccc1)Cc1ccccc1)NO',
       'O=C(CCC(CCCC(=O)Nc1ccccc1)Cc1ccccc1)NO',
       'CCCCCCC(CCCCCC(=O)Nc1ccccc1)C(=O)NO',
       'COc1ccc(COC(CCCCCC(=O)NO)C(=O)Nc2ccccc2)cc1',
       'COc1ccc(COC(CCCCCC(=O)NO)C(=O)Nc2ccccc2)cc1'], dtype=object)

In [11]:
dataset.loc[270]

STT                                                    270
SMILES              CC(Nc1ccccc1)c1ccc(C(=O)Nc2ccccc2N)cc1
CID                                              155555330
IC50 (uM)                                              4.2
FIRST_LABEL                                         Active
ACTIVITY                                          Inactive
DUPLICATE_COUNTS                                         1
Name: 270, dtype: object

In [12]:
duplicate_index

Int64Index([252, 270, 290, 706, 912, 913, 917, 975, 977], dtype='int64')

In [13]:
dup_smiles_idxs = []
for smiles in duplicates_smiles:
    sub_dataset_dup_smiles = dataset[dataset.SMILES == smiles].copy()
    for idx, _ in sub_dataset_dup_smiles.iterrows():
        dup_smiles_idxs.append(idx)
print(dup_smiles_idxs)
print(len(dup_smiles_idxs))

[249, 252, 270, 249, 252, 270, 260, 290, 704, 706, 910, 912, 913, 910, 912, 913, 915, 917, 974, 975, 977, 974, 975, 977]
24


In [14]:
dataset = dataset.drop(dup_smiles_idxs)

In [15]:
#check again
duplicates_smiles = dataset[dataset['SMILES'].duplicated()]['SMILES'].values
duplicates_smiles

array([], dtype=object)

In [16]:
len(dataset)

978

## 2.2. Check for label intersection

In [17]:
def check_label_intersection(dataset, col_name):
      active_rows = dataset.loc[dataset[col_name] == "Active"]
      inactive_rows = dataset.loc[dataset[col_name] == "Inactive"]
      inconclusive_rows = dataset.loc[dataset[col_name] == "Inconclusive"]
      unspecified_rows = dataset.loc[dataset[col_name] == "Unspecified"]
      
      cid_active = active_rows.loc[:, 'CID']
      cid_inactive = inactive_rows.loc[:, 'CID']
      cid_incon = inconclusive_rows.loc[:, 'CID']
      cid_unspec = unspecified_rows.loc[:, 'CID']

      ac_inac_cid = np.intersect1d(cid_active, cid_inactive)
      ac_incon_cid = np.intersect1d(cid_active, cid_incon)
      ac_unspec_cid = np.intersect1d(cid_active, cid_unspec)

      inac_incon_cid = np.intersect1d(cid_inactive, cid_incon)
      incon_unspec_cid = np.intersect1d(cid_incon, cid_unspec)
      inac_unspec_cid = np.intersect1d(cid_inactive, cid_unspec)
      print(len(ac_inac_cid), 
            len(ac_incon_cid), 
            len(ac_unspec_cid), 
            len(inac_incon_cid), 
            len(incon_unspec_cid), 
            len(inac_unspec_cid))

In [18]:
check_label_intersection(dataset=dataset, col_name='ACTIVITY')

0 0 0 0 0 0


In [19]:
check_label_intersection(dataset=dataset, col_name='FIRST_LABEL')

0 0 0 0 0 0


In [20]:
dataset = dataset.reset_index(drop=True)
dataset.head()

Unnamed: 0,STT,SMILES,CID,IC50 (uM),FIRST_LABEL,ACTIVITY,DUPLICATE_COUNTS
0,0,CSc1ccc2c(c1)N(Cc1ccc(C(=O)NO)cc1)c1ccccc1S2,164629157,0.68,Active,Active,1
1,1,COC(=O)CCNNC(=O)C=Cc1ccc(CNCCc2c(C)[nH]c3ccccc...,155525662,4.214,Active,Inactive,1
2,2,O=C(NO)c1ccc(CN2c3ccccc3S(=O)c3ccc(C(F)(F)F)cc...,164627475,2.12,Active,Inactive,1
3,3,CC(C)(C)OC(=O)Nc1ccc(-c2cc(NC(=O)CCCCCCC(=O)NO...,164627446,0.252,Active,Active,1
4,4,CCCC[C@H](NC(=O)[C@H](CN)c1c(C)[nH]c2ccc(OC)cc...,164627330,2.00525,Active,Inactive,4


In [21]:
check_activity_distribution(dataset=dataset, col_name='FIRST_LABEL')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 762      │   10       │              0 │      206      │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  77.9141 │    1.02249 │              0 │       21.0634 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


In [22]:
check_activity_distribution(dataset=dataset, col_name='ACTIVITY')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 499      │   479      │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  51.0225 │    48.9775 │              0 │             0 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


In [23]:
# dataset.to_excel("../output/other/Original_dataset_after_remove_dup_smiles.xlsx")

## 2.3. Remove the unspecified rows

## 2.4. Write data to file before running label errors

In [24]:
len(dataset)

978

In [25]:
# dataset.to_excel("../data_for_modeling/filter_data/v1/before_clean_data/HDAC2_before_labels_errors.xlsx")

## 3. Labels errors

### 3.0. Encoding labels

In [26]:
from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

#Encoding labels
dataset_c = dataset.copy()
# Transform letter grades and notes to categorical numbers.
# Necessary for XGBoost.
dataset['ACTIVITY'] = preprocessing.LabelEncoder().fit_transform(dataset['ACTIVITY'])
dataset.head()

Unnamed: 0,STT,SMILES,CID,IC50 (uM),FIRST_LABEL,ACTIVITY,DUPLICATE_COUNTS
0,0,CSc1ccc2c(c1)N(Cc1ccc(C(=O)NO)cc1)c1ccccc1S2,164629157,0.68,Active,0,1
1,1,COC(=O)CCNNC(=O)C=Cc1ccc(CNCCc2c(C)[nH]c3ccccc...,155525662,4.214,Active,1,1
2,2,O=C(NO)c1ccc(CN2c3ccccc3S(=O)c3ccc(C(F)(F)F)cc...,164627475,2.12,Active,1,1
3,3,CC(C)(C)OC(=O)Nc1ccc(-c2cc(NC(=O)CCCCCCC(=O)NO...,164627446,0.252,Active,0,1
4,4,CCCC[C@H](NC(=O)[C@H](CN)c1c(C)[nH]c2ccc(OC)cc...,164627330,2.00525,Active,1,4


In [27]:
dataset_c.head()

Unnamed: 0,STT,SMILES,CID,IC50 (uM),FIRST_LABEL,ACTIVITY,DUPLICATE_COUNTS
0,0,CSc1ccc2c(c1)N(Cc1ccc(C(=O)NO)cc1)c1ccccc1S2,164629157,0.68,Active,Active,1
1,1,COC(=O)CCNNC(=O)C=Cc1ccc(CNCCc2c(C)[nH]c3ccccc...,155525662,4.214,Active,Inactive,1
2,2,O=C(NO)c1ccc(CN2c3ccccc3S(=O)c3ccc(C(F)(F)F)cc...,164627475,2.12,Active,Inactive,1
3,3,CC(C)(C)OC(=O)Nc1ccc(-c2cc(NC(=O)CCCCCCC(=O)NO...,164627446,0.252,Active,Active,1
4,4,CCCC[C@H](NC(=O)[C@H](CN)c1c(C)[nH]c2ccc(OC)cc...,164627330,2.00525,Active,Inactive,4


**We will do this in the MACCS keys**

In [28]:
def maccs_fpts(data):
    Maccs_fpts = []
    for i in data:
        mol = Chem.MolFromSmiles(i)
        fpts = MACCSkeys.GenMACCSKeys(mol)
        mfpts = np.array(fpts)
        Maccs_fpts.append(mfpts)
    return np.array(Maccs_fpts)

In [29]:
smiles = dataset.SMILES
data = maccs_fpts(smiles)
data = pd.DataFrame(data=data)
labels = dataset['ACTIVITY']
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
2,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0


### 3.1. Getting out-of-sample predicted probabilities

In [30]:
model = XGBClassifier(tree_method="hist", enable_categorical=True)
pred_probs = cross_val_predict(model, data, labels, method='predict_proba')
print(len(pred_probs))
print(pred_probs)

978
[[0.05645907 0.94354093]
 [0.9060659  0.0939341 ]
 [0.9896945  0.01030552]
 ...
 [0.00843608 0.9915639 ]
 [0.06708306 0.93291694]
 [0.0323478  0.9676522 ]]


### 3.2. Checking model accuracy on original data

Now that we have out-of-sample predicted probabilities, we can also check the model's (cross-val) accuracy on the original (noisy) data, so we'll have a baseline to compare our final results.

In [31]:
preds = np.argmax(pred_probs, axis=1)
acc_original = accuracy_score(preds, labels)
print(f"Accuracy with original data: {round(acc_original*100,1)}%")

Accuracy with original data: 63.0%


In [32]:
model.fit(data, labels)
preds_by_predict = model.predict(data)
acc_pred_by_predict = accuracy_score(preds_by_predict, labels)
print(f"Accuracy with original data: {round(acc_pred_by_predict*100,1)}%")

Accuracy with original data: 97.0%


In [33]:
# k = 158
# print("Label: " + str(labels[k]))
# print("Pred by proba: " + str(preds[k]))
# print("Preds by predict: " + str(preds_by_predict[k]))
# print("Pred probs: "+str(pred_probs[k]))

In [34]:
# print_pred_probs = pd.DataFrame(pred_probs, columns=[0, 1])
# print_pred_probs['labels'] = labels
# print_pred_probs['Prediction by predict()'] = preds_by_predict
# print_pred_probs['Prediction by predict_proba()'] = preds
# print_pred_probs.to_excel("../data_for_modeling/filter_data/v1/before_clean_data/Compare_between_preds.xlsx", index=True)

### 3.3. Finding the class threshold

In [35]:
def compute_class_thresholds(pred_probs: np.ndarray, labels: np.ndarray) -> np.ndarray:
    n_examples, n_classes = pred_probs.shape
    thresholds = np.zeros(n_classes)
    for k in range(n_classes):
        count = 0
        p_sum = 0
        for i in range(n_examples):
            if labels[i] == k:
                count += 1
                p_sum += pred_probs[i, k]
        thresholds[k] = p_sum / count
    return thresholds

<b>Check the data and its label was right</b>

In [36]:
print(dataset_c.loc[230]['ACTIVITY'])
print("label: " + str(labels.to_numpy()[230]))

Active
label: 0


In [37]:
# should be a numpy array of length 5
thresholds = compute_class_thresholds(pred_probs, labels.to_numpy())
thresholds

array([0.61792817, 0.61778003])

### 3.4. Constructing the confident joint

In [38]:
def compute_confident_joint(pred_probs: np.ndarray, labels: np.ndarray, thresholds: np.ndarray) -> np.ndarray:
    n_examples, n_classes = pred_probs.shape
    confident_joint = np.zeros((n_classes, n_classes), dtype=np.int64)
    positions = np.array([[-1, -1]])
    for data_idx in range(n_examples):
        i = labels[data_idx]    #y_noise
        j = None                #y_true -> to find
        #Lưu ý điểm mình bị sai: vị trí của chúng không ứng với label
        p_j = -1
        for candidate_j in range(n_classes):
            p = pred_probs[data_idx, candidate_j]
            if p >= thresholds[candidate_j] and p > p_j:
                j = candidate_j
                p_j = p
        if j is not None:
            confident_joint[i][j] += 1
        positions = np.append(positions, np.array([[i, j]]), axis=0)
    return confident_joint, positions

In [39]:
C, positions = compute_confident_joint(pred_probs, labels.to_numpy(), thresholds)
print(C)
# print(positions)

[[290 165]
 [163 278]]
[[-1 -1]
 [0 1]
 [1 0]
 ...
 [1 1]
 [1 1]
 [1 1]]


In [40]:
positions = pd.DataFrame(positions, columns=["i", "j"]).to_excel("../data_for_modeling/filter_data/v1/before_clean_data/positions.xlsx")

### 3.5 Count the number of label issues

In [41]:
num_label_issues = C.sum() - C.trace()
num_label_issues

328

In [42]:
print('Estimated noise rate: {:.1f}%'.format(100*num_label_issues / pred_probs.shape[0]))

Estimated noise rate: 33.5%


### 3.6. Filter out label issues

In [43]:
pred_probs.shape

(978, 2)

In [44]:
self_confidences = []
for i in range(pred_probs.shape[0]):
    self_confidences.append(pred_probs[i, labels[i]])
self_confidences = np.array(self_confidences)

In [45]:
ranked_indices = np.argsort(self_confidences)
ranked_indices[0:5]

array([158,  45,  13, 159, 727])

In [46]:
issue_idx = ranked_indices[:num_label_issues]
print(len(issue_idx))
issue_idx[0]

328


158

In [47]:
print(np.where(issue_idx == 649))

(array([187]),)


In [48]:
print_issue_index = pd.DataFrame(issue_idx)
print_issue_index.to_csv("../output/other/issue_idx.csv")

In [49]:
dataset_c.iloc[ranked_indices[:5]]

Unnamed: 0,STT,SMILES,CID,IC50 (uM),FIRST_LABEL,ACTIVITY,DUPLICATE_COUNTS
158,158,CC1(C)C(=O)C(C#N)=C[C@]2(C)[C@H]3C(=O)C=C4[C@@...,162651274,4.83,Active,Inactive,1
45,45,CCC(=O)CCCCC[C@H](NC(=O)[C@H]1CC12CCN(CC)CC2)c...,162676857,1.553,Active,Inactive,1
13,13,O=C(O)CCCCCSc1nnc(-c2ccncc2)n1-c1ccccc1,164621264,0.64,Active,Active,1
159,159,CCC(=O)CCCCC[C@H](NC(=O)[C@H]1CC12CCN(C)CC2)c1...,162651205,1.336667,Active,Inactive,3
727,734,Nc1ccc(-c2cccs2)cc1NC(=O)c1ccc2nccnc2c1,11393847,0.2,Active,Active,1


# Train a More Robust Model

Now that we have the indices of potential label errors within our data, let's remove them from our data, retrain our model, and see what improvement we can gain.

Keep in mind that our baseline model from above, trained on the original data using the `noisy_letter_grade` as the prediction label, achieved a cross-validation accuracy of 67%.

Let's use a very simple method to handle these label errors and just drop them entirely from the data and retrain our exact same `XGBClassifier`. In a real-world application, a better approach might be to have humans review the issues and _correct_ the labels rather than dropping the data points.

In [50]:
# Remove the label errors found by Confident Learning
data_rm = dataset.drop(issue_idx)
# encoding the label
clean_labels = preprocessing.LabelEncoder().fit_transform(data_rm['ACTIVITY'])
# encoding the smiles
data = maccs_fpts(data_rm.SMILES)
data = pd.DataFrame(data=data)

In [51]:
len(data)

650

In [52]:
len(clean_labels)

650

In [53]:
# Train a more robust classifier with less erroneous data
model = XGBClassifier(tree_method="hist", enable_categorical=True)
clean_pred_probs = cross_val_predict(model, data, clean_labels, method='predict_proba')
clean_preds = np.argmax(clean_pred_probs, axis=1)

acc_clean = accuracy_score(clean_preds, clean_labels)
print(f"Accuracy with original data: {round(acc_original*100, 1)}%")
print(f"Accuracy with errors found by Confident Learning removed: {round(acc_clean*100, 1)}%")

# Compute reduction in error.
err = ((1-acc_original)-(1-acc_clean))/(1-acc_original)
print(f"Reduction in error: {round(err*100,1)}%")

Accuracy with original data: 63.0%
Accuracy with errors found by Confident Learning removed: 89.2%
Reduction in error: 70.9%


# 4. Using clean lab to find the label issues

In [54]:
import cleanlab
cl_issue_idx = cleanlab.filter.find_label_issues(labels, pred_probs, return_indices_ranked_by='self_confidence')

In [55]:
len(cl_issue_idx)

358

# 5. Output the new data after filter

In [56]:
clean_dataset = dataset_c.drop(issue_idx)
issue_dataset = dataset_c.loc[issue_idx]

In [57]:
issue_dataset

Unnamed: 0,STT,SMILES,CID,IC50 (uM),FIRST_LABEL,ACTIVITY,DUPLICATE_COUNTS
158,158,CC1(C)C(=O)C(C#N)=C[C@]2(C)[C@H]3C(=O)C=C4[C@@...,162651274,4.830000,Active,Inactive,1
45,45,CCC(=O)CCCCC[C@H](NC(=O)[C@H]1CC12CCN(CC)CC2)c...,162676857,1.553000,Active,Inactive,1
13,13,O=C(O)CCCCCSc1nnc(-c2ccncc2)n1-c1ccccc1,164621264,0.640000,Active,Active,1
159,159,CCC(=O)CCCCC[C@H](NC(=O)[C@H]1CC12CCN(C)CC2)c1...,162651205,1.336667,Active,Inactive,3
727,734,Nc1ccc(-c2cccs2)cc1NC(=O)c1ccc2nccnc2c1,11393847,0.200000,Active,Active,1
...,...,...,...,...,...,...,...
491,496,Cc1c2ccc(N(C)c3ccnc(Nc4ccc(C(=O)NCCCCCCC(=O)Nc...,141504714,5.000000,Active,Inactive,1
492,497,Cc1c2ccc(N(C)c3ccnc(Nc4ccc(C(=O)NCCCCCCCC(=O)N...,141504704,4.980000,Active,Inactive,2
946,958,COC(=O)N(O)CCCCCCC(=O)Nc1ccccc1,44359814,,Unspecified,Inactive,1
602,607,COc1ccc(S(=O)(=O)n2ccc3cc(C=CC(=O)NO)ccc32)cc1OC,53464638,0.002300,Active,Active,1


## 5.1. Check the profile of the clean data

In [58]:
check_activity_distribution(dataset=clean_dataset, col_name='FIRST_LABEL')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 469      │   10       │              0 │      171      │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  72.1538 │    1.53846 │              0 │       26.3077 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


In [59]:
check_activity_distribution(dataset=clean_dataset, col_name='ACTIVITY')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 334      │   316      │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  51.3846 │    48.6154 │              0 │             0 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


In [60]:
check_activity_distribution(dataset=issue_dataset, col_name='FIRST_LABEL')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 293      │          0 │              0 │       35      │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  89.3293 │          0 │              0 │       10.6707 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


In [61]:
check_activity_distribution(dataset=issue_dataset, col_name='ACTIVITY')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 165      │   163      │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  50.3049 │    49.6951 │              0 │             0 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


## 5.2. Write data to file

In [62]:
from openpyxl import Workbook
data_path = "../data_for_modeling/filter_data/v1/clean_data/"
writer = pd.ExcelWriter(data_path + 'HDAC2_clean_data.xlsx', engine='openpyxl')
dataset_c.to_excel(writer, sheet_name='all_data', index=False)
clean_dataset.to_excel(writer, sheet_name='clean_data', index=False)
issue_dataset.to_excel(writer, sheet_name='issue_data', index=False)
writer.close()

# 6. Create a test set and train set from this

## 6.1. Create clean data

In [73]:
train_test_data = clean_dataset.drop(['STT', 'FIRST_LABEL', 'DUPLICATE_COUNTS'], axis = 1)
len(train_test_data)

650

In [64]:
from sklearn.model_selection import train_test_split
random_state = 1
train_dataset, test_dataset = train_test_split(train_test_data, test_size=0.2, random_state=random_state)

In [65]:
check_activity_distribution(dataset=train_dataset, col_name='ACTIVITY')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 275      │   245      │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  52.8846 │    47.1154 │              0 │             0 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


In [66]:
check_activity_distribution(dataset=test_dataset, col_name='ACTIVITY')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │  59      │    71      │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  45.3846 │    54.6154 │              0 │             0 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


In [67]:
with pd.ExcelWriter(data_path+'HDAC2_train_test_clean_data.xlsx', engine='openpyxl') as writer:
    train_dataset.to_excel(writer, sheet_name='train_dataset', index=False)
    test_dataset.to_excel(writer, sheet_name='test_dataset', index=False)

## 6.2. Create unclean data

In [68]:
len(dataset_c)

978

In [69]:
unclean_train_test_data = dataset_c.drop(['STT', 'FIRST_LABEL', 'DUPLICATE_COUNTS'], axis = 1)
random_state = 1
unclean_train_dataset, unclean_test_dataset = train_test_split(unclean_train_test_data, test_size=0.2, random_state=random_state)

In [70]:
check_activity_distribution(unclean_train_dataset, 'ACTIVITY')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 408      │   374      │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  52.1739 │    47.8261 │              0 │             0 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


In [71]:
check_activity_distribution(unclean_test_dataset, 'ACTIVITY')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │  91      │   105      │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  46.4286 │    53.5714 │              0 │             0 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


In [72]:
with pd.ExcelWriter(data_path+'HDAC2_train_test_unclean_data.xlsx', engine='openpyxl') as writer:
    unclean_train_dataset.to_excel(writer, sheet_name='train_dataset', index=False)
    unclean_test_dataset.to_excel(writer, sheet_name='test_dataset', index=False)