In [1]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate

# Read the data

In [2]:
all_data_path = "../../data_for_modeling/filter_data/all_data/preprocessing_data/HDAC2_all_data_filtered.xlsx"
# version1_data_path = "../../data_for_modeling/raw_data/v1/HDAC2_original_data_v1.xlsx"
dataset = pd.read_excel(all_data_path, sheet_name='filter_data')
dataset_length = len(dataset)
dataset_length

2086

In [3]:
dataset.head()

Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
0,2746,CC(=O)NC1=CC=C(C=C1)C(=O)NC2=CC=CC=C2N,2.2325,Active,Inactive,12
1,3812,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCCCC(=O)NO,1.306667,Active,Inactive,3
2,3994,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCCC(=O)NO,1.256667,Active,Inactive,3
3,4261,C1=CC=C(C(=C1)N)NC(=O)C2=CC=C(C=C2)CNC(=O)OCC3...,1.526316,Active,Inactive,19
4,5173,C(CCCC(=O)NO)CCC(=O)NO,8.23,Active,Inactive,1


# 1. Profile of the data

## 1.1. Group by original activity

In [4]:
def check_activity_distribution(dataset, col_name):
    active_rows = dataset.loc[dataset[col_name] == "Active"]
    inactive_rows = dataset.loc[dataset[col_name] == "Inactive"]
    inconclusive_rows = dataset.loc[dataset[col_name] == "Inconclusive"]
    unspecified_rows = dataset.loc[dataset[col_name] == "Unspecified"]

    dataset_length = len(dataset)

    print("Total dataset")
    table = [['', 'Active', 'Inactive', 'Inconclusive', 'Unspecified'], 
            ['Number', len(active_rows), len(inactive_rows), len(inconclusive_rows), len(unspecified_rows)],
            ['Percentage (%)', len(active_rows)/dataset_length*100, len(inactive_rows)/dataset_length*100,
            len(inconclusive_rows)/dataset_length*100, len(unspecified_rows)/dataset_length*100]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

__Train on the final label, not the first label__

In [5]:
check_activity_distribution(dataset=dataset, col_name='FINAL_LABEL')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 1059     │   1027     │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │   50.767 │     49.233 │              0 │             0 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


# 2. Check the data

## 2.1. Check for duplicates

In [6]:
dataset_c = dataset.copy()
print(len(dataset_c))

2086


### Standardlize SMILES (not in use)

In [7]:
# from rdkit.Chem.MolStandardize import rdMolStandardize
# def make_canonical_smiles(smiles):
#     smiles = [rdMolStandardize.StandardizeSmiles(smi) for smi in smiles]
#     return smiles

In [8]:
# cannon_smiles = make_canonical_smiles(dataset.SMILES)
# dataset['SMILES'] = cannon_smiles
# duplicates_smiles = dataset[dataset['SMILES'].duplicated()]['SMILES'].values
# duplicate_index = dataset[dataset['SMILES'].duplicated()]['SMILES'].index
# duplicates_smiles

### Find the duplicate SMILES

In [9]:
duplicates_smiles = dataset[dataset['SMILES'].duplicated()]['SMILES'].values
duplicate_index = dataset[dataset['SMILES'].duplicated()]['SMILES'].index
dup_smiles_entries = pd.DataFrame()
for smiles in duplicates_smiles:
    sub_dataset_dup_smiles = dataset[dataset.SMILES == smiles].copy()
    dup_smiles_entries = pd.concat([dup_smiles_entries, sub_dataset_dup_smiles], axis=0)
print(len(dup_smiles_entries))
dataset.loc[dup_smiles_entries.index][0:5]

0


Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS


In [10]:
if(len(dup_smiles_entries) > 0):
    dataset = dataset.drop(dup_smiles_entries.index)
    dataset = dataset.reset_index(drop=True)
len(dataset)

2086

## 2.2. Check for label intersection

In [11]:
def check_label_intersection(dataset, col_name):
    active_rows = dataset.loc[dataset[col_name] == "Active"]
    inactive_rows = dataset.loc[dataset[col_name] == "Inactive"]
    inconclusive_rows = dataset.loc[dataset[col_name] == "Inconclusive"]
    unspecified_rows = dataset.loc[dataset[col_name] == "Unspecified"]
    
    cid_active = active_rows.loc[:, 'SMILES']
    cid_inactive = inactive_rows.loc[:, 'SMILES']
    cid_incon = inconclusive_rows.loc[:, 'SMILES']
    cid_unspec = unspecified_rows.loc[:, 'SMILES']

    ac_inac_cid = np.intersect1d(cid_active, cid_inactive)
    ac_incon_cid = np.intersect1d(cid_active, cid_incon)
    ac_unspec_cid = np.intersect1d(cid_active, cid_unspec)

    inac_incon_cid = np.intersect1d(cid_inactive, cid_incon)
    incon_unspec_cid = np.intersect1d(cid_incon, cid_unspec)
    inac_unspec_cid = np.intersect1d(cid_inactive, cid_unspec)
    
    print("Activity intersection:")
    table = [['Active-Inactive', 'Active-Inconclusive', 'Active-Unspecified', 'Inactive-Inconclusive', 'Inactive-Unspecified', 'Inconclusive-Unspecifid'], 
             [len(ac_inac_cid), len(ac_incon_cid), len(ac_unspec_cid), len(inac_incon_cid), len(inac_unspec_cid), len(incon_unspec_cid)]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [12]:
check_label_intersection(dataset=dataset, col_name='FINAL_LABEL')

Activity intersection:
╒═══════════════════╤═══════════════════════╤══════════════════════╤═════════════════════════╤════════════════════════╤═══════════════════════════╕
│   Active-Inactive │   Active-Inconclusive │   Active-Unspecified │   Inactive-Inconclusive │   Inactive-Unspecified │   Inconclusive-Unspecifid │
╞═══════════════════╪═══════════════════════╪══════════════════════╪═════════════════════════╪════════════════════════╪═══════════════════════════╡
│                 0 │                     0 │                    0 │                       0 │                      0 │                         0 │
╘═══════════════════╧═══════════════════════╧══════════════════════╧═════════════════════════╧════════════════════════╧═══════════════════════════╛


# 3. Labels errors

## 3.0. Encoding labels

In [13]:
from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

#Encoding labels
dataset_c = dataset.copy()
# Transform letter grades and notes to categorical numbers.
# Necessary for XGBoost.
dataset['FINAL_LABEL'] = preprocessing.LabelEncoder().fit_transform(dataset['FINAL_LABEL'])
dataset.head()

Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
0,2746,CC(=O)NC1=CC=C(C=C1)C(=O)NC2=CC=CC=C2N,2.2325,Active,1,12
1,3812,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCCCC(=O)NO,1.306667,Active,1,3
2,3994,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCCC(=O)NO,1.256667,Active,1,3
3,4261,C1=CC=C(C(=C1)N)NC(=O)C2=CC=C(C=C2)CNC(=O)OCC3...,1.526316,Active,1,19
4,5173,C(CCCC(=O)NO)CCC(=O)NO,8.23,Active,1,1


In [14]:
dataset_c.head()

Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
0,2746,CC(=O)NC1=CC=C(C=C1)C(=O)NC2=CC=CC=C2N,2.2325,Active,Inactive,12
1,3812,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCCCC(=O)NO,1.306667,Active,Inactive,3
2,3994,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCCC(=O)NO,1.256667,Active,Inactive,3
3,4261,C1=CC=C(C(=C1)N)NC(=O)C2=CC=C(C=C2)CNC(=O)OCC3...,1.526316,Active,Inactive,19
4,5173,C(CCCC(=O)NO)CCC(=O)NO,8.23,Active,Inactive,1


**We will do this in the MACCS keys**

In [15]:
def maccs_fpts(data):
    Maccs_fpts = []
    for i in data:
        mol = Chem.MolFromSmiles(i)
        fpts = MACCSkeys.GenMACCSKeys(mol)
        mfpts = np.array(fpts)
        Maccs_fpts.append(mfpts)
    return np.array(Maccs_fpts)

In [16]:
smiles = dataset.SMILES
data = maccs_fpts(smiles)
data = pd.DataFrame(data=data)
labels = dataset['FINAL_LABEL']
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,0,0,1,0,0


## 3.1. Getting out-of-sample predicted probabilities

In [17]:
model = XGBClassifier(tree_method="hist", enable_categorical=True)
pred_probs = cross_val_predict(model, data, labels, method='predict_proba')
print(len(pred_probs))
print(pred_probs[0:5])

2086
[[0.00619262 0.9938074 ]
 [0.5045182  0.4954818 ]
 [0.5045182  0.4954818 ]
 [0.3716309  0.6283691 ]
 [0.20515424 0.79484576]]


## 3.2. Checking model accuracy on original data

Now that we have out-of-sample predicted probabilities, we can also check the model's (cross-val) accuracy on the original (noisy) data, so we'll have a baseline to compare our final results.

In [18]:
preds = np.argmax(pred_probs, axis=1)
acc_original = accuracy_score(preds, labels)
print(f"Accuracy with original data: {round(acc_original*100,1)}%")

Accuracy with original data: 63.4%


In [19]:
model.fit(data, labels)
preds_by_predict = model.predict(data)
acc_pred_by_predict = accuracy_score(preds_by_predict, labels)
print(f"Accuracy with original data: {round(acc_pred_by_predict*100,1)}%")

Accuracy with original data: 96.6%


In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
#xg_boost
cv = KFold(n_splits=10, random_state=1, shuffle=True)
bst_scores = cross_val_score(model, data, labels, scoring='accuracy', cv=cv, n_jobs=-1)
print('Độ chính xác của 10-fold cross validation XG_Boost: %.3f (%.3f)' % (bst_scores.mean(), bst_scores.std()))
sorted(bst_scores)

Độ chính xác của 10-fold cross validation XG_Boost: 0.819 (0.025)


[0.7894736842105263,
 0.7932692307692307,
 0.7942583732057417,
 0.8086124401913876,
 0.8125,
 0.8125,
 0.8221153846153846,
 0.8373205741626795,
 0.8564593301435407,
 0.8660287081339713]

## 3.3. Finding the class threshold

In [21]:
def compute_class_thresholds(pred_probs: np.ndarray, labels: np.ndarray) -> np.ndarray:
    n_examples, n_classes = pred_probs.shape
    thresholds = np.zeros(n_classes)
    for k in range(n_classes):
        count = 0
        p_sum = 0
        for i in range(n_examples):
            if labels[i] == k:
                count += 1
                p_sum += pred_probs[i, k]
        thresholds[k] = p_sum / count
    return thresholds

<b>Check the data and its label was right</b>

In [22]:
print(dataset_c.loc[230]['FINAL_LABEL'])
print("label: " + str(labels.to_numpy()[230]))

Active
label: 0


In [23]:
# should be a numpy array of length 5
thresholds = compute_class_thresholds(pred_probs, labels.to_numpy())
thresholds

array([0.59714356, 0.63403434])

## 3.4. Constructing the confident joint

In [24]:
def compute_confident_joint(pred_probs: np.ndarray, labels: np.ndarray, thresholds: np.ndarray) -> np.ndarray:
    n_examples, n_classes = pred_probs.shape
    confident_joint = np.zeros((n_classes, n_classes), dtype=np.int64)
    positions = np.array([[-1, -1]])
    for data_idx in range(n_examples):
        i = labels[data_idx]    #y_noise
        j = None                #y_true -> to find
        #Lưu ý điểm mình bị sai: vị trí của chúng không ứng với label
        p_j = -1
        for candidate_j in range(n_classes):
            p = pred_probs[data_idx, candidate_j]
            if p >= thresholds[candidate_j] and p > p_j:
                j = candidate_j
                p_j = p
        if j is not None:
            confident_joint[i][j] += 1
        positions = np.append(positions, np.array([[i, j]]), axis=0)
    return confident_joint, positions

In [25]:
C, positions = compute_confident_joint(pred_probs, labels.to_numpy(), thresholds)
print(C)
# print(positions)

[[605 335]
 [321 601]]


## 3.5 Count the number of label issues

In [26]:
num_label_issues = C.sum() - C.trace()
num_label_issues

656

In [27]:
print('Estimated noise rate: {:.1f}%'.format(100*num_label_issues / pred_probs.shape[0]))

Estimated noise rate: 31.4%


## 3.6. Filter out label issues

In [28]:
pred_probs.shape

(2086, 2)

In [29]:
self_confidences = []
for i in range(pred_probs.shape[0]):
    self_confidences.append(pred_probs[i, labels[i]])
self_confidences = np.array(self_confidences)

In [30]:
ranked_indices = np.argsort(self_confidences)
ranked_indices[0:5]

array([1128, 1376, 1270,  676, 1132])

In [31]:
issue_idx = ranked_indices[:num_label_issues]
print(len(issue_idx))

656


In [32]:
dataset_c.iloc[ranked_indices[:5]]

Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
1128,155529983,COC1=CC(=C(C=C1)OC)CNCC2=CC=C(C=C2)/C=C/C(=O)NO,1.75,Active,Inactive,1
1376,162676857,CCC(=O)CCCCC[C@@H](C1=NC=C(N1)C2=C(C=CC(=C2F)F...,1.55,Active,Inactive,1
1270,162651205,CCC(=O)CCCCC[C@@H](C1=C(N=C(N1)C2=C(N=CC=C2)OC...,1.336667,Active,Inactive,3
676,86344268,C1=CC=C(C=C1)S(=O)(=O)C2=[N+](ON=C2OCCCC(=O)NO...,0.05,Active,Active,1
1132,155532052,COC1=C(C(=CC=C1)OC)CNCC2=CC=C(C=C2)/C=C/C(=O)NO,1.08,Active,Inactive,1


In [33]:
dataset_c.loc[dataset_c['CID'] == 141522811].index

Int64Index([1904], dtype='int64')

# 4. Train a More Robust Model

Now that we have the indices of potential label errors within our data, let's remove them from our data, retrain our model, and see what improvement we can gain.

Keep in mind that our baseline model from above, trained on the original data using the `noisy_letter_grade` as the prediction label, achieved a cross-validation accuracy of 67%.

Let's use a very simple method to handle these label errors and just drop them entirely from the data and retrain our exact same `XGBClassifier`. In a real-world application, a better approach might be to have humans review the issues and _correct_ the labels rather than dropping the data points.

In [34]:
# Remove the label errors found by Confident Learning
data_rm = dataset.drop(issue_idx)
# encoding the label
clean_labels = preprocessing.LabelEncoder().fit_transform(data_rm['FINAL_LABEL'])
# encoding the smiles
data = maccs_fpts(data_rm.SMILES)
data = pd.DataFrame(data=data)

In [35]:
len(data)

1430

In [36]:
len(clean_labels)

1430

In [37]:
# Train a more robust classifier with less erroneous data
model = XGBClassifier(tree_method="hist", enable_categorical=True)
clean_pred_probs = cross_val_predict(model, data, clean_labels, method='predict_proba')
clean_preds = np.argmax(clean_pred_probs, axis=1)

acc_clean = accuracy_score(clean_preds, clean_labels)
print(f"Accuracy with original data: {round(acc_original*100, 1)}%")
print(f"Accuracy with errors found by Confident Learning removed: {round(acc_clean*100, 1)}%")

# Compute reduction in error.
err = ((1-acc_original)-(1-acc_clean))/(1-acc_original)
print(f"Reduction in error: {round(err*100,1)}%")

Accuracy with original data: 63.4%
Accuracy with errors found by Confident Learning removed: 91.6%
Reduction in error: 77.1%


__Cross validation test__

In [38]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
#xg_boost
cv = KFold(n_splits=10, random_state=1, shuffle=True)
bst_scores = cross_val_score(model, data, clean_labels, scoring='accuracy', cv=cv, n_jobs=-1)
print('Độ chính xác của 10-fold cross validation XG_Boost: %.3f (%.3f)' % (bst_scores.mean(), bst_scores.std()))
sorted(bst_scores)

Độ chính xác của 10-fold cross validation XG_Boost: 0.955 (0.015)


[0.9230769230769231,
 0.9300699300699301,
 0.951048951048951,
 0.958041958041958,
 0.958041958041958,
 0.958041958041958,
 0.965034965034965,
 0.965034965034965,
 0.965034965034965,
 0.972027972027972]

# 5. Using clean lab to find the label issues

In [40]:
import cleanlab
cl_issue_idx = cleanlab.filter.find_label_issues(labels, pred_probs, return_indices_ranked_by='self_confidence')

In [41]:
len(cl_issue_idx)

734

# 6. Create clean train-test set from clean data

In [42]:
clean_dataset = dataset_c.drop(issue_idx)
issue_dataset = dataset_c.loc[issue_idx]

In [43]:
print(len(issue_dataset))
issue_dataset.head()

656


Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
1128,155529983,COC1=CC(=C(C=C1)OC)CNCC2=CC=C(C=C2)/C=C/C(=O)NO,1.75,Active,Inactive,1
1376,162676857,CCC(=O)CCCCC[C@@H](C1=NC=C(N1)C2=C(C=CC(=C2F)F...,1.55,Active,Inactive,1
1270,162651205,CCC(=O)CCCCC[C@@H](C1=C(N=C(N1)C2=C(N=CC=C2)OC...,1.336667,Active,Inactive,3
676,86344268,C1=CC=C(C=C1)S(=O)(=O)C2=[N+](ON=C2OCCCC(=O)NO...,0.05,Active,Active,1
1132,155532052,COC1=C(C(=CC=C1)OC)CNCC2=CC=C(C=C2)/C=C/C(=O)NO,1.08,Active,Inactive,1


In [44]:
len(clean_dataset)

1430

## 6.1. Check the profile of the clean data

In [45]:
check_activity_distribution(dataset=clean_dataset, col_name='FINAL_LABEL')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │      715 │        715 │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │       50 │         50 │              0 │             0 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


## 6.2. Write data to file

In [46]:
clean_data_path = "../../data_for_modeling/filter_data/all_data/clean_data/"
unclean_data_path = "../../data_for_modeling/filter_data/all_data/unclean_data/"
other_data_path = "../../data_for_modeling/filter_data/all_data/others/"

In [47]:
pred_probs.shape

(2086, 2)

In [48]:
pred_probs_df = pd.DataFrame(pred_probs, columns=['Active probality', 'Inactive probality'])
pred_probs_df = pd.concat([pred_probs_df, dataset_c['FINAL_LABEL']], axis=1)
pred_probs_df.head()

Unnamed: 0,Active probality,Inactive probality,FINAL_LABEL
0,0.006193,0.993807,Inactive
1,0.504518,0.495482,Inactive
2,0.504518,0.495482,Inactive
3,0.371631,0.628369,Inactive
4,0.205154,0.794846,Inactive


In [49]:
from openpyxl import Workbook

writer = pd.ExcelWriter(other_data_path + 'HDAC2_all_clean_data_and_issue_dataset.xlsx', engine='openpyxl')
dataset_c.to_excel(writer, sheet_name='all_data', index=False)
clean_dataset.to_excel(writer, sheet_name='clean_data', index=False)
issue_dataset.to_excel(writer, sheet_name='issue_data', index=False)
pred_probs_df.to_excel(writer, sheet_name='prediction_probality', index=False)
writer.close()

## 6.3. Create clean data

In [50]:
random_state = 42

In [51]:
train_test_data = clean_dataset.drop(['FIRST_LABEL', 'DUPLICATE_COUNTS'], axis = 1)
len(train_test_data)

1430

In [52]:
from sklearn.model_selection import train_test_split
train_dataset, test_dataset = train_test_split(train_test_data, test_size=0.2, random_state=random_state)

In [53]:
train_dataset.head()

Unnamed: 0,CID,SMILES,AVG_IC50_uM,FINAL_LABEL
450,59756717,C[C@H](C1=CC=CC=C1)NC2=NC=NC3=C2C=C(N3)C4=CC=C...,0.1,Active
1719,71462355,CCC(=O)CCCCC[C@H]1C(=O)N(CC(=O)N[C@H](CC(=O)N(...,10.0,Inactive
58,10246779,C1=CC=C(C=C1)CNCCN2C=CC(=N2)C3=CC=C(S3)C(=O)NO,0.03,Active
303,44138032,C1=CC(=CC=C1CO/N=C/C2=CC=C(C=C2)/C=C/C(=O)NO)[...,2.67,Inactive
1455,166630119,COC1=CC(=C(C=C1)C2=NC=C(S2)C(=O)NCCCCC(=O)NO)OC,0.69,Active


In [54]:
check_activity_distribution(dataset=train_dataset, col_name='FINAL_LABEL')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 570      │   574      │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  49.8252 │    50.1748 │              0 │             0 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


In [55]:
check_activity_distribution(dataset=test_dataset, col_name='FINAL_LABEL')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 145      │   141      │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  50.6993 │    49.3007 │              0 │             0 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


In [56]:
with pd.ExcelWriter(clean_data_path+'HDAC2_train_test_clean_data_all.xlsx', engine='openpyxl') as writer:
    train_dataset.to_excel(writer, sheet_name='train_dataset', index=False)
    test_dataset.to_excel(writer, sheet_name='test_dataset', index=False)

## 6.2. Create unclean data

In [57]:
len(dataset_c)

2086

In [58]:
unclean_train_test_data = dataset_c.drop(['FIRST_LABEL', 'DUPLICATE_COUNTS'], axis = 1)
unclean_train_dataset, unclean_test_dataset = train_test_split(unclean_train_test_data, test_size=0.25, random_state=random_state)

In [59]:
check_activity_distribution(unclean_train_dataset, 'FINAL_LABEL')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 788      │   776      │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  50.3836 │    49.6164 │              0 │             0 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


In [60]:
check_activity_distribution(unclean_test_dataset, 'FINAL_LABEL')

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 271      │   251      │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │  51.9157 │    48.0843 │              0 │             0 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


In [61]:
with pd.ExcelWriter(unclean_data_path+'HDAC2_train_test_unclean_data_all.xlsx', engine='openpyxl') as writer:
    unclean_train_dataset.to_excel(writer, sheet_name='train_dataset', index=False)
    unclean_test_dataset.to_excel(writer, sheet_name='test_dataset', index=False)