In [142]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate

from mordred import Calculator, descriptors

# Thông tin ban đầu về bộ dữ liệu

## 1. Đọc dữ liệu

In [143]:
# Path (relative to this notebook) of the Excel workbook that is loaded below.
all_data_path = "../../data_for_modeling/raw_data/all_data/HDAC2_ALL_DATA.xlsx"
# version1_data_path = "../../data_for_modeling/raw_data/v1/HDAC2_original_data_v1.xlsx"
# Only the 'original_data' sheet of the workbook is read.
dataset = pd.read_excel(all_data_path, sheet_name='original_data')

In [144]:
# `dataset` is filtered step by step in the cells below; `dataset_c` keeps the
# rows exactly as they were read so they can be exported unchanged at the end.
dataset_c = dataset.copy() #Get a copy of the original dataset
print(len(dataset))
dataset.head()

4086


Unnamed: 0,CID,SMILES,IC50(uM),Activity
0,264,CCCC(=O)O,12.0,Unspecified
1,2662,CCCC(=O)O,,Inconclusive
2,2746,CC(=O)NC1=CC=C(C=C1)C(=O)NC2=CC=CC=C2N,3.81,Active
3,2746,CC(=O)NC1=CC=C(C=C1)C(=O)NC2=CC=CC=C2N,1.2,Active
4,2746,CC(=O)NC1=CC=C(C=C1)C(=O)NC2=CC=CC=C2N,6.0,Active


In [145]:
# Column names of the raw sheet, defined once and passed to the helpers below.
ic50_col_name = "IC50(uM)"
activity_col_name = "Activity"
smiles_col_name = "SMILES"
cid_col_name = "CID"

In [146]:
def show_activity_distribution(label, dataset):
    """Print how many rows carry each activity label, and their percentage.

    Parameters
    ----------
    label : str
        Name of the column that holds the activity labels.
    dataset : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        The summary is printed as a ``tabulate`` table.

    Notes
    -----
    Only the labels "Active", "Inactive", "Inconclusive" and "Unspecified"
    get a column. Rows with any other value still count towards the total
    that the percentages are divided by.
    """
    activity_labels = ['Active', 'Inactive', 'Inconclusive', 'Unspecified']
    dataset_length = len(dataset)
    counts = [int((dataset[label] == activity).sum()) for activity in activity_labels]
    # An empty frame (e.g. "no malformed rows left") must not abort the report
    # with a ZeroDivisionError, so every percentage is reported as 0 instead.
    percentages = [count / dataset_length * 100 if dataset_length else 0.0
                   for count in counts]
    print("Total dataset")
    table = [[''] + activity_labels,
             ['Number'] + counts,
             ['Percentage (%)'] + percentages]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

## 2. Phân bố Activity

In [147]:
show_activity_distribution(dataset=dataset, label='Activity')
# show_activity_distribution(dataset=dataset, label='ACTIVITY')

Total dataset
╒════════════════╤═══════════╤════════════╤════════════════╤═══════════════╕
│                │    Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪═══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 2873      │  25        │      117       │     1071      │
├────────────────┼───────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │   70.3133 │   0.611845 │        2.86344 │       26.2115 │
╘════════════════╧═══════════╧════════════╧════════════════╧═══════════════╛


## 3. IC50 lỗi

In [148]:
def find_non_float_ic50(dataset, ic50_col_name):
    """Return the rows whose IC50 entry cannot be read as a number.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect; it is not modified.
    ic50_col_name : str
        Name of the column holding the IC50 values.

    Returns
    -------
    pandas.DataFrame
        The rows of ``dataset`` (original index kept) whose IC50 is missing
        or is not parsable as a number.
    """
    # errors='coerce' maps every unparsable entry to NaN, so "is NaN after
    # conversion" covers both empty cells and malformed values.
    numeric_ic50 = pd.to_numeric(dataset[ic50_col_name], errors='coerce')
    return dataset.loc[numeric_ic50.isna()]

In [149]:
non_float_rows = find_non_float_ic50(dataset, ic50_col_name)
print(len(non_float_rows))
non_float_rows.head()

111


Unnamed: 0,CID,SMILES,IC50(uM),Activity
1,2662,CCCC(=O)O,,Inconclusive
26,4261,C1=CC=C(C(=C1)N)NC(=O)C2=CC=C(C=C2)CNC(=O)OCC3...,,Inconclusive
27,4261,C1=CC=C(C(=C1)N)NC(=O)C2=CC=C(C=C2)CNC(=O)OCC3...,,Inconclusive
54,5311,C1=CC=C(C=C1)NC(=O)CCCCCCC(=O)NO,,Inconclusive
55,5311,C1=CC=C(C=C1)NC(=O)CCCCCCC(=O)NO,,Inconclusive


In [150]:
show_activity_distribution(dataset=non_float_rows, label=activity_col_name)

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │        0 │    8       │        98      │        5      │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │        0 │    7.20721 │        88.2883 │        4.5045 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


Các xử lý IC50 với từng loại nhãn
- Active: nếu chứa (<, >), ta sẽ loại bỏ những dấu này đi, nếu chất đó là ô trống, hoặc bằng 0 thì ta loại bỏ.
- Inactive: nếu chứa (<, >), ta sẽ loại bỏ những dấu này đi, nếu chất đó là ô trống, hoặc bằng 0 thì ta loại bỏ.
- Inconclusive: những chất này thường có IC50 bị lỗi, ta loại thẳng những chất này đi.
- Unspecified: Loại bỏ toàn bộ dấu (>, <), nếu chất đó là ô trống, hoặc bằng 0 thì ta loại bỏ

In [152]:
non_float_rows = find_non_float_ic50(dataset, ic50_col_name=ic50_col_name)
print(len(non_float_rows))
non_float_rows.head()

111


Unnamed: 0,CID,SMILES,IC50(uM),Activity
1,2662,CCCC(=O)O,,Inconclusive
26,4261,C1=CC=C(C(=C1)N)NC(=O)C2=CC=C(C=C2)CNC(=O)OCC3...,,Inconclusive
27,4261,C1=CC=C(C(=C1)N)NC(=O)C2=CC=C(C=C2)CNC(=O)OCC3...,,Inconclusive
54,5311,C1=CC=C(C=C1)NC(=O)CCCCCCC(=O)NO,,Inconclusive
55,5311,C1=CC=C(C=C1)NC(=O)CCCCCCC(=O)NO,,Inconclusive


## 4. Nhãn bị mâu thuẫn

In [155]:
def check_label_intersection(dataset, col_name, smiles_col_name='SMILES'):
    """Print how many distinct SMILES are shared by each pair of labels.

    A SMILES shared by two labels means the same structure was recorded with
    conflicting activity labels.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect; it is not modified.
    col_name : str
        Name of the column holding the activity labels.
    smiles_col_name : str, default 'SMILES'
        Name of the column identifying the structure. The default matches the
        column this function always used before.

    Returns
    -------
    None
        The pairwise counts are printed as a ``tabulate`` table.
    """
    from itertools import combinations

    activity_labels = ['Active', 'Inactive', 'Inconclusive', 'Unspecified']
    # Missing SMILES are dropped: they identify no structure, and mixing them
    # with strings made the previous sort-based intersection raise TypeError.
    smiles_by_label = {
        activity: set(dataset.loc[dataset[col_name] == activity, smiles_col_name].dropna())
        for activity in activity_labels
    }
    # combinations() yields the pairs in the same order the table always had:
    # Active-Inactive, Active-Inconclusive, Active-Unspecified, ...
    label_pairs = list(combinations(activity_labels, 2))
    headers = [f"{first}-{second}" for first, second in label_pairs]
    shared_counts = [len(smiles_by_label[first] & smiles_by_label[second])
                     for first, second in label_pairs]

    print("Activity intersection:")
    print(tabulate([headers, shared_counts], headers='firstrow', tablefmt='fancy_grid'))

In [156]:
check_label_intersection(dataset=dataset, col_name='Activity')

Activity intersection:
╒═══════════════════╤═══════════════════════╤══════════════════════╤═════════════════════════╤════════════════════════╤═══════════════════════════╕
│   Active-Inactive │   Active-Inconclusive │   Active-Unspecified │   Inactive-Inconclusive │   Inactive-Unspecified │   Inconclusive-Unspecifid │
╞═══════════════════╪═══════════════════════╪══════════════════════╪═════════════════════════╪════════════════════════╪═══════════════════════════╡
│                 0 │                    13 │                   74 │                       2 │                      2 │                         6 │
╘═══════════════════╧═══════════════════════╧══════════════════════╧═════════════════════════╧════════════════════════╧═══════════════════════════╛


# Loại bỏ, ghép chất và tính trung bình từng chất

## Loại bỏ các chất Inconclusive

In [157]:
def drop_by_activity(dataset, activity_type, col_name):
    """Return a copy of ``dataset`` without the rows labelled ``activity_type``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame. It is left untouched, so re-running the calling cell or
        inspecting the unfiltered data afterwards stays safe.
    activity_type : str
        Label value whose rows are removed.
    col_name : str
        Name of the column holding the labels.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the remaining rows; the original index
        labels are kept.
    """
    # Filter with a boolean mask instead of dropping index labels: dropping
    # by label would also remove unrelated rows that share an index label.
    keep_mask = (dataset[col_name] != activity_type).to_numpy()
    # .copy() makes the result independent, so later column assignments on it
    # neither warn nor write back into the caller's frame.
    return dataset.loc[keep_mask].copy()

In [158]:
dataset = drop_by_activity(dataset=dataset, activity_type='Inconclusive', col_name=activity_col_name)

In [159]:
show_activity_distribution(dataset=dataset, label=activity_col_name)

Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 2873     │  25        │              0 │     1071      │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │   72.386 │   0.629882 │              0 │       26.9841 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛


## Loại bỏ những chất không có IC50

Những chất không có IC50 là những chất có IC50 trống hoặc bằng 0

In [160]:
# Convert the IC50 column to numbers. No `errors=` argument is passed, so any
# value that cannot be parsed raises here instead of being skipped.
# NOTE(review): the notes above describe stripping '<' / '>' from IC50 values,
# but no such step is visible in this notebook - confirm the sheet has none.
dataset[ic50_col_name] = pd.to_numeric(dataset[ic50_col_name])
# Remove rows whose IC50 is missing ...
dataset = dataset.dropna(subset=[ic50_col_name])
# ... and rows whose IC50 is exactly 0.
dataset = dataset.drop(dataset.loc[dataset[ic50_col_name] == 0].index)

In [161]:
len(dataset)

3790

In [162]:
show_activity_distribution(dataset=dataset, label=activity_col_name)

Total dataset
╒════════════════╤═══════════╤════════════╤════════════════╤═══════════════╕
│                │    Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪═══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 2729      │   8        │              0 │     1053      │
├────────────────┼───────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │   72.0053 │   0.211082 │              0 │       27.7836 │
╘════════════════╧═══════════╧════════════╧════════════════╧═══════════════╛


In [163]:
non_float_rows = find_non_float_ic50(dataset=dataset, ic50_col_name=ic50_col_name)
non_float_rows.head()

Unnamed: 0,CID,SMILES,IC50(uM),Activity


In [164]:
print(len(dataset.loc[dataset[ic50_col_name] == 0].index))

0


In [165]:
data_ic50_processed = dataset.copy()

## Ghép chất và tính trung bình từng chất

__Kiểm tra nhãn mâu thuẫn__

In [166]:
check_label_intersection(dataset=dataset, col_name=activity_col_name)

Activity intersection:
╒═══════════════════╤═══════════════════════╤══════════════════════╤═════════════════════════╤════════════════════════╤═══════════════════════════╕
│   Active-Inactive │   Active-Inconclusive │   Active-Unspecified │   Inactive-Inconclusive │   Inactive-Unspecified │   Inconclusive-Unspecifid │
╞═══════════════════╪═══════════════════════╪══════════════════════╪═════════════════════════╪════════════════════════╪═══════════════════════════╡
│                 0 │                     0 │                   60 │                       0 │                      2 │                         0 │
╘═══════════════════╧═══════════════════════╧══════════════════════╧═════════════════════════╧════════════════════════╧═══════════════════════════╛


Cách xử lý:
- Nhãn mới: mốc 1 uM, nếu IC50 trung bình lớn hơn hoặc bằng 1 uM thì là Inactive, nếu nhỏ hơn 1 uM thì là Active. 
- Với những chất Active, ta sẽ thực hiện việc ghép chất xong tính trung bình bình thường, sau đó gán nhãn mới.
- Với những chất Inactive, ta sẽ thực hiện việc ghép chất xong tính trung bình bình thường, sau đó gán nhãn mới.
- Với những chất Unspecified, nếu như chất đó có IC50 > 1 thì ta chuyển thẳng nhãn mới thành Inactive, còn không thì loại bỏ.

__Active row__

In [167]:
def get_new_data_by_activity(dataset, activity, activity_col_name, new_columns, ic50_col_name,
                             cid_col_name="CID", smiles_col_name="SMILES", threshold_um=1):
    """Merge the measurements of each compound carrying one original label.

    All rows whose ``activity_col_name`` equals ``activity`` are grouped by
    compound id. Each group becomes one output row holding the mean IC50, the
    original label, a new label derived from the mean, and the number of rows
    that were merged.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame; it is not modified.
    activity : str
        Original label to select (the same function serves every label).
    activity_col_name : str
        Name of the column holding the original labels.
    new_columns : list of str
        Names of the six output columns, in this order: compound id, SMILES,
        mean IC50, original label, new label, number of merged rows.
    ic50_col_name : str
        Name of the column holding the IC50 values (numbers or numeric text).
    cid_col_name : str, default "CID"
        Name of the compound-id column used for grouping.
    smiles_col_name : str, default "SMILES"
        Name of the SMILES column.
    threshold_um : float, default 1
        A mean IC50 strictly below this value is labelled 'Active', anything
        else (including NaN) 'Inactive'.

    Returns
    -------
    pandas.DataFrame
        One row per compound id, in order of first appearance, with a fresh
        0..n-1 index. Empty (but with ``new_columns``) when nothing matches.

    Raises
    ------
    ValueError
        If an IC50 value of a selected row cannot be parsed as a number.
    """
    data_rows = dataset.loc[dataset[activity_col_name] == activity]
    records = []
    # sort=False keeps the groups in order of first appearance. groupby also
    # skips rows whose id is missing, which previously produced a row built
    # from the preceding compound's leftover loop variable (or crashed).
    for cid, group in data_rows.groupby(cid_col_name, sort=False):
        ic50_values = pd.to_numeric(group[ic50_col_name])
        # skipna=False: a missing IC50 makes the mean NaN rather than being
        # silently ignored, matching how NaN propagated before.
        avg_ic50 = float(ic50_values.mean(skipna=False))
        final_label = 'Active' if avg_ic50 < threshold_um else 'Inactive'
        # The SMILES of the group's last row is reported, as before.
        records.append([cid, group[smiles_col_name].iloc[-1], avg_ic50, activity,
                        final_label, len(group)])
    # Build the frame once instead of appending row by row.
    return pd.DataFrame(records, columns=new_columns)

In [168]:
new_columns = ["CID", "SMILES", "AVG_IC50_uM", "FIRST_LABEL", "FINAL_LABEL", "DUPLICATE_COUNTS"]

In [169]:
new_active_rows = get_new_data_by_activity(dataset=dataset, activity='Active', activity_col_name=activity_col_name, new_columns=new_columns, ic50_col_name=ic50_col_name)
print(len(new_active_rows))
new_active_rows.head()

1543


Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
0,2746,CC(=O)NC1=CC=C(C=C1)C(=O)NC2=CC=CC=C2N,2.2325,Active,Inactive,12
1,3811,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCC(=O)NO,0.24,Active,Active,1
2,3812,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCCCC(=O)NO,1.306667,Active,Inactive,3
3,3994,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCCC(=O)NO,1.256667,Active,Inactive,3
4,4261,C1=CC=C(C(=C1)N)NC(=O)C2=CC=C(C=C2)CNC(=O)OCC3...,1.526316,Active,Inactive,19


__Inactive rows__

In [170]:
inactive_cid_unique = dataset.loc[dataset[activity_col_name] == 'Inactive']
len(inactive_cid_unique.CID.unique())

6

In [171]:
new_inactive_rows = get_new_data_by_activity(dataset=dataset, activity='Inactive', ic50_col_name=ic50_col_name, activity_col_name=activity_col_name, new_columns=new_columns)
print(len(new_inactive_rows))
new_inactive_rows

6


Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
0,60198344,CN1C=C(C=N1)/C=C/C(=O)NC2=CC=CC=C2N,15.0,Inactive,Inactive,2
1,60198346,CC1=NC(=CO1)/C=C/C(=O)NC2=CC=CC=C2N,20.0,Inactive,Inactive,1
2,60198412,CC1=NOC(=C1)/C=C/C(=O)NC2=CC=CC=C2N,15.0,Inactive,Inactive,2
3,60198482,CN1C=C(C=N1)/C=C/C(=O)NC2=C(C=C(C=C2)F)N,20.0,Inactive,Inactive,1
4,137224531,C1=CC=C2C(=C1)C(=C(N2CC3=CC=C(C=C3)C(=O)NO)O)N=O,30.0,Inactive,Inactive,1
5,155557270,C1=CC=C(C=C1)C[C@H](C(=O)NO)N2C(=C(N=N2)C3=CC=...,20.0,Inactive,Inactive,1


__Unspecified rows__

In [172]:
# Aggregate the rows originally labelled 'Unspecified', one output row per CID.
# NOTE(review): the notes above say Unspecified compounds that do not have
# IC50 > 1 should be removed, but nothing is removed here - such compounds are
# kept and get FINAL_LABEL 'Active'. Confirm which behaviour is intended.
new_unspecified_rows = get_new_data_by_activity(dataset=dataset, activity='Unspecified', activity_col_name=activity_col_name, ic50_col_name=ic50_col_name, new_columns=new_columns)
print(len(new_unspecified_rows))
new_unspecified_rows.head()

661


Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
0,264,CCCC(=O)O,12.0,Unspecified,Inactive,1
1,3121,CCCC(CCC)C(=O)O,82.37,Unspecified,Inactive,2
2,3810,CN(C)C1=CC=C(C=C1)C(=O)NCCCCC(=O)NO,10.0,Unspecified,Inactive,2
3,3811,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCC(=O)NO,10.0,Unspecified,Inactive,1
4,4775,C1=CC=C(C=C1)CCCC(=O)O,65.0,Unspecified,Inactive,1


__Combine all three to create the new dataset__

In [173]:
# Stack the three per-label tables in one call. Seeding with an empty frame
# and concatenating three times copied the data repeatedly and went through
# pandas' deprecated "concat with empty entries" path.
# Each table keeps its own 0..n-1 index, so index labels repeat in `new_data`.
new_data = pd.concat([new_active_rows, new_inactive_rows, new_unspecified_rows], axis=0)
print(len(new_active_rows), len(new_inactive_rows), len(new_unspecified_rows), len(new_data))

1543 6 661 2210


In [174]:
new_data.head()

Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
0,2746,CC(=O)NC1=CC=C(C=C1)C(=O)NC2=CC=CC=C2N,2.2325,Active,Inactive,12
1,3811,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCC(=O)NO,0.24,Active,Active,1
2,3812,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCCCC(=O)NO,1.306667,Active,Inactive,3
3,3994,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCCC(=O)NO,1.256667,Active,Inactive,3
4,4261,C1=CC=C(C(=C1)N)NC(=O)C2=CC=C(C=C2)CNC(=O)OCC3...,1.526316,Active,Inactive,19


In [175]:
show_activity_distribution(dataset=new_data, label='FINAL_LABEL')

Total dataset
╒════════════════╤═══════════╤════════════╤════════════════╤═══════════════╕
│                │    Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪═══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 1102      │  1108      │              0 │             0 │
├────────────────┼───────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │   49.8643 │    50.1357 │              0 │             0 │
╘════════════════╧═══════════╧════════════╧════════════════╧═══════════════╛


In [176]:
check_label_intersection(dataset=new_data, col_name='FIRST_LABEL')

Activity intersection:
╒═══════════════════╤═══════════════════════╤══════════════════════╤═════════════════════════╤════════════════════════╤═══════════════════════════╕
│   Active-Inactive │   Active-Inconclusive │   Active-Unspecified │   Inactive-Inconclusive │   Inactive-Unspecified │   Inconclusive-Unspecifid │
╞═══════════════════╪═══════════════════════╪══════════════════════╪═════════════════════════╪════════════════════════╪═══════════════════════════╡
│                 0 │                     0 │                   60 │                       0 │                      2 │                         0 │
╘═══════════════════╧═══════════════════════╧══════════════════════╧═════════════════════════╧════════════════════════╧═══════════════════════════╛


# Lọc trùng dữ liệu mới

In [34]:
# data_before_cannon_smiles = new_data.copy()
# from rdkit.Chem.MolStandardize import rdMolStandardize
# def make_canonical_smiles(smiles):
#     smiles = [rdMolStandardize.StandardizeSmiles(smi) for smi in smiles]
#     return smiles
# cannon_smiles = make_canonical_smiles(new_data.SMILES)
# new_data['SMILES'] = cannon_smiles

In [177]:
def remove_dup_smiles(dataset, smile_col_name):
    """Split ``dataset`` by how often each SMILES string occurs.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame; it is not modified. Its index may contain repeated
        labels.
    smile_col_name : str
        Name of the column holding the SMILES strings.

    Returns
    -------
    unique_result : pandas.DataFrame
        Rows whose SMILES occurs exactly once, in their original order.
    dup_result : pandas.DataFrame
        Rows whose SMILES occurs more than once. Rows sharing a SMILES are
        adjacent, and the groups follow the order of first appearance.
    error_result : pandas.DataFrame
        Rows whose SMILES is missing. Previously such rows silently vanished
        from all three results.

    Every input row ends up in exactly one of the three frames.
    """
    smiles = dataset[smile_col_name]
    # Masks are converted to plain arrays so they select by position; the
    # frame's index may contain repeated labels.
    missing_mask = smiles.isna().to_numpy()
    # duplicated() treats missing values as equal to each other, so they are
    # excluded explicitly and reported through error_result instead.
    dup_mask = smiles.duplicated(keep=False).to_numpy() & ~missing_mask
    unique_mask = ~dup_mask & ~missing_mask

    unique_result = dataset[unique_mask].copy()
    error_result = dataset[missing_mask].copy()

    dup_rows = dataset[dup_mask]
    # factorize numbers the SMILES by first appearance; a stable sort on those
    # numbers puts rows of one SMILES next to each other without reordering
    # the rows inside a group.
    group_codes, _ = pd.factorize(dup_rows[smile_col_name])
    dup_result = dup_rows.iloc[np.argsort(group_codes, kind="stable")].copy()
    return unique_result, dup_result, error_result

In [178]:
unique_result, dup_result, error_result = remove_dup_smiles(dataset=new_data, smile_col_name='SMILES')

In [179]:
print(len(unique_result), len(dup_result), len(error_result))

2086 124 0


In [180]:
new_data = unique_result
print(len(new_data))
new_data.head()

2086


Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
0,2746,CC(=O)NC1=CC=C(C=C1)C(=O)NC2=CC=CC=C2N,2.2325,Active,Inactive,12
2,3812,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCCCC(=O)NO,1.306667,Active,Inactive,3
3,3994,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCCC(=O)NO,1.256667,Active,Inactive,3
4,4261,C1=CC=C(C(=C1)N)NC(=O)C2=CC=C(C=C2)CNC(=O)OCC3...,1.526316,Active,Inactive,19
6,5173,C(CCCC(=O)NO)CCC(=O)NO,8.23,Active,Inactive,1


In [181]:
dup_result.head()

Unnamed: 0,CID,SMILES,AVG_IC50_uM,FIRST_LABEL,FINAL_LABEL,DUPLICATE_COUNTS
1,3811,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCC(=O)NO,0.24,Active,Active,1
3,3811,CN(C)C1=CC=C(C=C1)C(=O)NCCCCCC(=O)NO,10.0,Unspecified,Inactive,1
5,4996,C1=CC(=CN=C1)NC(=O)CCCCCCC(=O)NO,4.05,Active,Inactive,2
5,4996,C1=CC(=CN=C1)NC(=O)CCCCCCC(=O)NO,10.0,Unspecified,Inactive,1
12,298878,C1=CC=C(C=C1)C(=O)CCCCCCC(=O)O,2.185,Active,Inactive,2


# Đánh giá bộ dữ liệu

In [182]:
check_label_intersection(dataset=new_data, col_name='FIRST_LABEL')

Activity intersection:
╒═══════════════════╤═══════════════════════╤══════════════════════╤═════════════════════════╤════════════════════════╤═══════════════════════════╕
│   Active-Inactive │   Active-Inconclusive │   Active-Unspecified │   Inactive-Inconclusive │   Inactive-Unspecified │   Inconclusive-Unspecifid │
╞═══════════════════╪═══════════════════════╪══════════════════════╪═════════════════════════╪════════════════════════╪═══════════════════════════╡
│                 0 │                     0 │                    0 │                       0 │                      0 │                         0 │
╘═══════════════════╧═══════════════════════╧══════════════════════╧═════════════════════════╧════════════════════════╧═══════════════════════════╛


In [183]:
check_label_intersection(dataset=new_data, col_name='FINAL_LABEL')

Activity intersection:
╒═══════════════════╤═══════════════════════╤══════════════════════╤═════════════════════════╤════════════════════════╤═══════════════════════════╕
│   Active-Inactive │   Active-Inconclusive │   Active-Unspecified │   Inactive-Inconclusive │   Inactive-Unspecified │   Inconclusive-Unspecifid │
╞═══════════════════╪═══════════════════════╪══════════════════════╪═════════════════════════╪════════════════════════╪═══════════════════════════╡
│                 0 │                     0 │                    0 │                       0 │                      0 │                         0 │
╘═══════════════════╧═══════════════════════╧══════════════════════╧═════════════════════════╧════════════════════════╧═══════════════════════════╛


In [184]:
print("Label for first label:")
show_activity_distribution(dataset=new_data, label='FIRST_LABEL')
print()
print("Label for final label:")
show_activity_distribution(dataset=new_data, label='FINAL_LABEL')

Label for first label:
Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 1483     │   4        │              0 │      599      │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage (%) │   71.093 │   0.191755 │              0 │       28.7152 │
╘════════════════╧══════════╧════════════╧════════════════╧═══════════════╛

Label for final label:
Total dataset
╒════════════════╤══════════╤════════════╤════════════════╤═══════════════╕
│                │   Active │   Inactive │   Inconclusive │   Unspecified │
╞════════════════╪══════════╪════════════╪════════════════╪═══════════════╡
│ Number         │ 1059     │   1027     │              0 │             0 │
├────────────────┼──────────┼────────────┼────────────────┼───────────────┤
│ Percentage 

In [185]:
non_float_rows = find_non_float_ic50(dataset, ic50_col_name=ic50_col_name)
print(len(non_float_rows))
non_float_rows.head()

0


Unnamed: 0,CID,SMILES,IC50(uM),Activity


In [73]:
# Write every stage of the clean-up to its own sheet of a single workbook.
# The former `writer.book = writer.book` line was removed: it assigned the
# attribute to itself (no effect), triggered a pandas warning, and fails on
# pandas >= 2.0 where `ExcelWriter.book` can no longer be set.
with pd.ExcelWriter('../../data_for_modeling/filter_data/all_data/HDAC2_all_data_filtered.xlsx', engine='openpyxl') as writer:
    dataset_c.to_excel(writer, sheet_name='original_data', index=False)
    new_data.to_excel(writer, sheet_name='filter_data', index=False)
    dup_result.to_excel(writer, sheet_name='duplicate_smiles', index=False)
    # data_before_cannon_smiles.to_excel(writer, sheet_name='data_before_cannon_smiles', index=False)
    data_ic50_processed.to_excel(writer, sheet_name="data_ic50_process", index=False)

  writer.book = writer.book
