### Curation pipeline

In [None]:
#import functions
import sys,os
import glob
try: 
    if(cwd is not None):
        from functions.utils_curation import *
except:
    %cd ..
    cwd = os.getcwd()
    sys.path.insert(0,cwd)
    from functions.utils_curation import *
    

#### Data preparation and standardisation

In [None]:
#load dataset
df0 = check_extention('single-protein-assay-ic50-only.csv', 1)

#save summary: append the starting row count to the preparation log
# (label spelled 'initial', matching the other summary files in this notebook)
row = ['initial', len(df0)]
# newline='' is what the csv module expects of its file object; without it an
# extra blank line follows every row on platforms that write \r\n
with open('./data/data_summary/preparation_summary.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(row)

df0.head()

In [None]:
#drop missing activity values
df0 = df0.dropna(subset=['Standard Value'])

#save summary: append the row count left after the drop
row = ['after missing activity removed', len(df0)]
# newline='' keeps the csv writer from emitting an extra blank line per row on
# platforms that write \r\n
with open('./data/data_summary/preparation_summary.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(row)
# renumber the rows 0..n-1, discarding the old labels
df0 = df0.reset_index(drop=True)
df0.head()

In [None]:
#count the rows per activity unit to see whether they are all the same
unit_groups = df0.groupby(by='Standard Units')
unit_groups.size()

In [None]:
#remove unwanted activity data: keep only the rows whose unit is 'nm'
# The comparison is case-insensitive and uses the string form of each unit, so
# a missing unit (NaN -> 'nan') counts as "not nm" and is removed as well.
# A boolean mask is used so the result does not depend on df0's row labels.
is_nm = pd.Series(
    [str(unit).lower() == 'nm' for unit in df0['Standard Units']],
    index=df0.index,
    dtype=bool,
)
diffact = df0[~is_nm]

if not diffact.empty:
    # keep a record of the removed rows (their row labels are written too)
    diffact.to_csv('./data/removed_during_curation/different_activity.csv')
    # drop=True discards the old row labels; without it they would be added to
    # the data as an extra 'index' column, and only when rows were removed
    df0 = df0[is_nm].reset_index(drop=True)

#save summary: one row, written whether or not anything was removed
row = ["after different activity removed", len(df0)]
# newline='' keeps the csv writer from emitting an extra blank line per row on
# platforms that write \r\n
with open('./data/data_summary/preparation_summary.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(row)

In [None]:
#discard the columns that are no longer needed
dropList = ['Standard Units']
df0 = df0.drop(labels=dropList, axis='columns')
df0.head()

In [None]:
""" Optional step, make sure to adapt the code to your dataset """
#convert IC50 to pIC50
# Each value is divided by 1000, scaled by 10**-6, and then -log10 is taken.
# math.log10 raises ValueError for a value that is zero or negative.
pIC50 = [
    -(math.log10((value / 1000) * 10**-6))
    for value in df0['Standard Value']
]

df0['pIC50 (uM)'] = pIC50
df0.head()

In [None]:
#rename columns if needed
column_names = {
    'Molecule ChEMBL ID': 'ID',
    'Smiles': 'SMILES',
    'Standard Relation': 'Relation',
    'Standard Value': 'IC50 (uM)',
}
# NOTE(review): 'Standard Value' is only relabelled here, its numbers are not
# rescaled in this cell -- confirm the column really holds uM values by now.
df0.rename(columns=column_names, inplace=True)
df0.head()

In [None]:
#structural curation
# `curate` and `save_data` are not defined in this notebook; presumably both
# come from the star import of functions.utils_curation -- TODO confirm.
# The returned value is kept in `curated_dataset`.
curated_dataset = curate(df0, save_data)

#### Duplicate removal binary

In [None]:
#load standardised data
fname = 'standardised_but_no_duplicates_removed.csv'
# pass fname rather than repeating the literal, as the regression and
# multiclass sections do
df1 = check_extention(fname, 2)

# summary: append the starting row count for the binary duplicate removal
row = ["initial", len(df1)]
# newline='' keeps the csv writer from emitting an extra blank line per row on
# platforms that write \r\n
with open('./data/data_summary/binary_dupremoval.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(row)

df1.head()

In [None]:
#count the rows per relation type
relation_groups = df1.groupby(by='Relation')
relation_groups.size()

In [None]:
# relationTreat is defined outside this notebook; its result replaces df1
df1 = relationTreat(
    dataset=df1,
    relationcolumn='Relation',
    activitycolumn='IC50 (uM)',
    threshold=10,
    curationtype='binary',
)
df1.head()

In [None]:
#define threshold
# Outcome is 1 when pIC50 is above 5 and 0 otherwise (a NaN compares False and
# therefore gets 0).
outcome = [int(activity > 5) for activity in df1['pIC50 (uM)']]
df1['Outcome'] = outcome
df1

In [None]:
#group duplicates based on inchikey
df1_agg = group(df1, ['pIC50 (uM)', 'ID', 'SMILES', 'Outcome'])

# summary: number of rows that disappeared through grouping
row = ["duplicates total", len(df1) - len(df1_agg)]
# newline='' keeps the csv writer from emitting an extra blank line per row on
# platforms that write \r\n
with open('./data/data_summary/binary_dupremoval.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(row)
df1_agg.head()

In [None]:
#remove duplicates with stddev > 0
df2_agg = dupRemovalClassification(df1_agg, 'Outcome', 'binary')

# summary: number of grouped rows removed by the step above
row = ["discordant duplicates", (len(df1_agg) - len(df2_agg))]
# newline='' keeps the csv writer from emitting an extra blank line per row on
# platforms that write \r\n
with open('./data/data_summary/binary_dupremoval.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(row)
df2_agg.head()

In [None]:
# removeListedValues is not defined in this notebook (presumably star-imported
# from functions.utils_curation -- TODO confirm). Its result replaces df1 and
# is displayed as the cell output.
df1 = removeListedValues(df2_agg)
df1

In [None]:
#save curated data
# NOTE(review): the regression and multiclass save cells build their path from
# save_data, while this path is hard-coded -- confirm that is intended.
binary_output = './data/curated_data/curated_binary.csv'
df1.to_csv(binary_output, index=False)

#### Duplicate removal regression

In [None]:
#load standardised data
fname = 'standardised_but_no_duplicates_removed.csv'
df2 = check_extention(fname, 2)

# summary: append the starting row count for the regression duplicate removal
row = ["initial", len(df2)]
# newline='' keeps the csv writer from emitting an extra blank line per row on
# platforms that write \r\n
with open('./data/data_summary/regression_dupremoval.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(row)

df2.head()

In [None]:
#remove relations != "="
# The literal compared against includes the single quotes: "'='". A missing
# relation (NaN) is not equal to it and is therefore removed as well.
is_equal = df2['Relation'] == "'='"
removed_rows = df2[~is_equal]

# df2_removed is always built, so the summary below and the following cells
# also work when every row already carries the '=' relation.
df2_removed = df2[is_equal].reset_index(drop = True)

# keep a record of the removed rows, only when there are any
if not removed_rows.empty:
    removed_rows.to_csv("{}relationsRemoved_regression.csv".format(errorverbose), sep=',', header=True, index=False)

# summary: number of rows removed by the filter
row = ["removed relations != '='", len(df2) - len(df2_removed)]
# newline='' keeps the csv writer from emitting an extra blank line per row on
# platforms that write \r\n
with open('./data/data_summary/regression_dupremoval.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(row)
df2_removed.head()

In [None]:
#count the rows per relation type that remain after filtering
remaining_relations = df2_removed.groupby(by='Relation')
remaining_relations.size()

In [None]:
#remove the relation column; the result becomes the new df2
df2 = df2_removed.drop(labels='Relation', axis='columns')
df2.head()

In [None]:
#group duplicates based on inchikey
df2_agg = group(df2, ['pIC50 (uM)', 'ID', 'SMILES'])

# summary: number of rows that disappeared through grouping
row = ["duplicates total", len(df2) - len(df2_agg)]
# newline='' keeps the csv writer from emitting an extra blank line per row on
# platforms that write \r\n
with open('./data/data_summary/regression_dupremoval.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(row)

In [None]:
# dupRemovalRegression is defined outside this notebook; 0.2 is presumably the
# allowed spread between duplicate pIC50 values -- TODO confirm.
df3_agg = dupRemovalRegression(df2_agg, errorverbose, 'pIC50 (uM)', 0.2)

# summary: number of grouped rows removed by the step above
row = ["discordant duplicates", len(df2_agg) - len(df3_agg)]
# newline='' keeps the csv writer from emitting an extra blank line per row on
# platforms that write \r\n
with open('./data/data_summary/regression_dupremoval.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(row)

df3_agg

In [None]:
# removeListedValues is not defined in this notebook (presumably star-imported
# from functions.utils_curation -- TODO confirm). Its result replaces df2 and
# is displayed as the cell output.
df2 = removeListedValues(df3_agg)
df2

In [None]:
#write the curated regression set, without the row index, under save_data
regression_output = '{}curated_regression.csv'.format(save_data)
df2.to_csv(regression_output, index=False)

#### Duplicate removal multiclass

In [None]:
#load standardised data
fname = 'standardised_but_no_duplicates_removed.csv'
df3 = check_extention(fname, 2)
# summary: append the starting row count for the multiclass duplicate removal
row = ["initial", len(df3)]
# newline='' keeps the csv writer from emitting an extra blank line per row on
# platforms that write \r\n
with open('./data/data_summary/multiclass_dupremoval.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(row)
df3.head()

In [None]:
#count the rows per relation type
relation_groups = df3.groupby(by='Relation')
relation_groups.size()

In [None]:
# relationTreat is defined outside this notebook; its result replaces df3
df3 = relationTreat(
    dataset=df3,
    relationcolumn='Relation',
    activitycolumn='IC50 (uM)',
    threshold=10,
    curationtype='multiclass',
)
df3.head()

In [None]:
#define threshold
# Class boundaries on pIC50:
#   0: below 4.5 (the "nonblocker" class)
#   1: 4.5 up to, not including, 5
#   2: 5 up to, not including, 6
#   3: everything else
# NOTE(review): a NaN fails every comparison and so lands in class 3 --
# confirm that no NaN pIC50 reaches this cell.
outcome = [
    0 if activity < 4.5
    else 1 if activity < 5
    else 2 if activity < 6
    else 3
    for activity in df3['pIC50 (uM)']
]
df3['Outcome'] = outcome
df3

In [None]:
#group duplicates based on inchikey
df3_agg = group(df3, ['pIC50 (uM)', 'ID', 'SMILES', 'Outcome'])

# summary: number of rows that disappeared through grouping
row = ["duplicates total", len(df3) - len(df3_agg)]
# newline='' keeps the csv writer from emitting an extra blank line per row on
# platforms that write \r\n
with open('./data/data_summary/multiclass_dupremoval.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(row)

df3_agg.head()

In [None]:
#remove duplicates with stddev > 0
df4_agg = dupRemovalClassification(df3_agg, 'Outcome', 'multiclass')

# summary: number of grouped rows removed by the step above
row = ["discordant duplicates", len(df3_agg) - len(df4_agg)]
# newline='' keeps the csv writer from emitting an extra blank line per row on
# platforms that write \r\n
with open('./data/data_summary/multiclass_dupremoval.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(row)

df4_agg.head()

In [None]:
# removeListedValues is not defined in this notebook (presumably star-imported
# from functions.utils_curation -- TODO confirm). Its result replaces df3 and
# is displayed as the cell output.
df3 = removeListedValues(df4_agg)
df3

In [None]:
#write the curated multiclass set, without the row index, under save_data
multiclass_output = '{}curated_multiclass.csv'.format(save_data)
df3.to_csv(multiclass_output, index=False)