In [1]:
import sys, os
from pathlib import Path
# Build the locations of the project checkout relative to the current user's home directory.
home = str(Path.home())
core_dir = home+'/repositories/ai-x/core'
conf_dir = core_dir+"/keywords"
# Prepend both directories to the module search path so that modules located
# there (presumably including filters_dop) can be imported below.
sys.path.insert(0, core_dir)
sys.path.insert(0, conf_dir)

# NOTE(review): nothing else in this cell imports pd, io, redirect_stdout or the
# filter/write helpers used further down, so they presumably all arrive through
# this star import — confirm, and consider importing them explicitly.
from filters_dop import *
import re

def get_filtered_datasets(chembl_tsv_file, standard_type, target, assaydefinition, output_dir, base_name, broad=False,
                 before=False):
    """Run the whole filter pipeline on one ChEMBL export and write the datasets.

    MAIN FUNCTION used in run_filters.py.

    Every filter step is called with ``Verbose=True`` and whatever the steps
    print is captured instead of being shown. The captured text is written to
    ``output_dir + '.log'`` (with blank lines collapsed) and returned unchanged.

    Parameters
    ----------
    chembl_tsv_file : str
        Path handed to ``read_data``.
    standard_type : str
        ``"Ki"`` keeps Ki values, ``"IC50"`` keeps IC50 values. Any other
        value skips the affinity filter altogether.
    target : str
        Target label. When it contains ``"D2"`` or ``"D3"`` the
        assay-definition filter is applied as well.
    assaydefinition : str
        Forwarded to the assay-related filters. For D2/D3 targets it must be
        ``"agonist"``, ``"antagonist"`` or ``"others"``.
    output_dir : str
        Directory handed to ``check_output_dir`` and the writer functions; the
        log file is written next to it as ``output_dir + '.log'``.
    base_name : str
        Base name handed to the writer functions.
    broad : bool, optional
        Forwarded to ``filter_confidence``.
    before : bool, optional
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    str
        Everything the pipeline steps printed.

    Raises
    ------
    ValueError
        If the target is a D2/D3 target and ``assaydefinition`` is not one of
        the supported values. Raised before any data is read or written.
    """
    # Imported locally so the function does not rely on these names leaking
    # into the notebook namespace through a star import.
    import io
    from contextlib import redirect_stdout

    supported_definitions = ("agonist", "antagonist", "others")
    uses_assaydefinition_filter = "D2" in target or "D3" in target

    # Fail fast: previously an unsupported value only surfaced halfway through
    # the pipeline as an UnboundLocalError on an internal variable.
    if uses_assaydefinition_filter and assaydefinition not in supported_definitions:
        raise ValueError(
            f"Unsupported assaydefinition {assaydefinition!r} for target {target!r}; "
            f"expected one of {supported_definitions}"
        )

    # Everything printed inside this block ends up in `captured`, not on screen.
    with io.StringIO() as captured, redirect_stdout(captured):
        data = read_data(chembl_tsv_file, Verbose=True)
        data = filter_confidence(data, broad, Verbose=True)
        data = filter_assay_type(data, target=target, assaydefinition=assaydefinition, Verbose=True)

        # Only one affinity type is kept; other standard types skip this step.
        if standard_type == "Ki":
            data = filter_affinity(data, Verbose=True, keepIC50=False, keepKi=True)
        elif standard_type == "IC50":
            data = filter_affinity(data, Verbose=True, keepIC50=True, keepKi=False)
        data = filter_units(data, Verbose=True)
        data = filter_exact(data, Verbose=True)

        if uses_assaydefinition_filter:
            # Keep only the rows that match the requested assay definition;
            # the rejected rows are not needed here.
            data, _ = filter_assaydefinition(data, target, assaydefinition, Verbose=False)
            print("Number of compounds after Displacement Assay filter:", len(data))
            print("Number of compounds after removing testset 2 compounds:  n/a")

        data = filter_year(data, target, year=1990, Verbose=True)
        data = filter_bao_format(data, target, assaydefinition, Verbose=True)
        data = filter_selected(data, target, assaydefinition, Verbose=True)

        data = filter_small_sets(data, Verbose=True, threshold=4)
        data = filter_salts(data, conf_dir, Verbose=True)
        data = filter_elements(data, Verbose=True)
        data = filter_size(data, Verbose=True)
        data = filter_pchembl_values(data, Verbose=True, replace=True)
        data = filter_weirdos(data, Verbose=True)
        data = deduplicate_mols(data, Verbose=True)

        check_output_dir(output_dir, keep_old=False)
        write_smi_act_reg(data, base_name, output_dir, add_extra=False)
        write_smi_act_class(data, base_name, output_dir, inact_val=5.0, act_val=6.0, Verbose=True)

        # Read the captured text before the StringIO is closed on exit.
        output = captured.getvalue()

    # The log file drops empty lines; the returned text is left untouched.
    with open(output_dir + '.log', 'w') as file:
        file.write(output.replace('\n\n', '\n'))

    return output


filters package has been imported!


In [2]:
%%time
targets = ["D2", "D3"]
chembls = ["34"]
base_name = 'pubdata'
standard_types = ["Ki", "IC50"]
assaydefinitions = ["antagonist", "agonist"]
df = pd.DataFrame()
for chembl in chembls:
    df = pd.DataFrame()
    for target in targets:
        chembl_tsv_file = home+"/repositories/ai-DR/datasets/"+f"pgsql/all_pgsql/chembl{chembl}_{target}.tsv"
        for assaydefinition in assaydefinitions:
            for standard_type in standard_types:
                output_dir = f"new_datasets/C{chembl}/dataset_{target}_{assaydefinition}_{standard_type}"
                output = get_filtered_datasets(chembl_tsv_file, standard_type, target, assaydefinition, output_dir, base_name)
                ## if you want to save this to excel, we can make a df
                df = get_dataframe(output, df, target, standard_type, assaydefinition)
                print(f"Dataset created: chembl = {chembl}, target = {target}, assaydefinition = {assaydefinition},"
                      f" standard_type = {standard_type}")
                print(output)
                save_to_excel(df, chembl, "output_dir")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  active_mols['pchembl_value_class'] = np.ones(len(active_mols), dtype=int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inactive_mols['pchembl_value_class'] = np.zeros(len(inactive_mols), dtype=int)


Dataset created: chembl = 34, target = D2, assaydefinition = antagonist, standard_type = Ki
Number of pharmacological activity at starting:  31523
Number of pharmacological activity after confidence score filter:  18682
Number of compounds after assay type filter:  14010
Number of pharmacological activity after Ki / IC50 / EC50 filter:  9284
Number of pharmacological activity after standard units filter:  9229
Number of pharmacological activity after activity relationship type fixes:  8350
Number of compounds after Displacement Assay filter: 6530
Number of compounds after removing testset 2 compounds:  n/a
Number of compounds after 1990 year filter:  6460
Number of compounds after BAO_FORMAT filter:  5625
Number of compounds after patent & hand selecting (paper) filter:  5520
Number of pharmacological activity after data set size filter:  5374
Number of pharmacological activity after desalting pass:  5371
Number of pharmacological activity after oddball element filter:  5361
Number of 

Dataset created: chembl = 34, target = D3, assaydefinition = agonist, standard_type = Ki
Number of pharmacological activity at starting:  15039
Number of pharmacological activity after confidence score filter:  8581
Number of compounds after assay type filter:  8581
Number of pharmacological activity after Ki / IC50 / EC50 filter:  5569
Number of pharmacological activity after standard units filter:  5562
Number of pharmacological activity after activity relationship type fixes:  5237
Number of compounds after Displacement Assay filter: 749
Number of compounds after removing testset 2 compounds:  n/a
Number of compounds after 1990 year filter:  749
Number of compounds after BAO_FORMAT filter:  749
Number of compounds after patent & hand selecting (paper) filter:  746
Number of pharmacological activity after data set size filter:  719
Number of pharmacological activity after desalting pass:  718
Number of pharmacological activity after oddball element filter:  718
Number of pharmacologi

In [3]:
# Display the summary table accumulated through get_dataframe in the previous cell.
df

Unnamed: 0,0,1,2,3,4,5,6,7
,D2,D2,D2,D2,D3,D3,D3,D3
Starting data,31523,31523,31523,31523,15039,15039,15039,15039
After confidence score filter,18682,18682,18682,18682,8581,8581,8581,8581
After assay type filter,14010,14010,18682,18682,7059,7059,8581,8581
,Ki,IC50,Ki,IC50,Ki,IC50,Ki,IC50
After Ki / IC50 filter,9284,934,9922,1496,5137,255,5569,362
After standard units filter,9229,934,9866,1449,5130,244,5562,351
After activity relationship type fixes,8350,673,8836,1003,4825,198,5237,304
,,,,,,,,
,,,,,,,,
