In [1]:
import os, sys
from pathlib import Path
# Locate the ai-x checkout under the user's home directory and make both the
# core package and its keyword configuration folder importable.
home = str(Path.home())
core_dir = f"{home}/repositories/ai-x/core"
conf_dir = f"{core_dir}/keywords"
# Same resulting order as inserting core_dir and then conf_dir at position 0.
sys.path[:0] = [conf_dir, core_dir]

from filters_dop import *
import pandas as pd
import matplotlib.pyplot as plt 
# from misc import check_output_dir
# output_dir = "output_dir"
# check_output_dir(output_dir, keep_old = False)

def check_output_dir(folder_name):
    """
    Ensure that the specified folder exists, creating it (and any missing parents) if needed.

    Parameters:
    folder_name (str): The name of the folder to check/create.

    Raises:
    FileExistsError: If `folder_name` exists but is not a directory (e.g. a regular file).
    """
    # Only used to pick the message; creation itself relies on exist_ok=True so a
    # folder appearing between the check and the creation cannot make this fail.
    already_present = os.path.isdir(folder_name)
    os.makedirs(folder_name, exist_ok=True)
    if already_present:
        print(f"Folder '{folder_name}' already exists.")
    else:
        print(f"Folder '{folder_name}' created.")
        
# Make sure the folder that later cells save their PNG files into exists.
check_output_dir("output_dir")

filters package has been imported!
Folder 'output_dir' already exists.


In [2]:
class HiddenPrints:
    """
    Context manager that silences `print` output by pointing sys.stdout at os.devnull.

    On exit the stdout object that was active on entry is restored and the devnull
    handle opened on entry is closed. Exceptions raised inside the block are not
    suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own reference to the handle so __exit__ closes exactly this
        # object, even if code inside the block reassigns sys.stdout.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so sys.stdout is usable again even if closing fails.
        sys.stdout = self._original_stdout
        self._devnull.close()
        

# Updating this function -- commenting out Bao_Format to inspect it further + not saving dataset
def get_filtered_datasets(chembl_tsv_file, standard_type, target, assaydefinition, output_dir, base_name, broad=False,
                 before=False):
    """
    Run the filtering pipeline on one ChEMBL TSV export and return the filtered data.

    Notebook variant of the main function used in run_filters.py: the BAO-format
    filter is left commented out and nothing is written to disk. Output printed by
    the individual filters is hidden; only the final number of rows is printed.

    Parameters:
    chembl_tsv_file (str): Path handed to read_data.
    standard_type (str): "Ki" or "IC50"; selects which affinity type filter_affinity keeps.
        Any other value skips the affinity filter.
    target (str): Target name passed to the target-aware filters. If it contains
        "D2" or "D3", the assay-definition filter is applied as well.
    assaydefinition (str): For D2/D3 targets must be "agonist", "antagonist" or "others".
    output_dir, base_name, before: Not used by this variant; kept so the call
        signature stays the same.
    broad (bool): Passed to filter_confidence.

    Returns:
    The filtered dataset as returned by deduplicate_mols (the callers in this
    notebook use it like a pandas DataFrame).

    Raises:
    ValueError: For a D2/D3 target with an unsupported `assaydefinition`. This used
        to surface as an UnboundLocalError after most of the filters had already run.
    """
    allowed_definitions = ("agonist", "antagonist", "others")
    use_assaydefinition_filter = "D2" in target or "D3" in target
    # Fail before the expensive read/filter steps instead of halfway through them.
    if use_assaydefinition_filter and assaydefinition not in allowed_definitions:
        raise ValueError(
            f"assaydefinition must be one of {allowed_definitions} for target "
            f"'{target}', got {assaydefinition!r}"
        )

    with HiddenPrints():
        buffer = read_data(chembl_tsv_file, Verbose=True)
        buffer = filter_confidence(buffer, broad, Verbose=True)
        buffer = filter_assay_type(buffer, target=target, assaydefinition=assaydefinition, Verbose=True)
        # Any standard_type other than "Ki"/"IC50" leaves the data unfiltered by affinity type.
        if standard_type == "Ki":
            buffer = filter_affinity(buffer, Verbose=True, keepIC50=False, keepKi=True)
        elif standard_type == "IC50":
            buffer = filter_affinity(buffer, Verbose=True, keepIC50=True, keepKi=False)
        buffer = filter_units(buffer, Verbose=True)
        buffer = filter_exact(buffer, Verbose=True)

        if use_assaydefinition_filter:
            # Only the rows that pass the filter are kept; the rejected part is discarded.
            buffer, _filtered_out = filter_assaydefinition(buffer, target, assaydefinition, Verbose=False)
            # These prints run while stdout is redirected by HiddenPrints.
            print("Number of compounds after Displacement Assay filter:", len(buffer))
            print("Number of compounds after removing testset 2 compounds:  n/a")

        buffer = filter_year(buffer, target, year=1990, Verbose=True)
        #buffer = filter_bao_format(buffer, target, assaydefinition, Verbose=True)
        buffer = filter_selected(buffer, target, assaydefinition, Verbose=True)

        buffer = filter_small_sets(buffer, Verbose=True, threshold=4)
        buffer = filter_salts(buffer, conf_dir, Verbose=True)
        buffer = filter_elements(buffer, Verbose=True)
        buffer = filter_size(buffer, Verbose=True)
        buffer = filter_pchembl_values(buffer, Verbose=True, replace=True)
        buffer = filter_weirdos(buffer, Verbose=True)
        buffer = deduplicate_mols(buffer, Verbose=True)
    # Printed outside HiddenPrints, so this is the one number the user sees.
    print(len(buffer))

    return buffer


def get_pieChart(_xx, save2png=""):
    """
    Draw a pie chart of how the rows of `_xx` are distributed over BAO assay formats.

    Parameters:
    _xx: Table with a 'bao_format' column (used like a pandas DataFrame).
    save2png (str): If non-empty, the pie is drawn without labels, percentages or
        legend and saved to this path as a transparent 300-dpi PNG. If empty, a
        labelled chart with percentages, counts and a legend is drawn instead.

    Notes:
    Missing values in 'bao_format' are ignored. A BAO identifier without an entry in
    the lookup table is labelled with the raw identifier and drawn in light grey
    instead of raising KeyError. Each call draws on its own new 6x6-inch figure.
    """
    dict_BAO_to_format = {'BAO_0000219': 'cell-based', 'BAO_0000221': 'tissue-based',
                          'BAO_0000357': 'single protein', 'BAO_0000249': 'cell membrane', 'BAO_0000251': 'microsome',
                          'BAO_0000019': 'assay'}
    dict_colors = {'cell-based': 'lightskyblue', 'tissue-based': 'lightcoral', 'single protein': 'dodgerblue',
                   'cell membrane': 'limegreen', 'microsome': 'gold', 'assay': 'orange'}
    fallback_color = 'lightgrey'

    bao_column = _xx['bao_format']
    # One pass over the column; value_counts drops missing values by default.
    counts = bao_column.value_counts()
    dict_format = {}
    # Iterate in order of first appearance so the wedge order matches the data.
    for BAO in bao_column.dropna().unique():
        label = dict_BAO_to_format.get(BAO, str(BAO))
        dict_format[label] = dict_format.get(label, 0) + int(counts.loc[BAO])
    labels = list(dict_format.keys())
    values = list(dict_format.values())
    ls_colors = [dict_colors.get(label, fallback_color) for label in labels]

    total = sum(values)

    def my_autopct(pct):
        # matplotlib passes the wedge percentage; convert it back to the absolute count.
        val = int(round(pct*total/100.0))
        return '{p:.0f}%  ({v:d})'.format(p=pct,v=val)

    size = 6
    # A fresh figure per call keeps wedges from a previous call out of this chart
    # when the backend does not close figures on plt.show().
    fig, ax = plt.subplots(figsize=(size, size))
    if save2png:
        ax.pie(values, labels=None, colors=ls_colors, autopct=None, textprops={'fontsize': 0})
    else:
        ax.pie(values, labels=labels, colors=ls_colors, autopct=my_autopct, textprops={'fontsize': 13})
    fig.tight_layout()

    if save2png:
        fig.savefig(save2png, transparent=True, dpi=300, bbox_inches='tight')
    else:
        ax.legend(loc='upper right', bbox_to_anchor=(1.35, 1))

    plt.show()

In [3]:
def get_df(target, chembl="33", standard_type="Ki", assaydefinition="antagonist", base_name='pubdata'):
    """
    Build the filtered dataset for one target from its ChEMBL TSV export.

    Parameters:
    target (str): Target name, e.g. "D2"; used in the input file name and passed to the filters.
    chembl (str): ChEMBL release number used in the input file name and output folder name.
    standard_type (str): Affinity type passed to get_filtered_datasets.
    assaydefinition (str): Assay definition passed to get_filtered_datasets.
    base_name (str): Base name passed to get_filtered_datasets.

    The defaults reproduce the values that were previously hard-coded, so
    get_df(target) behaves as before.

    Returns:
    Whatever get_filtered_datasets returns for the selected input file.
    """
    chembl_tsv_file = home+"/repositories/ai-DR/datasets/"+f"pgsql/all_pgsql/chembl{chembl}_{target}.tsv"
    # Passed along for interface compatibility; get_filtered_datasets in this
    # notebook does not write anything to it.
    output_dir = f"new_datasets/C{chembl}/dataset_{target}_{assaydefinition}_{standard_type}"
    return get_filtered_datasets(chembl_tsv_file, standard_type, target, assaydefinition, output_dir, base_name)

In [None]:
# Filtered dataset for target "D2", built with get_df; plotted in a later cell.
d2 = get_df("D2")

In [None]:
# Filtered dataset for target "D3", built with get_df; plotted in a later cell.
d3 = get_df("D3")

In [None]:
# First show the labelled chart inline, then write the label-free pie to disk.
save2png = "output_dir/BAO_D2.png"
get_pieChart(d2)
get_pieChart(d2, save2png=save2png)

In [None]:
# First show the labelled chart inline, then write the label-free pie to disk.
save2png = "output_dir/BAO_D3.png"
get_pieChart(d3)
get_pieChart(d3, save2png=save2png)