In [1]:
import os, sys
from pathlib import Path
home = str(Path.home())
core_dir = home + '/repositories/ai-x/core'
conf_dir = core_dir + "/conf"
sys.path.insert(0, core_dir)
sys.path.insert(0, conf_dir)

from filters import *
from buildmodel import *
from misc import *
import io
import csv 


def make_clickable(val):
    """Wrap ``val`` in an HTML anchor, using it as both the href and the link text."""
    return f'<a href="{val}">{val}</a>'


def target_dict(filename):
    '''Read a tab-delimited file and return the first field of every row.

    Parameters
    ----------
    filename : path of the TSV file to read.

    Returns
    -------
    list of str
        One entry per non-empty row, holding that row's first tab-separated
        field (any further fields on the row are ignored).

    Blank lines are skipped: ``csv.reader`` yields an empty list for them,
    which would otherwise make the ``row[0]`` lookup raise ``IndexError``.
    '''
    # newline='' lets the csv module do its own line-ending handling,
    # as recommended by the csv documentation.
    with open(filename, newline='') as file:
        return [row[0] for row in csv.reader(file, delimiter="\t") if row]


def get_tids(ls_keys, ls_file):
    '''Return the entries of ls_file that contain at least one keyterm.

    Matching is a plain, case-sensitive substring test (``key in line``), so a
    short key such as 'D2' also matches longer tokens that contain it.
    Each matching line is returned once, in its original order.
    '''
    return [
        entry
        for entry in ls_file
        # any() stops at the first matching key, like the explicit break it replaces
        if any(term in entry for term in ls_keys)
    ]


def skip_previous_tids(keys_dop, ls_file=None):
    """Return the lines that contain none of the given keys.

    Parameters
    ----------
    keys_dop : iterable of str
        Keyterms to exclude; a line is dropped if any key is a substring of it.
    ls_file : list of str, optional
        Lines to filter. When omitted, the module-level ``ls_file`` is used so
        that existing single-argument calls keep working; passing it explicitly
        avoids depending on which notebook cells have already run.

    Returns
    -------
    list of str
        The lines of ``ls_file`` that match no key, in their original order.

    Raises
    ------
    NameError
        If ``ls_file`` is not passed and no module-level ``ls_file`` exists.
    """
    if ls_file is None:
        try:
            ls_file = globals()["ls_file"]
        except KeyError:
            # Same exception type the implicit global lookup used to raise.
            raise NameError("name 'ls_file' is not defined") from None
    return [
        line
        for line in ls_file
        if not any(key in line for key in keys_dop)
    ]
            
    
def tid_to_table(datadir, tid):
    """Build a styled table of the publications referenced by one TID's data file.

    Reads ``{datadir}/test_{tid}_clean.tsv`` through ``read_data``, prints the
    TID together with the set of its ``pref_name_target`` values, and returns
    one table row per distinct DOI.

    Parameters
    ----------
    datadir : directory containing the ``test_<tid>_clean.tsv`` files.
    tid : target id used to build the file name.

    Returns
    -------
    pandas Styler
        Columns "Year", "Journal" and "DOI:"; the "DOI:" cells are rendered
        as clickable links via ``make_clickable``.

    Notes
    -----
    Assumes ``read_data`` returns a pandas DataFrame with the columns
    'pref_name_target', 'doi', 'year' and 'journal' -- TODO confirm.
    Rows are listed in order of each DOI's first appearance, and the year and
    journal come from that first row, so the table is the same on every run.
    """
    chembl_tsv_file = f"{datadir}/test_{tid}_clean.tsv"
    df = read_data(chembl_tsv_file, Verbose = True)
    print("TID:", tid, ", Pref Name Target:", set(df['pref_name_target']))

    # Drop rows whose DOI stringifies to 'nan' (missing values), then keep the
    # first row seen for every remaining DOI.
    has_doi = df['doi'].astype(str) != 'nan'
    refs = df.loc[has_doi, ['doi', 'year', 'journal']].drop_duplicates(subset='doi')

    doi_org = 'https://doi.org/'
    df2 = pd.DataFrame({
        "Year": refs['year'].tolist(),
        "Journal": refs['journal'].tolist(),
        "DOI:": [doi_org + idoi for idoi in refs['doi']],
    })

    return df2.style.format({'DOI:': make_clickable})

# Each target

In [2]:
# Load the first field of every row of the target dictionary.
filename = 'target_dict.tsv'
ls_file = target_dict(filename)

# Keyterms for each receptor subtype. Matching is by substring, so the short
# keys ('D2', 'D3') also hit unrelated names that merely contain them.
keys_d2 = ['Dopamine 2', 'Dopamine2', 'D2']
keys_d3 = ['Dopamine 3', 'Dopamine3', 'D3']

# Lines matching the D2 keys and the D3 keys, respectively.
ls_d2 = get_tids(keys_d2, ls_file)
ls_d3 = get_tids(keys_d3, ls_file)

# Remaining dopamine entries: first drop every line that matched a D2 or D3
# key, then keep the ones that still mention dopamine.
keys_dop = [*keys_d2, *keys_d3]
ls_lines = skip_previous_tids(keys_dop)
keys_d23 = ['Dopamine', 'dopamine']
ls_d23 = get_tids(keys_d23, ls_lines)

In [4]:
# Display the lines selected with the D2 keyterms.
ls_d2

['(72,"SINGLE PROTEIN","Dopamine D2 receptor",9606,"Homo sapiens",CHEMBL217,0)',
 '(181,"SINGLE PROTEIN","B-lymphocyte antigen CD20",9606,"Homo sapiens",CHEMBL2058,0)',
 '(196,"SINGLE PROTEIN","T-cell surface antigen CD2",9606,"Homo sapiens",CHEMBL2040,0)',
 '(10307,"SINGLE PROTEIN","Cytochrome P450 2D2",10116,"Rattus norvegicus",CHEMBL2483,0)',
 '(11344,"SINGLE PROTEIN","Dopamine D2 receptor",9615,"Canis lupus familiaris",CHEMBL2703,0)',
 '(11426,"SINGLE PROTEIN","Dopamine D2 receptor",9913,"Bos taurus",CHEMBL3998,0)',
 '(11427,"SINGLE PROTEIN","Dopamine D2 receptor",10090,"Mus musculus",CHEMBL3427,0)',
 '(11695,"SINGLE PROTEIN",CD22,9606,"Homo sapiens",CHEMBL3218,0)',
 '(11966,"SINGLE PROTEIN","Phospholipase D2",9606,"Homo sapiens",CHEMBL2734,0)',
 '(14037,"SINGLE PROTEIN","Dopamine D2 receptor",10116,"Rattus norvegicus",CHEMBL339,0)',
 '(19654,"SINGLE PROTEIN","Prostaglandin D2 synthase",9913,"Bos taurus",CHEMBL4651,0)',
 '(80247,CELL-LINE,MDAY-D2,10090,"Mus musculus",CHEMBL614337,0

In [5]:
# Display the lines selected with the D3 keyterms.
ls_d3

['(81,"SINGLE PROTEIN","Myeloid cell surface antigen CD33",9606,"Homo sapiens",CHEMBL1842,0)',
 '(130,"SINGLE PROTEIN","Dopamine D3 receptor",9606,"Homo sapiens",CHEMBL234,0)',
 '(233,"SINGLE PROTEIN","T-cell surface glycoprotein CD3 epsilon chain",9606,"Homo sapiens",CHEMBL1975,0)',
 '(11429,"SINGLE PROTEIN","Dopamine D3 receptor",10090,"Mus musculus",CHEMBL3441,0)',
 '(11430,"SINGLE PROTEIN","Dopamine D3 receptor",10116,"Rattus norvegicus",CHEMBL3138,0)',
 '(12414,"SINGLE PROTEIN","Vitamin D3 receptor-interacting protein",9606,"Homo sapiens",CHEMBL4146,0)',
 '(12709,"SINGLE PROTEIN","G1/S-specific cyclin D3",9606,"Homo sapiens",CHEMBL2422,0)',
 '(13082,"SINGLE PROTEIN","Cytochrome P450 2D3",10116,"Rattus norvegicus",CHEMBL3057,0)',
 '(17127,"SINGLE PROTEIN","Vitamin D3 receptor",9825,"Sus scrofa domesticus",CHEMBL2648,0)',
 '(80091,CELL-LINE,CHO-hD3,10029,"Cricetulus griseus",CHEMBL614557,0)',
 '(80810,CELL-LINE,D34,,,CHEMBL614477,0)',
 '(80811,CELL-LINE,"D341 cell line",9606,"Homo s

Summarized information from above:

In [3]:
# If you want to look into specific tids, you can do so using this.
# Make sure the tid test files are located here: 
# "/home/sjwon3789/repositories/ai-DR/statistics/test_pgsql/test_tids"

# Instructions for generating the test file for each tid are in that folder.

# datadir = 'test_tids'
# df = tid_to_table(datadir, 72)
# df