In [1]:
import numpy as np
import pandas as pd
import datetime
import os, sys
import scipy.spatial.distance as dist

data downloaded 6-6-17

data source: http://geneontology.org/gene-associations/goa_human.gaf.gz

In [2]:
depth = 4 # the minimum depth down the ontology tree a term must have to be included 
min_genes = 5 # the minimum number of genes that must be present to be included in a gene library
min_terms = 5 # the minimum number of terms that must be present to be included in a attribute library
max_genes = 2000 # the maximum number of genes that are present to be included in a gene library
max_terms = 2000 # the maximum number of terms that are present to be included in a attribute library

### The GO annotation dataset

In [3]:
go_human = pd.read_csv('input/goa_human.gaf', sep='\t', skiprows=34, header=None, index_col=False)

In [4]:
go_human.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0004871,GO_REF:0000038,IEA,UniProtKB-KW:KW-0807,F,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,UniProt,,
1,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0005834,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898,C,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,InterPro,,
2,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0007186,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898,P,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,InterPro,,
3,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0002377,GO_REF:0000033,IBA,PANTHER:PTN000587099,P,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,,
4,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0005615,GO_REF:0000033,IBA,PANTHER:PTN000587099,C,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,,


In [5]:
go_human.shape

(417076, 17)

### The mapping of the GO ID's to the GO term with reference to ansestor terms

sorce: http://amigo.geneontology.org/amigo/search/ontology?q=*:*&fq=idspace:%22GO%22&fq=is_obsolete:%22false%22&sfq=document_category:%22ontology_class%22

In [6]:
go_terms_col = ['ID', 'TERM', 'ANCESTOR']

In [7]:
go_tems = pd.read_csv('input/GO_TERMS.tsv', sep='\t', header=None)

In [8]:
go_tems.columns = go_terms_col

In [9]:
go_tems.head()

Unnamed: 0,ID,TERM,ANCESTOR
0,GO:0102675,C4-methyltransferase activity,GO:0008150|GO:0003824|GO:0008152|GO:0102675|GO...
1,GO:0102674,C4-demethylase activity,GO:0008150|GO:0003824|GO:0008152|GO:0008168|GO...
2,GO:0102677,"campesterol,NADPH:oxygen oxidoreductase activity",GO:0008150|GO:0102677|GO:0016705|GO:0003824|GO...
3,GO:0102676,avenasterol-desaturase activity,GO:0008150|GO:0102676|GO:0003824|GO:0016705|GO...
4,GO:0102671,6a-hydroxymaackiain-3-O-methyltransferase acti...,GO:0008150|GO:0003824|GO:0008152|GO:0008168|GO...


In [10]:
go_tems.set_index('ID', inplace=True)

### The mapping of gene ID's and symbol

In [11]:
gene_mapping = pd.read_csv('input/gene_id_mapping.txt', sep='\t', index_col=2)

In [12]:
gene_mapping.head()

Unnamed: 0_level_0,Approved Symbol,Entrez Gene ID
UniProt ID(supplied by UniProt),Unnamed: 1_level_1,Unnamed: 2_level_1
P04217,A1BG,1.0
,A1BG-AS1,503538.0
Q9NQ94,A1CF,29974.0
P01023,A2M,2.0
,A2M-AS1,144571.0


In [13]:
gene_mapping.dropna(how='any', inplace=True)

### Biological Process (P)

In [14]:
bp_go_human = go_human[go_human[8]=='P'].copy()

In [15]:
bp_go_human.shape

(140959, 17)

filter data by eveidance type (currently not in use)

In [16]:
bp_go_human = bp_go_human[~bp_go_human[6].isin(['IEA'])]

In [17]:
bp_go_human.shape

(95323, 17)

In [18]:
#bp_go_human.head()

In [19]:
bp_go_human = bp_go_human[[1,2,4]]

In [20]:
bp_go_human.head()

Unnamed: 0,1,2,4
3,A0A075B6H7,IGKV3-7,GO:0002377
5,A0A075B6H7,IGKV3-7,GO:0006955
6,A0A075B6H8,IGKV1D-42,GO:0002377
8,A0A075B6H8,IGKV1D-42,GO:0006955
10,A0A075B6H9,IGLV4-69,GO:0002377


map Uniprot to id gene symbol 

In [21]:
lst = []

for i,index in enumerate(bp_go_human.index):
    
    progressPercent = ((i+1)/len(bp_go_human.index))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(bp_go_human.index)))
    sys.stdout.flush()
    
    geneID = bp_go_human.ix[index, 1]
    if geneID in gene_mapping.index:
        if type(gene_mapping.ix[geneID, 'Approved Symbol']) == str:
            lst.append(gene_mapping.ix[geneID, 'Approved Symbol'])
        else:
            lst.append(bp_go_human.ix[index, 2])
    else:
        lst.append(np.nan)

bp_go_human[1] = lst
bp_go_human.drop(2, axis=1, inplace=True)

Progress: 100%  95323 Out of 95323   

Drop genes that dont map to NCBI entraz ids

In [22]:
bp_go_human.shape

(95323, 2)

In [23]:
bp_go_human.dropna(how='any', inplace=True)

In [24]:
bp_go_human.shape

(94102, 2)

map GO term to id for lower level (i.e. detailed) terms (level defined by depth variable)

In [25]:
lst = []

for index in bp_go_human.index:
    termID = bp_go_human.ix[index, 4]
    ansestors = go_tems.ix[termID, 'ANCESTOR'].split('|')
    if len(ansestors) > depth:
        lst.append(go_tems.ix[termID, 'TERM'])
    else:
        lst.append(np.nan)

bp_go_human[5] = lst

In [26]:
bp_go_human.head()

Unnamed: 0,1,4,5
10,IGLV4-69,GO:0002377,
14,IGLV4-69,GO:0006955,
16,IGLV8-61,GO:0002377,
20,IGLV8-61,GO:0006955,
22,IGLV4-60,GO:0002377,


Drop High Level (too general description as defined by depth)

In [27]:
bp_go_human.shape

(94102, 3)

In [28]:
bp_go_human.dropna(inplace=True)

In [29]:
bp_go_human.shape

(90995, 3)

In [30]:
bp_go_human.head()

Unnamed: 0,1,4,5
145,IGKV2-28,GO:0006898,receptor-mediated endocytosis
147,IGKV2-28,GO:0006956,complement activation
148,IGKV2-28,GO:0006958,"complement activation, classical pathway"
149,IGKV2-28,GO:0030449,regulation of complement activation
150,IGKV2-28,GO:0038095,Fc-epsilon receptor signaling pathway


create gene set library per go term

In [31]:
filenameTSV = 'output/go_biological_process_human_gene_set_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
filenameGMT = 'output/go_biological_process_human_gene_set_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')

if os.path.isfile(filenameTSV):
    os.remove(filenameTSV)
if os.path.isfile(filenameGMT):
    os.remove(filenameGMT)

for i,term in enumerate(bp_go_human[5].unique()):
    
    progressPercent = ((i+1)/len(bp_go_human[5].unique()))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(bp_go_human[5].unique())))
    sys.stdout.flush()
    
    ID = bp_go_human[bp_go_human[5]==term][4].tolist()[0]
    lst = bp_go_human[bp_go_human[5]==term][1].tolist()
    lst = list(set(lst))
    if len(lst) >= min_genes and len(lst) <= max_genes:
        lst.insert(0,term)
        lst.insert(1, '('+ID+')')
        lst = ['{0}\t'.format(elem) for elem in lst] # add tabs between terms in the lst
        lst.insert(len(lst), '\n') # add a newline char at the end of each lst
        
        with open(filenameTSV, 'a') as the_file:
            the_file.writelines(lst)

        with open(filenameGMT, 'a') as the_file:
            the_file.writelines(lst)

Progress: 100%  9458 Out of 9458   

create go term set library per gene

In [32]:
filenameTSV = 'output/go_biological_process_human_attibute _set_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
filenameGMT = 'output/go_biological_process_human_attribute_set_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')

if os.path.isfile(filenameTSV):
    os.remove(filenameTSV)
if os.path.isfile(filenameGMT):
    os.remove(filenameGMT)

for i,gene in enumerate(bp_go_human[1].unique()):
    
    progressPercent = ((i+1)/len(bp_go_human[1].unique()))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(bp_go_human[1].unique())))
    sys.stdout.flush()
    
    lst = bp_go_human[bp_go_human[1]==gene][5].tolist()
    lst = list(set(lst))
    if len(lst) >= min_terms and len(lst) <= max_terms:
        lst.insert(0, gene)
        lst.insert(1, 'NA')
        lst = ['{0}\t'.format(elem) for elem in lst] # add tabs between terms in the lst
        lst.insert(len(lst), '\n') # add a newline char at the end of each lst
        
        with open(filenameTSV, 'a') as the_file:
            the_file.writelines(lst)

        with open(filenameGMT, 'a') as the_file:
            the_file.writelines(lst)

Progress: 100%  14082 Out of 14082   

create binary matrix

In [33]:
genes = bp_go_human[1].unique()
attributes = bp_go_human[5].unique()

bp_df = pd.DataFrame(index=genes, columns=attributes, data=0)

for i, gene in enumerate(genes):
    
    progressPercent = ((i+1)/len(genes))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(genes)))
    sys.stdout.flush()
    
    col = bp_go_human[bp_go_human[1]==gene][5].tolist()
    bp_df.ix[gene, col] = 1
    
# filter out any row are column that doesnt have at least 'min' values
bp_df.replace(0, np.nan, inplace=True)
bp_df.dropna(thresh=min_terms, axis=0,inplace=True)
bp_df.dropna(thresh=min_genes, axis=1,inplace=True)
bp_df.replace(np.nan, 0, inplace=True)

# filter out any row are column that has more than 'max' values
bp_df.replace(1, np.nan, inplace=True)
bp_df.dropna(thresh=len(bp_df.columns)-max_terms, axis=0,inplace=True)
bp_df.dropna(thresh=len(bp_df.index)-max_genes, axis=1,inplace=True)
bp_df.replace(np.nan, 1, inplace=True)
    
filename = 'output/go_biological_process_binary_matrix__%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
bp_df.to_csv(filename, sep='\t')

Progress: 100%  14082 Out of 14082   

In [34]:
# Check the Jaccard distance between the genes
bp_gene_similarity_matrix = dist.pdist(bp_df, 'jaccard')
bp_gene_similarity_matrix = dist.squareform(bp_gene_similarity_matrix)
bp_gene_similarity_df = pd.DataFrame(data=bp_gene_similarity_matrix[0:,0:], index=bp_df.index, columns=bp_df.index)
bp_gene_similarity_df = bp_gene_similarity_df.applymap(lambda x: 1-x)

filename = 'output/go_biological_process_gene_similarity_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
bp_gene_similarity_df.to_csv(filename, sep='\t')

In [35]:
# Check the cosine distance between the samples (attributes)
bp_attribute_similarity_matrix = dist.pdist(bp_df.T, 'jaccard')
bp_attribute_similarity_matrix = dist.squareform(bp_attribute_similarity_matrix)
bp_attribute_similarity_df = pd.DataFrame(data=bp_attribute_similarity_matrix[0:,0:], index=bp_df.columns, columns=bp_df.columns)
bp_attribute_similarity_df = bp_attribute_similarity_df.applymap(lambda x: 1-x)

filename = 'output/go_biological_process_attribute_similarity_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
bp_attribute_similarity_df.to_csv(filename, sep='\t')

create gene list

In [36]:
gene_list = pd.DataFrame(columns=['GeneSym', 'UniprotACC', 'GeneID'])

gene_list['GeneSym'] = bp_df.index

gene_mapping.reset_index(inplace=True)
gene_mapping.set_index('Approved Symbol', inplace=True)
for index in gene_list.index:
    symbol = gene_list.ix[index, 'GeneSym']
    gene_list.ix[index, 'GeneID'] = int(gene_mapping.ix[symbol, 'Entrez Gene ID'])
    gene_list.ix[index, 'UniprotACC'] = gene_mapping.ix[symbol, 'UniProt ID(supplied by UniProt)']
gene_mapping.reset_index(inplace=True)
gene_mapping.set_index('UniProt ID(supplied by UniProt)', inplace=True)

filename = 'output/go_biological_process_gene_list_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
gene_list.to_csv(filename, index=False,sep='\t')

In [37]:
gene_list.head()

Unnamed: 0,GeneSym,UniprotACC,GeneID
0,IGKV2-28,A0A075B6P5,28921
1,IGHV3-64,A0A075B6Q5,28414
2,IGHV4-4,A0A075B6R2,28401
3,IGKV2D-30,A0A075B6S6,28881
4,IGHV4-30-2,A0A087WSY4,28398


create attribute list

In [38]:
attribute_list = pd.DataFrame(columns=['GO Phrase', 'GO', 'NA'])

attribute_list['GO Phrase'] = bp_df.columns

for index in attribute_list.index:
    term = attribute_list.ix[index, 'GO Phrase']
    attribute_list['GO'] = bp_go_human[bp_go_human[5]==term][4].tolist()[0]

attribute_list['NA'] = 'na'

filename = 'output/go_biological_process_attribute_list_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, index=False,sep='\t')

In [39]:
attribute_list.head()

Unnamed: 0,GO Phrase,GO,NA
0,receptor-mediated endocytosis,GO:0016573,na
1,complement activation,GO:0016573,na
2,"complement activation, classical pathway",GO:0016573,na
3,regulation of complement activation,GO:0016573,na
4,Fc-epsilon receptor signaling pathway,GO:0016573,na


create gene attribute edges

In [40]:
bp_gene_attribute_edges = pd.DataFrame(columns=['source', 'source_desc', 'source_id', 'target', 'target_desc', 'target_id', 'weight'])

bp_gene_attribute_edges['source'] = gene_list['GeneSym']
bp_gene_attribute_edges['source_desc'] = gene_list['UniprotACC']
bp_gene_attribute_edges['source_id'] = gene_list['GeneID']
bp_gene_attribute_edges['target'] = attribute_list['GO Phrase']
bp_gene_attribute_edges['target_desc'] = attribute_list['GO']
bp_gene_attribute_edges['target_id'] = attribute_list['NA']
bp_gene_attribute_edges['weight'] = 1.0

col = ['GeneSym', 'UniProtACC', 'GeneID', 'GO Phrase', 'GO', 'NA', 'weight']
line = pd.DataFrame(data=col).T
line.columns = ['source', 'source_desc', 'source_id', 'target', 'target_desc', 'target_id', 'weight']
bp_gene_attribute_edges = pd.concat([line, bp_gene_attribute_edges]).reset_index(drop=True)

filename = 'output/go_biological_process_gene_attribute_edges_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
bp_gene_attribute_edges.to_csv(filename, index=False,sep='\t')

In [41]:
bp_gene_attribute_edges.head()

Unnamed: 0,source,source_desc,source_id,target,target_desc,target_id,weight
0,GeneSym,UniProtACC,GeneID,GO Phrase,GO,,weight
1,IGKV2-28,A0A075B6P5,28921,receptor-mediated endocytosis,GO:0016573,na,1
2,IGHV3-64,A0A075B6Q5,28414,complement activation,GO:0016573,na,1
3,IGHV4-4,A0A075B6R2,28401,"complement activation, classical pathway",GO:0016573,na,1
4,IGKV2D-30,A0A075B6S6,28881,regulation of complement activation,GO:0016573,na,1


### Cellular Component (C)

In [42]:
cc_go_human = go_human[go_human[8]=='C'].copy()

In [43]:
cc_go_human.shape

(140382, 17)

filter data by eveidance type (currently not in use)

In [44]:
cc_go_human = cc_go_human[~cc_go_human[6].isin(['IEA'])]

In [45]:
cc_go_human.shape

(121001, 17)

In [46]:
cc_go_human = cc_go_human[[1,2,4]]

In [47]:
cc_go_human.head()

Unnamed: 0,1,2,4
4,A0A075B6H7,IGKV3-7,GO:0005615
7,A0A075B6H8,IGKV1D-42,GO:0005615
12,A0A075B6H9,IGLV4-69,GO:0005615
18,A0A075B6I0,IGLV8-61,GO:0005615
24,A0A075B6I1,IGLV4-60,GO:0005615


map Uniprot to id gene symbol 

In [48]:
lst = []

for i,index in enumerate(cc_go_human.index):
    
    progressPercent = ((i+1)/len(cc_go_human.index))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(cc_go_human.index)))
    sys.stdout.flush()
    
    geneID = cc_go_human.ix[index, 1]
    if geneID in gene_mapping.index:
        if type(gene_mapping.ix[geneID, 'Approved Symbol']) == str:
            lst.append(gene_mapping.ix[geneID, 'Approved Symbol'])
        else:
            lst.append(cc_go_human.ix[index, 2])
    else:
        lst.append(np.nan)

cc_go_human[1] = lst
cc_go_human.drop(2, axis=1, inplace=True)

Progress: 100%  121001 Out of 121001   

In [49]:
cc_go_human.head()

Unnamed: 0,1,4
4,,GO:0005615
7,,GO:0005615
12,IGLV4-69,GO:0005615
18,IGLV8-61,GO:0005615
24,IGLV4-60,GO:0005615


Drop genes that dont map to NCBI entraz ids

In [50]:
cc_go_human.shape

(121001, 2)

In [51]:
cc_go_human.dropna(inplace=True)

In [52]:
cc_go_human.shape

(119695, 2)

map GO term to id for lower level (i.e. detailed) terms (level defined by depth variable)

In [53]:
lst = []

for index in cc_go_human.index:
    termID = cc_go_human.ix[index, 4]
    ansestors = go_tems.ix[termID, 'ANCESTOR'].split('|')
    if len(ansestors) > depth:
        lst.append(go_tems.ix[termID, 'TERM'])
    else:
        lst.append(np.nan)

cc_go_human[5] = lst

In [54]:
cc_go_human.head()

Unnamed: 0,1,4,5
12,IGLV4-69,GO:0005615,
18,IGLV8-61,GO:0005615,
24,IGLV4-60,GO:0005615,
33,IGLV10-54,GO:0005615,
45,IGLV7-46,GO:0005615,


Drop High Level (too general description as defined by depth)

In [55]:
cc_go_human.shape

(119695, 3)

In [56]:
cc_go_human.dropna(inplace=True)

In [57]:
cc_go_human.shape

(104082, 3)

create gene set library per go term

In [58]:
filenameTSV = 'output/go_cellular_component_human_gene_set_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
filenameGMT = 'output/go_cellular_component_human_gene_set_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')

if os.path.isfile(filenameTSV):
    os.remove(filenameTSV)
if os.path.isfile(filenameGMT):
    os.remove(filenameGMT)

for i,term in enumerate(cc_go_human[5].unique()):
    
    progressPercent = ((i+1)/len(cc_go_human[5].unique()))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(cc_go_human[5].unique())))
    sys.stdout.flush()
    
    ID = cc_go_human[cc_go_human[5]==term][4].tolist()[0]
    lst = cc_go_human[cc_go_human[5]==term][1].tolist()
    lst = list(set(lst))
    if len(lst) >= min_genes and len(lst) <= max_genes:
        lst.insert(0,term)
        lst.insert(1, '('+ID+')')
        lst = ['{0}\t'.format(elem) for elem in lst] # add tabs between terms in the lst
        lst.insert(len(lst), '\n') # add a newline char at the end of each lst
        
        with open(filenameTSV, 'a') as the_file:
            the_file.writelines(lst)

        with open(filenameGMT, 'a') as the_file:
            the_file.writelines(lst)

Progress: 100%  1340 Out of 1340   

create go term set library per gene

In [59]:
filenameTSV = 'output/go_cellular_component_human_attibute _set_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
filenameGMT = 'output/go_cellular_component_human_attribute_set_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')

if os.path.isfile(filenameTSV):
    os.remove(filenameTSV)
if os.path.isfile(filenameGMT):
    os.remove(filenameGMT)

for i,gene in enumerate(cc_go_human[1].unique()):
    
    progressPercent = ((i+1)/len(cc_go_human[1].unique()))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(cc_go_human[1].unique())))
    sys.stdout.flush()
    
    lst = cc_go_human[cc_go_human[1]==gene][5].tolist()
    lst = list(set(lst))
    if len(lst) >= min_terms and len(lst) <= max_terms:
        lst.insert(0, gene)
        lst.insert(1, 'NA')
        lst = ['{0}\t'.format(elem) for elem in lst] # add tabs between terms in the lst
        lst.insert(len(lst), '\n') # add a newline char at the end of each lst
        
        with open(filenameTSV, 'a') as the_file:
            the_file.writelines(lst)

        with open(filenameGMT, 'a') as the_file:
            the_file.writelines(lst)

Progress: 100%  14373 Out of 14373   

create binary matrix

In [60]:
genes = cc_go_human[1].unique()
attributes = cc_go_human[5].unique()

cc_df = pd.DataFrame(index=genes, columns=attributes, data=0)

for i, gene in enumerate(genes):
    
    progressPercent = ((i+1)/len(genes))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(genes)))
    sys.stdout.flush()
    
    col = cc_go_human[cc_go_human[1]==gene][5].tolist()
    cc_df.ix[gene, col] = 1
    
# filter out any row are column that doesnt have at least 'min' values
cc_df.replace(0, np.nan, inplace=True)
cc_df.dropna(thresh=min_terms, axis=0,inplace=True)
cc_df.dropna(thresh=min_genes, axis=1,inplace=True)
cc_df.replace(np.nan, 0, inplace=True)

# filter out any row are column that has more than 'max' values
cc_df.replace(1, np.nan, inplace=True)
cc_df.dropna(thresh=len(cc_df.columns)-max_terms, axis=0,inplace=True)
cc_df.dropna(thresh=len(cc_df.index)-max_genes, axis=1,inplace=True)
cc_df.replace(np.nan, 1, inplace=True)
    
filename = 'output/go_cellular_component_binary_matrix_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
cc_df.to_csv(filename, sep='\t')

Progress: 100%  14373 Out of 14373   

In [61]:
# Check the Jaccard distance between the genes
cc_gene_similarity_matrix = dist.pdist(cc_df, 'jaccard')
cc_gene_similarity_matrix = dist.squareform(cc_gene_similarity_matrix)
cc_gene_similarity_df = pd.DataFrame(data=cc_gene_similarity_matrix[0:,0:], index=cc_df.index, columns=cc_df.index)
cc_gene_similarity_df = cc_gene_similarity_df.applymap(lambda x: 1-x)

filename = 'output/go_cellular_component_gene_similarity_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
cc_gene_similarity_df.to_csv(filename, sep='\t')

In [62]:
# Check the cosine distance between the samples (attributes)
cc_attribute_similarity_matrix = dist.pdist(cc_df.T, 'jaccard')
cc_attribute_similarity_matrix = dist.squareform(cc_attribute_similarity_matrix)
cc_attribute_similarity_df = pd.DataFrame(data=cc_attribute_similarity_matrix[0:,0:], index=cc_df.columns, columns=cc_df.columns)
cc_attribute_similarity_df = cc_attribute_similarity_df.applymap(lambda x: 1-x)

filename = 'output/go_cellular_component_attribute_similarity_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
cc_attribute_similarity_df.to_csv(filename, sep='\t')

create gene list

In [63]:
gene_list = pd.DataFrame(columns=['GeneSym', 'UniprotACC', 'GeneID'])

gene_list['GeneSym'] = cc_df.index

gene_mapping.reset_index(inplace=True)
gene_mapping.set_index('Approved Symbol', inplace=True)
for index in gene_list.index:
    symbol = gene_list.ix[index, 'GeneSym']
    gene_list.ix[index, 'GeneID'] = int(gene_mapping.ix[symbol, 'Entrez Gene ID'])
    gene_list.ix[index, 'UniprotACC'] = gene_mapping.ix[symbol, 'UniProt ID(supplied by UniProt)']
gene_mapping.reset_index(inplace=True)
gene_mapping.set_index('UniProt ID(supplied by UniProt)', inplace=True)

filename = 'output/go_cellular_component_gene_list_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
gene_list.to_csv(filename, index=False,sep='\t')

create attribute list

In [64]:
attribute_list = pd.DataFrame(columns=['GO Phrase', 'GO', 'NA'])

attribute_list['GO Phrase'] = cc_df.columns

for index in attribute_list.index:
    term = attribute_list.ix[index, 'GO Phrase']
    attribute_list['GO'] = cc_go_human[cc_go_human[5]==term][4].tolist()[0]

attribute_list['NA'] = 'na'

filename = 'output/go_cellular_component_attribute_list_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, index=False,sep='\t')

create gene attribute edges

In [65]:
cc_gene_attribute_edges = pd.DataFrame(columns=['source', 'source_desc', 'source_id', 'target', 'target_desc', 'target_id', 'weight'])

cc_gene_attribute_edges['source'] = gene_list['GeneSym']
cc_gene_attribute_edges['source_desc'] = gene_list['UniprotACC']
cc_gene_attribute_edges['source_id'] = gene_list['GeneID']
cc_gene_attribute_edges['target'] = attribute_list['GO Phrase']
cc_gene_attribute_edges['target_desc'] = attribute_list['GO']
cc_gene_attribute_edges['target_id'] = attribute_list['NA']
cc_gene_attribute_edges['weight'] = 1.0

col = ['GeneSym', 'UniProtACC', 'GeneID', 'GO Phrase', 'GO', 'NA', 'weight']
line = pd.DataFrame(data=col).T
line.columns = ['source', 'source_desc', 'source_id', 'target', 'target_desc', 'target_id', 'weight']
cc_gene_attribute_edges = pd.concat([line, cc_gene_attribute_edges]).reset_index(drop=True)

filename = 'output/go_cellular_component_gene_attribute_edges_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
cc_gene_attribute_edges.to_csv(filename, index=False,sep='\t')

### Molecular Function (F)

In [66]:
mf_go_human = go_human[go_human[8]=='F'].copy()

In [67]:
mf_go_human.shape

(135735, 17)

filter data by eveidance type (currently not in use)

In [68]:
mf_go_human = mf_go_human[~mf_go_human[6].isin(['IEA'])]

In [69]:
mf_go_human.shape

(118767, 17)

In [70]:
mf_go_human = mf_go_human[[1,2,4]]

In [71]:
mf_go_human.head()

Unnamed: 0,1,2,4
98,A0A075B6P5,IGKV2-28,GO:0004252
99,A0A075B6P5,IGKV2-28,GO:0004252
160,A0A075B6Q5,IGHV3-64,GO:0003823
165,A0A075B6Q5,IGHV3-64,GO:0034987
172,A0A075B6R2,IGHV4-4,GO:0003823


map Uniprot to id gene symbol 

In [72]:
 lst = []

for i,index in enumerate(mf_go_human.index):
    
    progressPercent = ((i+1)/len(mf_go_human.index))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(mf_go_human.index)))
    sys.stdout.flush()
    
    geneID = mf_go_human.ix[index, 1]
    if geneID in gene_mapping.index:
        if type(gene_mapping.ix[geneID, 'Approved Symbol']) == str:
            lst.append(gene_mapping.ix[geneID, 'Approved Symbol'])
        else:
            lst.append(mf_go_human.ix[index, 2])
    else:
        lst.append(np.nan)

mf_go_human[1] = lst
mf_go_human.drop(2, axis=1, inplace=True)

Progress: 100%  118767 Out of 118767   

Drop genes that dont map to NCBI entraz ids

In [73]:
mf_go_human.shape

(118767, 2)

In [74]:
mf_go_human.dropna(inplace=True)

In [75]:
mf_go_human.shape

(117515, 2)

map GO term to id for lower level (i.e. detailed) terms (level defined by depth variable)

In [76]:
lst = []

for index in mf_go_human.index:
    termID = mf_go_human.ix[index, 4]
    ansestors = go_tems.ix[termID, 'ANCESTOR'].split('|')
    if len(ansestors) > depth:
        lst.append(go_tems.ix[termID, 'TERM'])
    else:
        lst.append(np.nan)

mf_go_human[5] = lst

Drop High Level (too general description as defined by depth)

In [77]:
mf_go_human.shape

(117515, 3)

In [78]:
mf_go_human.dropna(inplace=True)

In [79]:
mf_go_human.shape

(38482, 3)

In [80]:
mf_go_human.head()

Unnamed: 0,1,4,5
98,IGKV2-28,GO:0004252,serine-type endopeptidase activity
99,IGKV2-28,GO:0004252,serine-type endopeptidase activity
165,IGHV3-64,GO:0034987,immunoglobulin receptor binding
177,IGHV4-4,GO:0034987,immunoglobulin receptor binding
207,IGKV2D-30,GO:0004252,serine-type endopeptidase activity


create gene set library per go term

In [81]:
filenameTSV = 'output/go_molecular_function_gene_set_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
filenameGMT = 'output/go_molecular_function_gene_set_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')

if os.path.isfile(filenameTSV):
    os.remove(filenameTSV)
if os.path.isfile(filenameGMT):
    os.remove(filenameGMT)

for i,term in enumerate(mf_go_human[5].unique()):
    
    progressPercent = ((i+1)/len(mf_go_human[5].unique()))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(mf_go_human[5].unique())))
    sys.stdout.flush()
    
    ID = mf_go_human[mf_go_human[5]==term][4].tolist()[0]
    lst = mf_go_human[mf_go_human[5]==term][1].tolist()
    lst = list(set(lst))
    if len(lst) >= min_genes and len(lst) <= max_genes:
        lst.insert(0,term)
        lst.insert(1, '('+ID+')')
        lst = ['{0}\t'.format(elem) for elem in lst] # add tabs between terms in the lst
        lst.insert(len(lst), '\n') # add a newline char at the end of each lst
        
        with open(filenameTSV, 'a') as the_file:
            the_file.writelines(lst)

        with open(filenameGMT, 'a') as the_file:
            the_file.writelines(lst)

Progress: 100%  3370 Out of 3370   

create go term set library per gene

In [82]:
filenameTSV = 'output/go_molecular_function_human_attibute _set_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
filenameGMT = 'output/go_molecular_function_human_attribute_set_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')

if os.path.isfile(filenameTSV):
    os.remove(filenameTSV)
if os.path.isfile(filenameGMT):
    os.remove(filenameGMT)

for i,gene in enumerate(mf_go_human[1].unique()):
    
    progressPercent = ((i+1)/len(mf_go_human[1].unique()))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(mf_go_human[1].unique())))
    sys.stdout.flush()
    
    lst = mf_go_human[mf_go_human[1]==gene][5].tolist()
    lst = list(set(lst))
    if len(lst) >= min_terms and len(lst) <= max_terms:
        lst.insert(0, gene)
        lst.insert(1, 'NA')
        lst = ['{0}\t'.format(elem) for elem in lst] # add tabs between terms in the lst
        lst.insert(len(lst), '\n') # add a newline char at the end of each lst
        
        with open(filenameTSV, 'a') as the_file:
            the_file.writelines(lst)

        with open(filenameGMT, 'a') as the_file:
            the_file.writelines(lst)

Progress: 100%  11577 Out of 11577   

create binary matrix

In [83]:
genes = mf_go_human[1].unique()
attributes = mf_go_human[5].unique()

mf_df = pd.DataFrame(index=genes, columns=attributes, data=0)

for i, gene in enumerate(genes):
    
    progressPercent = ((i+1)/len(genes))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(genes)))
    sys.stdout.flush()
    
    col = mf_go_human[mf_go_human[1]==gene][5].tolist()
    mf_df.ix[gene, col] = 1

# filter out any row are column that doesnt have at least 'min' values
mf_df.replace(0, np.nan, inplace=True)
mf_df.dropna(thresh=min_terms, axis=0,inplace=True)
mf_df.dropna(thresh=min_genes, axis=1,inplace=True)
mf_df.replace(np.nan, 0, inplace=True)

# filter out any row are column that has more than 'max' values
mf_df.replace(1, np.nan, inplace=True)
mf_df.dropna(thresh=len(mf_df.columns)-max_terms, axis=0,inplace=True)
mf_df.dropna(thresh=len(mf_df.index)-max_genes, axis=1,inplace=True)
mf_df.replace(np.nan, 1, inplace=True)
    
filename = 'output/go_molecular_function_binary_matrix_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
mf_df.to_csv(filename, sep='\t')

Progress: 100%  11577 Out of 11577   

In [84]:
# Check the Jaccard distance between the genes
mf_gene_similarity_matrix = dist.pdist(mf_df, 'jaccard')
mf_gene_similarity_matrix = dist.squareform(mf_gene_similarity_matrix)
mf_gene_similarity_df = pd.DataFrame(data=mf_gene_similarity_matrix[0:,0:], index=mf_df.index, columns=mf_df.index)
mf_gene_similarity_df = mf_gene_similarity_df.applymap(lambda x: 1-x)

filename = 'output/go_molecular_function_gene_similarity_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
mf_gene_similarity_df.to_csv(filename, sep='\t')

In [85]:
# Check the cosine distance between the samples (attributes)
mf_attribute_similarity_matrix = dist.pdist(mf_df.T, 'jaccard')
mf_attribute_similarity_matrix = dist.squareform(mf_attribute_similarity_matrix)
mf_attribute_similarity_df = pd.DataFrame(data=mf_attribute_similarity_matrix[0:,0:], index=mf_df.columns, columns=mf_df.columns)
mf_attribute_similarity_df = mf_attribute_similarity_df.applymap(lambda x: 1-x)

filename = 'output/go_molecular_function_attribute_similarity_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
mf_attribute_similarity_df.to_csv(filename, sep='\t')

create gene list

In [86]:
gene_list = pd.DataFrame(columns=['GeneSym', 'UniprotACC', 'GeneID'])

gene_list['GeneSym'] = mf_df.index

gene_mapping.reset_index(inplace=True)
gene_mapping.set_index('Approved Symbol', inplace=True)
for index in gene_list.index:
    symbol = gene_list.ix[index, 'GeneSym']
    gene_list.ix[index, 'GeneID'] = int(gene_mapping.ix[symbol, 'Entrez Gene ID'])
    gene_list.ix[index, 'UniprotACC'] = gene_mapping.ix[symbol, 'UniProt ID(supplied by UniProt)']
gene_mapping.reset_index(inplace=True)
gene_mapping.set_index('UniProt ID(supplied by UniProt)', inplace=True)

filename = 'output/go_molecular_function_gene_list_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
gene_list.to_csv(filename, index=False,sep='\t')

create attribute list

In [87]:
attribute_list = pd.DataFrame(columns=['GO Phrase', 'GO', 'NA'])

attribute_list['GO Phrase'] = mf_df.columns

for index in attribute_list.index:
    term = attribute_list.ix[index, 'GO Phrase']
    attribute_list['GO'] = mf_go_human[mf_go_human[5]==term][4].tolist()[0]

attribute_list['NA'] = 'na'

filename = 'output/go_molecular_function_attribute_list_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, index=False,sep='\t')

create gene attribute edges

In [88]:
mf_gene_attribute_edges = pd.DataFrame(columns=['source', 'source_desc', 'source_id', 'target', 'target_desc', 'target_id', 'weight'])

mf_gene_attribute_edges['source'] = gene_list['GeneSym']
mf_gene_attribute_edges['source_desc'] = gene_list['UniprotACC']
mf_gene_attribute_edges['source_id'] = gene_list['GeneID']
mf_gene_attribute_edges['target'] = attribute_list['GO Phrase']
mf_gene_attribute_edges['target_desc'] = attribute_list['GO']
mf_gene_attribute_edges['target_id'] = attribute_list['NA']
mf_gene_attribute_edges['weight'] = 1.0

col = ['GeneSym', 'UniProtACC', 'GeneID', 'GO Phrase', 'GO', 'NA', 'weight']
line = pd.DataFrame(data=col).T
line.columns = ['source', 'source_desc', 'source_id', 'target', 'target_desc', 'target_id', 'weight']
mf_gene_attribute_edges = pd.concat([line, mf_gene_attribute_edges]).reset_index(drop=True)

filename = 'output/go_molecular_function_gene_attribute_edges_%s.tsv'% str(datetime.date.today())[0:7].replace('-', '_')
mf_gene_attribute_edges.to_csv(filename, index=False,sep='\t')