In [1]:
import os
import pandas as pd
import numpy as np
from analytics_core.analytics import analytics
from graphdb_connector import connector

  **kwargs

R object inheriting from "POSIXct" but without attribute "tzone".


R object inheriting from "POSIXct" but without attribute "tzone".



In [2]:
def run_enrichment(go_annotation_df, foreground_list, background_list, foreground_pop, background_pop):
    '''
    Label every annotation row as 'foreground' or 'background' and run the enrichment test on it.

    An identifier present in both lists is labelled 'foreground' (foreground takes precedence).
    Rows whose identifier is in neither list are dropped before the test. The input dataframe is not modified.

    param go_annotation_df: pandas dataframe with annotations for features (columns: 'annotation', 'identifier' (feature identifiers), and 'source').
    param foreground_list: list of proteins of interest (e.g. regulated);
    param background_list: list of proteins to calculate the enrichment against;
    param foreground_pop: integer size(len) of the foreground list;
    param background_pop: integer size(len) of the background list;
    return: the value returned by analytics.run_enrichment, called with method='fisher' and correction='fdr_bh'.
    '''
    # Sets give constant-time membership; scanning the lists once per row is quadratic.
    foreground_ids = set(foreground_list)
    background_ids = set(background_list)

    identifiers = go_annotation_df['identifier']
    # Plain numpy masks keep the selection positional, so a non-unique index cannot misalign rows.
    in_foreground = identifiers.isin(foreground_ids).values
    in_background = identifiers.isin(background_ids).values
    keep = in_foreground | in_background

    annotation_df = go_annotation_df[keep].copy()
    # Foreground wins when an identifier appears in both lists.
    annotation_df['group'] = np.where(in_foreground[keep], 'foreground', 'background')

    result = analytics.run_enrichment(
        annotation_df,
        foreground_id='foreground',
        background_id='background',
        foreground_pop=foreground_pop,
        background_pop=background_pop,
        annotation_col='annotation',
        group_col='group',
        identifier_col='identifier',
        method='fisher',
        correction='fdr_bh',
    )
    return result

### Import proteomics data

### Load significantly regulated proteomics data given a mutated drug state in the VAE

In [3]:
# Folder with the regulated-protein lists (the notebook output suggests one '.txt' file per drug).
DATAFOLDER = 'projects/xxx/drug_assosiations_proteomics'
# endswith() rejects names that merely contain '.txt' (e.g. 'x.txt.bak');
# sorted() makes the order reproducible, because os.listdir returns entries in arbitrary order.
files = sorted(file for file in os.listdir(DATAFOLDER) if file.endswith('.txt'))

In [4]:
# Build, in one pass over the listed files: the drug name taken from each file name,
# the full path, and the single-column table of gene names read from it.
file_names = []
file_paths = []
files_data = []
for txt_name in files:
    # Drug name = second '_'-separated token of the part before the first '.'.
    stem = txt_name.split('.')[0]
    file_names.append(stem.split('_')[1])

    txt_path = os.path.join(DATAFOLDER, txt_name)
    file_paths.append(txt_path)

    gene_table = pd.read_csv(txt_path, sep='\t', header=None)
    gene_table.columns = ['Gene name']
    files_data.append(gene_table)

# Drug name -> table of regulated gene names.
files_dict = dict(zip(file_names, files_data))
file_names

['Acetylsalicylic',
 'Amlodipine',
 'Atenolol',
 'Atorvastatin',
 'Bendroflumethiazide',
 'Bisoprolol',
 'Codeine',
 'Enalapril',
 'Hydrochlorothiazide',
 'Lansoprazole',
 'Lisinopril',
 'Losartan',
 'Metformin',
 'Metoprolol',
 'Omeprazole',
 'Paracetamol',
 'Ramipril',
 'Salbutamol',
 'Simvastatin']

In [5]:
# One (rows, columns) tuple per loaded table, in the same order as file_names.
for gene_table in files_data:
    print(gene_table.shape)

(39, 1)
(47, 1)
(9, 1)
(7, 1)
(38, 1)
(19, 1)
(17, 1)
(7, 1)
(23, 1)
(16, 1)
(2, 1)
(5, 1)
(47, 1)
(8, 1)
(46, 1)
(28, 1)
(14, 1)
(12, 1)
(23, 1)


### Load background proteomics data 
(all proteins selectively measured by a combination of various immunoassay-based platforms, [reference here](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7304567/))

In [6]:
# Read the header-less, tab-separated list of background gene names.
background_path = 'projects/xxx/proteomics_incl.txt'
background_proteomics = pd.read_csv(background_path, sep='\t', header=None)
background_proteomics.columns = ['Gene name']

# De-duplicated gene names, in order of first appearance.
background_gene_names = background_proteomics['Gene name']
BACKGROUND_PROTEINS = background_gene_names.unique().tolist()

### Query CKG database for protein annotations

### Before start:
- Have CKG running in Neo4j.
- Make sure the password you defined for the database in Neo4j is the same one defined in `CKG/src/graphdb_connector/connector_config.yml`. To set the password in Neo4j, go to the CKG database and choose Administration --> set password.

In [7]:
from graphdb_connector import query_utils, connector

##### We connect to CKG database using the default configuration

##### We load the existing database queries that we can use to extract knowledge from CKG

In [8]:
# Open a connection to the graph database using the connector's configuration.
driver = connector.getGraphDatabaseConnectionConfiguration()
# Load the catalogue of predefined knowledge queries.
queries = query_utils.read_knowledge_queries()
# Keep the queries that involve the listed node types.
# NOTE(review): the first result involves only Protein and Drug while others add more node types,
# so the filter presumably requires both nodes to be present - confirm against query_utils.
selected_queries = query_utils.find_queries_involving_nodes(queries=queries, nodes=["Protein", "Drug"], print_pretty=True)
selected_queries.head()

DEBUG:neo4j:[#0000]  C: <RESOLVE> 0.0.0.0:7687
DEBUG:neo4j:[#0000]  C: <OPEN> 0.0.0.0:7687
DEBUG:neo4j:[#E037]  C: <MAGIC> 0x6060B017
DEBUG:neo4j:[#E037]  C: <HANDSHAKE> 0x00000204 0x00000104 0x00000004 0x00000003
DEBUG:neo4j:[#E037]  S: <HANDSHAKE> 0x00000003
DEBUG:neo4j:[#E037]  C: HELLO {'user_agent': 'neo4j-python/4.2.0 Python/3.6.12-final-0 (darwin)', 'scheme': 'basic', 'principal': 'neo4j', 'credentials': '*******'}
DEBUG:neo4j:[#E037]  S: SUCCESS {'server': 'Neo4j/3.5.12', 'connection_id': 'bolt-26'}


Unnamed: 0_level_0,Name,Description,involved_nodes,involved_rels,query,example
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Drug,associated drugs in at least two of the protei...,get relationships to drugs. Limit the result t...,"Protein,Drug",ACTS_ON,MATCH (protein:Protein)-[r:ACTS_ON]-(drug:Drug...,
association_drug_intervention_proteins,drug intervention- protein association,Return associations between a list of proteins...,"Project,Protein,Clinical_variable,Drug","HAD_INTERVENTION,ACTS_ON",MATCH (project:Project)-[]-()-[:HAD_INTERVENTI...,project_id = 'P0000002'\nproteins = ['A1BG~P04...
association_drug_interaction_score,drug interaction association,Return the list of drugs associated to the lis...,"Protein,Drug",ACTS_ON,MATCH (protein:Protein)-[r]-(drug:Drug) WHERE ...,"proteins = ['A1BG~P04217','A2M~P01023','ACACB~..."


### If you only have gene names, query for p.name instead of (p.name+'~'+p.id)

In [9]:
# Cypher query templates, one per annotation type. Each returns at least the columns
# 'identifier' (protein name) and 'annotation' (name of the annotated entity).
# The token LIST is a placeholder that a later cell replaces with the list of protein names.

# Biological processes associated with the proteins.
annotation_query_bp = 'MATCH (p:Protein)-[r:ASSOCIATED_WITH]-(bp:Biological_process) WHERE (p.name) IN LIST RETURN DISTINCT (p.name) AS identifier, bp.name AS annotation'
# Molecular functions associated with the proteins.
annotation_query_mf = 'MATCH (p:Protein)-[r:ASSOCIATED_WITH]-(bp:Molecular_function) WHERE (p.name) IN LIST RETURN DISTINCT (p.name) AS identifier, bp.name AS annotation'
# Drugs acting on the proteins, restricted to relationships with score >= 0.2 (no DISTINCT here).
annotation_query_drug = 'MATCH (protein:Protein)-[r:ACTS_ON]-(drug:Drug) WHERE (protein.name) IN LIST AND toFloat(r.score)>=0.2 RETURN (protein.name) AS identifier, drug.name AS annotation, drug.id AS drug_id, drug.description AS Drug_desc, r.score AS weight, type(r) AS type, r.action AS action, r.source AS source\n'
# Pathways the proteins are annotated in, restricted to source "reactome" (compared in lower case).
annotation_query_pathway = 'MATCH (protein:Protein)-[rp:ANNOTATED_IN_PATHWAY]->(pathway:Pathway) WHERE (protein.name) IN LIST AND toLower(rp.source) IN ["reactome"] RETURN DISTINCT protein.name AS identifier,pathway.name AS annotation,rp.source AS source'
# Diseases associated with the proteins that lie 2 to 4 HAS_PARENT steps below "disease of metabolism".
annotation_query_disease = 'MATCH (p:Protein)-[r:ASSOCIATED_WITH]->(d:Disease)-[:HAS_PARENT*2..4]->(pd:Disease) WHERE (p.name) IN LIST AND pd.name="disease of metabolism" RETURN (p.name) AS identifier, d.name AS annotation, d.id AS disease_id, r.score AS score'
# Complexes the proteins are subunits of.
annotation_query_complexes= 'MATCH (p:Protein)-[r:IS_SUBUNIT_OF]->(c:Complex) WHERE (p.name) IN LIST RETURN (p.name) AS identifier, c.name AS annotation, r.score AS score'

### Replace "LIST" in the query with all the proteins (background + foreground):

In [10]:
# Python's list repr (e.g. ['A', 'B']) is spliced into each query in place of the LIST placeholder.
protein_list_literal = str(BACKGROUND_PROTEINS)

annotation_query_bp = annotation_query_bp.replace('LIST', protein_list_literal)
annotation_query_mf = annotation_query_mf.replace('LIST', protein_list_literal)
annotation_query_pathway = annotation_query_pathway.replace('LIST', protein_list_literal)
annotation_query_drug = annotation_query_drug.replace('LIST', protein_list_literal)
annotation_query_disease = annotation_query_disease.replace('LIST', protein_list_literal)
annotation_query_complexes = annotation_query_complexes.replace('LIST', protein_list_literal)

### And "send" the query to the database:

In [11]:
# A new connection is requested here even though an earlier cell already created `driver`;
# the debug log below shows the earlier connection (#E037) being closed afterwards.
driver = connector.getGraphDatabaseConnectionConfiguration()
# Run the filled-in queries; the results are used further down as pandas dataframes.
annotation_bp = connector.getCursorData(driver, annotation_query_bp)
annotation_mf = connector.getCursorData(driver, annotation_query_mf)
annotation_pathway = connector.getCursorData(driver, annotation_query_pathway)
annotation_complexes = connector.getCursorData(driver, annotation_query_complexes)

DEBUG:neo4j:[#0000]  C: <RESOLVE> 0.0.0.0:7687
DEBUG:neo4j:[#0000]  C: <OPEN> 0.0.0.0:7687
DEBUG:neo4j:[#E03A]  C: <MAGIC> 0x6060B017
DEBUG:neo4j:[#E03A]  C: <HANDSHAKE> 0x00000204 0x00000104 0x00000004 0x00000003
DEBUG:neo4j:[#E03A]  S: <HANDSHAKE> 0x00000003
DEBUG:neo4j:[#E03A]  C: HELLO {'user_agent': 'neo4j-python/4.2.0 Python/3.6.12-final-0 (darwin)', 'scheme': 'basic', 'principal': 'neo4j', 'credentials': '*******'}
DEBUG:neo4j:[#E03A]  S: SUCCESS {'server': 'Neo4j/3.5.12', 'connection_id': 'bolt-27'}
DEBUG:neo4j:[#E037]  C: GOODBYE
DEBUG:neo4j:[#E037]  C: <CLOSE>
DEBUG:neo4j:[#E03A]  C: BEGIN {'mode': 'r'}
DEBUG:neo4j:[#E03A]  C: RUN "MATCH (p:Protein)-[r:ASSOCIATED_WITH]-(bp:Biological_process) WHERE (p.name) IN ['KCNK16', 'POLDIP3', 'EFHC1', 'ADD1', 'NID2', 'ZBTB25', 'BCAS1', 'CPA1', 'ANK1', 'CELSR3', 'NR1H3', 'PMS1', 'IL1B', 'KDR', 'TNFRSF8', 'KIAA1244', 'COG6', 'PGF', 'NOTCH2', 'CTNNAL1', 'VEGFA', 'TCEANC2', 'CCR2', 'PPARG', 'TMEM132D', 'VWDE', 'ANKMY1', 'CARF', 'CDC123', 'P

In [12]:
# Protein-drug relationships (score >= 0.2) for all background proteins.
annotation_drug = connector.getCursorData(driver, annotation_query_drug)

DEBUG:neo4j:[#E03A]  C: BEGIN {'mode': 'r'}
DEBUG:neo4j:[#E03A]  C: RUN "MATCH (protein:Protein)-[r:ACTS_ON]-(drug:Drug) WHERE (protein.name) IN ['KCNK16', 'POLDIP3', 'EFHC1', 'ADD1', 'NID2', 'ZBTB25', 'BCAS1', 'CPA1', 'ANK1', 'CELSR3', 'NR1H3', 'PMS1', 'IL1B', 'KDR', 'TNFRSF8', 'KIAA1244', 'COG6', 'PGF', 'NOTCH2', 'CTNNAL1', 'VEGFA', 'TCEANC2', 'CCR2', 'PPARG', 'TMEM132D', 'VWDE', 'ANKMY1', 'CARF', 'CDC123', 'PSTPIP2', 'RMND5B', 'GOLGA7B', 'JPH2', 'SYTL4', 'SCG3', 'NECAB2', 'CABP7', 'GAB1', 'DNAJC17', 'OAS3', 'CCL17', 'CCDC114', 'XIAP', 'LIN7A', 'HAS1', 'NAMPT', 'STX1A', 'KCNB2', 'SSTR3', 'FAM111B', 'CERKL', 'THBS2', 'IDO2', 'SYNE4', 'CCDC116', 'FEN1', 'LARP6', 'ZSWIM5', 'C8orf48', 'LRWD1', 'SLU7', 'TET2', 'CCL16', 'STXBP5', 'ZMAT1', 'NOL9', 'NOXRED1', 'ERAP2', 'CXCR1', 'EIF2AK3', 'PPP1R3B', 'STXBP6', 'GPR148', 'SRP72', 'PPIP5K1', 'C15orf27', 'EFCAB3', 'NANP', 'ZMAT3', 'GP9', 'IRX2', 'FGA', 'KIAA0586', 'EFS', 'ASB9', 'TP53INP1', 'ZNF330', 'FST', 'WARS', 'KLHL42', 'L3MBTL3', 'EFCAB5', 

In [13]:
# Protein-disease associations under "disease of metabolism" for all background proteins.
annotation_disease = connector.getCursorData(driver, annotation_query_disease)

DEBUG:neo4j:[#E03A]  C: BEGIN {'mode': 'r'}
DEBUG:neo4j:[#E03A]  C: RUN 'MATCH (p:Protein)-[r:ASSOCIATED_WITH]->(d:Disease)-[:HAS_PARENT*2..4]->(pd:Disease) WHERE (p.name) IN [\'KCNK16\', \'POLDIP3\', \'EFHC1\', \'ADD1\', \'NID2\', \'ZBTB25\', \'BCAS1\', \'CPA1\', \'ANK1\', \'CELSR3\', \'NR1H3\', \'PMS1\', \'IL1B\', \'KDR\', \'TNFRSF8\', \'KIAA1244\', \'COG6\', \'PGF\', \'NOTCH2\', \'CTNNAL1\', \'VEGFA\', \'TCEANC2\', \'CCR2\', \'PPARG\', \'TMEM132D\', \'VWDE\', \'ANKMY1\', \'CARF\', \'CDC123\', \'PSTPIP2\', \'RMND5B\', \'GOLGA7B\', \'JPH2\', \'SYTL4\', \'SCG3\', \'NECAB2\', \'CABP7\', \'GAB1\', \'DNAJC17\', \'OAS3\', \'CCL17\', \'CCDC114\', \'XIAP\', \'LIN7A\', \'HAS1\', \'NAMPT\', \'STX1A\', \'KCNB2\', \'SSTR3\', \'FAM111B\', \'CERKL\', \'THBS2\', \'IDO2\', \'SYNE4\', \'CCDC116\', \'FEN1\', \'LARP6\', \'ZSWIM5\', \'C8orf48\', \'LRWD1\', \'SLU7\', \'TET2\', \'CCL16\', \'STXBP5\', \'ZMAT1\', \'NOL9\', \'NOXRED1\', \'ERAP2\', \'CXCR1\', \'EIF2AK3\', \'PPP1R3B\', \'STXBP6\', \'GPR148\', 

### Drug associations

In [14]:
# Drug under study and the annotation table used in this section.
DRUG = 'Metformin'
ANNOTATION = annotation_drug
REGULATED_PROTEINS = files_dict[DRUG]['Gene name'].unique().tolist()

# Annotation rows of the regulated proteins, ordered by annotation.
is_regulated = ANNOTATION['identifier'].isin(REGULATED_PROTEINS)
regulated = ANNOTATION.loc[is_regulated].sort_values(by='annotation')

# Regulated proteins that the studied drug itself is annotated to act on.
regulated.loc[regulated['annotation'] == DRUG]

Unnamed: 0,Drug_desc,action,annotation,drug_id,identifier,source,type,weight
2190,Metformin is a biguanide antihyperglycemic age...,inhibition,Metformin,DB00331,MMP9,STITCH,ACTS_ON,0.7
2195,Metformin is a biguanide antihyperglycemic age...,inhibition,Metformin,DB00331,PGF,STITCH,ACTS_ON,0.8


#### Enrichment 

In [15]:
# Inputs for the enrichment: regulated proteins vs. all measured proteins, on the current annotation table.
foreground_list, background_list = REGULATED_PROTEINS, BACKGROUND_PROTEINS
annotation = ANNOTATION

In [16]:
# Population sizes, background first and foreground second.
n_background = len(background_list)
n_foreground = len(foreground_list)
print(n_background, n_foreground)

260 46


In [17]:
# Enrichment of drug annotations among the regulated proteins.
enrichment_result = run_enrichment(
    annotation,
    foreground_list=foreground_list,
    background_list=background_list,
    foreground_pop=len(foreground_list),
    background_pop=len(background_list),
)

# First rows (by identifier) of all proteins annotated with the studied drug.
drug_rows = ANNOTATION.loc[ANNOTATION['annotation'] == DRUG]
drug_rows.sort_values('identifier').head()
#enrichment_result.to_csv('projects/xxx/enrichment_drug.csv')

Unnamed: 0,Drug_desc,action,annotation,drug_id,identifier,source,type,weight
2186,Metformin is a biguanide antihyperglycemic age...,inhibition,Metformin,DB00331,CEL,STITCH,ACTS_ON,0.38
2185,Metformin is a biguanide antihyperglycemic age...,activation,Metformin,DB00331,GCGR,STITCH,ACTS_ON,0.3
2192,Metformin is a biguanide antihyperglycemic age...,activation,Metformin,DB00331,GLP1R,STITCH,ACTS_ON,0.8
2193,Metformin is a biguanide antihyperglycemic age...,inhibition,Metformin,DB00331,IGF1,STITCH,ACTS_ON,0.957
2194,Metformin is a biguanide antihyperglycemic age...,inhibition,Metformin,DB00331,IL1B,STITCH,ACTS_ON,0.8


In [18]:
# Enrichment row for the studied drug itself.
is_drug_term = enrichment_result['terms'] == DRUG
enrichment_result.loc[is_drug_term]

Unnamed: 0,terms,identifiers,foreground,background,foreground_pop,background_pop,odds,pvalue,padj,rejected
95,Metformin,"MMP9,PGF",2,11,46,260,0.838843,1.0,1.0,False


### Disease associations

In [19]:
# Drug under study and the annotation table used in this section.
DRUG = 'Metformin'
ANNOTATION = annotation_disease
REGULATED_PROTEINS = files_dict[DRUG]['Gene name'].unique().tolist()

# Disease annotation rows of the regulated proteins, ordered by annotation.
is_regulated = ANNOTATION['identifier'].isin(REGULATED_PROTEINS)
regulated = ANNOTATION.loc[is_regulated].sort_values(by='annotation')

In [20]:
# Preview the first rows; `regulated` was already sorted by 'annotation' when it was built.
regulated.sort_values('annotation').head()

Unnamed: 0,annotation,disease_id,identifier,score
716,Keshan disease,DOID:0050083,PNLIP,0.745
183,Menkes disease,DOID:1838,TFF3,1.151
46,Smith-Lemli-Opitz syndrome,DOID:14692,SIX3,1.263
74,Tangier disease,DOID:1388,PNLIP,0.719
204,Wilson disease,DOID:893,FTL,0.06


In [21]:
# (rows, columns) of the disease annotations that involve regulated proteins.
regulated.shape

(257, 4)

#### Enrichment 

In [22]:
# Inputs for the enrichment: regulated proteins vs. all measured proteins, on the current annotation table.
foreground_list, background_list = REGULATED_PROTEINS, BACKGROUND_PROTEINS
annotation = ANNOTATION

# Population sizes, background first and foreground second.
n_background = len(background_list)
n_foreground = len(foreground_list)
print(n_background, n_foreground)

260 46


In [23]:
# (rows, columns) of the annotation rows whose disease is 'diabetes mellitus'.
is_diabetes = annotation['annotation'] == 'diabetes mellitus'
annotation.loc[is_diabetes].shape

(243, 4)

In [24]:
# Disease annotations restricted to the regulated proteins (unsorted).
in_regulated = annotation['identifier'].isin(REGULATED_PROTEINS)
regulated_disease = annotation.loc[in_regulated]

# First regulated proteins annotated with 'diabetes mellitus'.
regulated_disease.loc[regulated_disease['annotation'] == 'diabetes mellitus'].head()

Unnamed: 0,annotation,disease_id,identifier,score
1302,diabetes mellitus,DOID:9351,PNLIP,1.882
1303,diabetes mellitus,DOID:9351,SIX3,0.763
1305,diabetes mellitus,DOID:9351,DMXL2,4.0
1306,diabetes mellitus,DOID:9351,CDC123,3.667
1319,diabetes mellitus,DOID:9351,CD40LG,2.137


In [25]:
# Enrichment of disease annotations among the regulated proteins.
enrichment_result = run_enrichment(
    annotation,
    foreground_list=foreground_list,
    background_list=background_list,
    foreground_pop=len(foreground_list),
    background_pop=len(background_list),
)
enrichment_result.head()
#enrichment_result.to_csv('projects/xxx/enrichment_disease.csv')

Unnamed: 0,terms,identifiers,foreground,background,foreground_pop,background_pop,odds,pvalue,padj,rejected
7,diabetes mellitus,"PNLIP,SIX3,DMXL2,CDC123,CD40LG,KCNB2,IGFBP1,AN...",33,210,46,260,0.048352,4.841893e-08,2e-06,True
18,hyperinsulinism,"IGFBP1,IGFBP1",2,47,46,260,0.161509,0.003428227,0.059994,False
27,metal metabolism disorder,"CD40LG,FTL,SLC39A13,TFF3",4,3,46,260,6.698413,0.02015843,0.176822,False
11,fatty liver disease,"IGFBP1,FST,MMP9,MMP9,MMP9,FST,CD40LG",7,71,46,260,0.361502,0.02020819,0.176822,False
24,lipoid proteinosis,"MMP9,MMP9",2,0,46,260,inf,0.03073953,0.208495,False


### Functional annotations

In [32]:
# Drug under study and the annotation table used in this section.
DRUG = 'Metformin'
ANNOTATION = annotation_pathway
REGULATED_PROTEINS = files_dict[DRUG]['Gene name'].unique().tolist()

# Pathway annotation rows of the regulated proteins, ordered by annotation.
is_regulated = ANNOTATION['identifier'].isin(REGULATED_PROTEINS)
regulated = ANNOTATION.loc[is_regulated].sort_values(by='annotation')

In [33]:
# Preview the first rows; `regulated` was already sorted by 'annotation' when it was built.
regulated.sort_values('annotation').head()

Unnamed: 0,annotation,identifier,source
381,APC-Cdc20 mediated degradation of Nek2A,NEK2,Reactome
255,ATF4 activates genes in response to endoplasmi...,IGFBP1,Reactome
385,AURKA Activation by TPX2,NEK2,Reactome
175,Activated NTRK2 signals through PI3K,GAB1,Reactome
368,Activation of Matrix Metalloproteinases,MMP9,Reactome


In [34]:
# (rows, columns) of the pathway annotations that involve regulated proteins.
regulated.shape

(80, 3)

#### Enrichment 

In [35]:
# Inputs for the enrichment: regulated proteins vs. all measured proteins, on the current annotation table.
foreground_list, background_list = REGULATED_PROTEINS, BACKGROUND_PROTEINS
annotation = ANNOTATION

# Population sizes, background first and foreground second.
n_background = len(background_list)
n_foreground = len(foreground_list)
print(n_background, n_foreground)

260 46


In [36]:
# Enrichment of pathway annotations among the regulated proteins.
enrichment_result = run_enrichment(annotation,
                                   foreground_list=foreground_list,
                                   background_list=background_list,
                                   foreground_pop=len(foreground_list),
                                   background_pop=len(background_list))
# Persist the full table first.
enrichment_result.to_csv('projects/xxx/enrichment_pathway.csv')
# Display last: only a cell's final expression is rendered, and to_csv(path) returns None,
# so the bare `enrichment_result` that used to sit before it showed nothing.
enrichment_result.head()

In [37]:
import scipy.stats as stats

In [38]:
# Manual Fisher's exact test on a single 2x2 contingency table; scipy returns (odds ratio, p-value).
# NOTE(review): the rows presumably are regulated proteins (2 + 44 = 46) and the remaining
# background proteins (1 + 213 = 214 = 260 - 46), split by with/without one annotation term.
# Confirm which term these counts belong to.
stats.fisher_exact([[2, 44], [1, 213]])

(9.681818181818182, 0.08173379103608558)