In [1]:
import psycopg2 as pg
import networkx as nx
import pandas as pd
import pandas.io.sql as psql
import os
from tqdm.notebook import tqdm
from itertools import product
import pickle
import numpy as np

In [2]:
# connect to CODA DB
conn = pg.connect(host = 'heart5.kaist.ac.kr', dbname="CODAv3.0", user='bisler', password='bislaprom3')
cur = conn.cursor()

In [3]:
def create_network(coda_df):
  relation_df = coda_df[['leftentityid', 'rightentityid']]
  relation_df = relation_df.drop_duplicates()
  reverse_df = coda_df.loc[coda_df['association']=='Undirected_Link'][['rightentityid', 'leftentityid']]
  reverse_df = reverse_df.rename(columns={'rightentityid':'leftentityid', 'leftentityid':'rightentityid'})
  reverse_df = reverse_df.drop_duplicates()
  relation_df = pd.concat([relation_df, reverse_df])
  g = nx.from_pandas_edgelist(relation_df, 'leftentityid', 'rightentityid', create_using=nx.DiGraph())
  return g

def get_edge_list(coda_df):
  relation_df = coda_df[['leftentityid', 'rightentityid']]
  relation_df = relation_df.drop_duplicates()
  return relation_df

def execute_sql_to_df(conn, sql_query):
  return psql.read_sql(sql_query, conn)

In [277]:
def get_gene_name(entrezid):
    sql_query = "SELECT symbol FROM gene where entrezid = '{}'".format(entrezid)
    cur.execute(sql_query)
    try:
        symbol = cur.fetchone()[0]
        return symbol
    except:
        pass
    
def get_gene_id(symbol):
    sql_query = "SELECT entrezid FROM gene where symbol = '{}' and ncbitaxid = '9606'".format(symbol.upper())
    cur.execute(sql_query)
    try:
        entrezid = cur.fetchone()[0]
        return entrezid
    except:
        pass

def to_dictionary(entrezid_list):
    result = dict()
    for entrezid in tqdm(entrezid_list):
        symbol = get_gene_name(entrezid)
        if symbol is not None:
            symbol = symbol.upper()
            result[symbol] = entrezid
    return result
    
def symbol2GE(symbol_list):
    result = list()
    for symbol in symbol_list:
        try:
            sql_query = "SELECT geneid from gene where symbol = '{}' and ncbitaxid = '9606'".format(symbol.upper())
            cur = conn.cursor()
            cur.execute(sql_query)
            geid = cur.fetchone()[0]
            result.append(geid)
        except:
            print(symbol)
    return result

def GE2GP(ge_list):
    result = list()
    for ge in ge_list:
        sql_query = "select geneproductid from geneproduct where geneid = '{}'".format(ge)
        cur = conn.cursor()
        cur.execute(sql_query)
        gpids = cur.fetchall()
        gpids = [gpid[0] for gpid in gpids]
        result += gpids
    return result

def GP2symbol(gp_list):
    result = list()
    for gp in gp_list:
        sql_query = "select symbol from geneproduct where geneproductid = '{}'".format(gp)
        cur = conn.cursor()
        cur.execute(sql_query)
        symbol = cur.fetchone()[0]
        result.append(symbol)
    return result

def GP2symbol_single(gp):
    sql_query = "select symbol from geneproduct where geneproductid = '{}'".format(gp)
    cur = conn.cursor()
    cur.execute(sql_query)
    try:
        symbol = cur.fetchone()[0]
    except:
        symbol = None
    return symbol

def filter_gene_in_g(g, gene_list):
    return [gene for gene in gene_list if g.has_node(gene)]

In [263]:
# select non-context relations
sql_query = "SELECT * FROM knowledgeunit where leftorganid is Null and lefttissueid is Null and leftcellid is Null and rightorganid is Null and righttissueid is Null and rightcellid is Null and associationcontext is Null and lefttype='GP' and righttype='GP';"
non_context_df = execute_sql_to_df(conn, sql_query)

In [264]:
non_context_df

Unnamed: 0,kuid,lefttype,leftentityid,leftorganid,lefttissueid,leftcellid,association,righttype,rightentityid,rightorganid,righttissueid,rightcellid,associationcontext,associationinsource,speciesid,referenceid,evidencescore,level
0,1,GP,GP00000001,,,,Positive_Increase,GP,GP00000002,,,,,Bind,SP00000445,1,{'Manual_Curation': 1.0},0
1,5,GP,GP00000006,,,,Positive_Increase,GP,GP00000007,,,,,Bind,SP00000445,5,{'Manual_Curation': 1.0},0
2,6,GP,GP00000008,,,,Positive_Increase,GP,GP00000009,,,,,Bind,SP00000445,6,{'Manual_Curation': 1.0},0
3,8,GP,GP00000011,,,,Positive_Increase,GP,GP00000012,,,,,Bind,SP00000445,8,{'Manual_Curation': 1.0},0
4,13,GP,GP00000013,,,,Positive_Increase,GP,GP00000014,,,,,Bind,SP00000445,9,{'Manual_Curation': 1.0},0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618852,22187918,GP,GP00032393,,,,Directed_Link,GP,GP00045144,,,,,Regulate,SP00000445,69207,{'Manual_Curation': '1'},0
618853,22187919,GP,GP00045273,,,,Directed_Link,GP,GP00067429,,,,,Regulate,SP00000445,69207,,0
618854,22187920,GP,GP00025526,,,,Directed_Link,GP,GP00069215,,,,,Regulate,SP00000445,69207,{'Manual_Curation': '1'},0
618855,22187921,GP,GP00018717,,,,Directed_Link,GP,GP00074387,,,,,Regulate,SP00000445,69207,{'Manual_Curation': '1'},0


In [265]:
# create network
ppi_df = non_context_df[(non_context_df['lefttype']=='GP') & (non_context_df['righttype']=='GP')]

GP_list = list(set(ppi_df['leftentityid'].to_list())) + list(set(ppi_df['rightentityid'].to_list()))
GP_list = list(set(GP_list))

symbol_list = GP2symbol(GP_list)
zip_iterator = zip(GP_list, symbol_list)
gp_symbol_dict = dict(zip_iterator)

ppi_df['leftentityid'] = ppi_df['leftentityid'].apply(lambda x: get_gene_id(gp_symbol_dict[x]))
ppi_df['rightentityid'] = ppi_df['rightentityid'].apply(lambda x: get_gene_id(gp_symbol_dict[x]))

symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol
symbol

In [266]:
ppi_df

Unnamed: 0,kuid,lefttype,leftentityid,leftorganid,lefttissueid,leftcellid,association,righttype,rightentityid,rightorganid,righttissueid,rightcellid,associationcontext,associationinsource,speciesid,referenceid,evidencescore,level
0,1,GP,5443,,,,Positive_Increase,GP,4988,,,,,Bind,SP00000445,1,{'Manual_Curation': 1.0},0
1,5,GP,3458,,,,Positive_Increase,GP,3459,,,,,Bind,SP00000445,5,{'Manual_Curation': 1.0},0
2,6,GP,3084,,,,Positive_Increase,GP,2066,,,,,Bind,SP00000445,6,{'Manual_Curation': 1.0},0
3,8,GP,3827,,,,Positive_Increase,GP,623,,,,,Bind,SP00000445,8,{'Manual_Curation': 1.0},0
4,13,GP,282617,,,,Positive_Increase,GP,3588,,,,,Bind,SP00000445,9,{'Manual_Curation': 1.0},0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618852,22187918,GP,6689,,,,Directed_Link,GP,3684,,,,,Regulate,SP00000445,69207,{'Manual_Curation': '1'},0
618853,22187919,GP,406955,,,,Directed_Link,GP,23411,,,,,Regulate,SP00000445,69207,,0
618854,22187920,GP,1961,,,,Directed_Link,GP,1749,,,,,Regulate,SP00000445,69207,{'Manual_Curation': '1'},0
618855,22187921,GP,6667,,,,Directed_Link,GP,2264,,,,,Regulate,SP00000445,69207,{'Manual_Curation': '1'},0


In [267]:
ppi_df.loc[(ppi_df['leftentityid'] == '1961') & (ppi_df['rightentityid'] == '1749')]

Unnamed: 0,kuid,lefttype,leftentityid,leftorganid,lefttissueid,leftcellid,association,righttype,rightentityid,rightorganid,righttissueid,rightcellid,associationcontext,associationinsource,speciesid,referenceid,evidencescore,level
618854,22187920,GP,1961,,,,Directed_Link,GP,1749,,,,,Regulate,SP00000445,69207,{'Manual_Curation': '1'},0


In [268]:
none_df = non_context_df[non_context_df['kuid'].isin(list(ppi_df[TF]['kuid'].values))]

none_df['leftentityid'] = none_df['leftentityid'].apply(lambda x: gp_symbol_dict[x])
none_df['rightentityid'] = none_df['rightentityid'].apply(lambda x: gp_symbol_dict[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  none_df['leftentityid'] = none_df['leftentityid'].apply(lambda x: gp_symbol_dict[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  none_df['rightentityid'] = none_df['rightentityid'].apply(lambda x: gp_symbol_dict[x])


In [269]:
none_df

Unnamed: 0,kuid,lefttype,leftentityid,leftorganid,lefttissueid,leftcellid,association,righttype,rightentityid,rightorganid,righttissueid,rightcellid,associationcontext,associationinsource,speciesid,referenceid,evidencescore,level
1842,10918331,GP,ProteinComplex,,,,Positive_Increase,GP,FOS,,,,,Expression,SP00000445,50211,{'Manual_Curation': 1.0},0
3665,10920336,GP,ProteinComplex,,,,Positive_Increase,GP,RAP1B,,,,,Activation,SP00000445,50372,{'Manual_Curation': 1.0},0
3666,10920337,GP,ProteinComplex,,,,Positive_Increase,GP,RAP1A,,,,,Activation,SP00000445,50372,{'Manual_Curation': 1.0},0
3936,10920624,GP,ProteinComplex,,,,Positive_Increase,GP,CDC42,,,,,Activation,SP00000445,50372,{'Manual_Curation': 1.0},0
3939,10920627,GP,ProteinComplex,,,,Positive_Increase,GP,RAC1,,,,,Activation,SP00000445,50372,{'Manual_Curation': 1.0},0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86624,11016831,GP,ProteinComplex,,,,Positive_Increase,GP,ITGB3,,,,,Activation|Indirect_effect,SP00000445,51829,{'Manual_Curation': 1.0},0
86674,11016885,GP,ProteinComplex,,,,Positive_Increase,GP,ProteinComplex,,,,,Activation,SP00000445,51829,{'Manual_Curation': 1.0},0
86675,11016886,GP,ProteinComplex,,,,Positive_Increase,GP,ProteinComplex,,,,,Activation,SP00000445,51829,{'Manual_Curation': 1.0},0
86676,11016887,GP,ProteinComplex,,,,Positive_Increase,GP,ProteinComplex,,,,,Activation,SP00000445,51829,{'Manual_Curation': 1.0},0


In [270]:
# Proteincomplex deletion
print("Length of none_df is " + str(len(none_df['kuid'])))

pc_num = 0

for kuid in none_df['kuid']:
    test_df = none_df[none_df['kuid']==kuid]
    test_df
    if test_df['leftentityid'].values == 'ProteinComplex' or test_df['rightentityid'].values == 'ProteinComplex':
        pc_num += 1
        
print("Total number of Protein Complex rows is " + str(pc_num))

if len(none_df['kuid']) == pc_num:
    print("\n All None is ProteinComplex")

Length of none_df is 15796
Total number of Protein Complex rows is 15796

 All None is ProteinComplex


In [271]:
sum((ppi_df['leftentityid'].notna()) & (ppi_df['rightentityid'].notna()))

578576

In [272]:
sum(ppi_df['leftentityid'].notna())

603061

In [273]:
sum(ppi_df['leftentityid'].isna())

15796

In [274]:
ppi_df_notna = ppi_df[(ppi_df['leftentityid'].notna()) & (ppi_df['rightentityid'].notna())] # ppi dataframe with no NaN

ppi_df_notna = ppi_df_notna.reset_index(drop=True) # index reset

ppi_df_notna

Unnamed: 0,kuid,lefttype,leftentityid,leftorganid,lefttissueid,leftcellid,association,righttype,rightentityid,rightorganid,righttissueid,rightcellid,associationcontext,associationinsource,speciesid,referenceid,evidencescore,level
0,1,GP,5443,,,,Positive_Increase,GP,4988,,,,,Bind,SP00000445,1,{'Manual_Curation': 1.0},0
1,5,GP,3458,,,,Positive_Increase,GP,3459,,,,,Bind,SP00000445,5,{'Manual_Curation': 1.0},0
2,6,GP,3084,,,,Positive_Increase,GP,2066,,,,,Bind,SP00000445,6,{'Manual_Curation': 1.0},0
3,8,GP,3827,,,,Positive_Increase,GP,623,,,,,Bind,SP00000445,8,{'Manual_Curation': 1.0},0
4,13,GP,282617,,,,Positive_Increase,GP,3588,,,,,Bind,SP00000445,9,{'Manual_Curation': 1.0},0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
578571,22187918,GP,6689,,,,Directed_Link,GP,3684,,,,,Regulate,SP00000445,69207,{'Manual_Curation': '1'},0
578572,22187919,GP,406955,,,,Directed_Link,GP,23411,,,,,Regulate,SP00000445,69207,,0
578573,22187920,GP,1961,,,,Directed_Link,GP,1749,,,,,Regulate,SP00000445,69207,{'Manual_Curation': '1'},0
578574,22187921,GP,6667,,,,Directed_Link,GP,2264,,,,,Regulate,SP00000445,69207,{'Manual_Curation': '1'},0


In [275]:
# PDC gene check

ppi_df_notna.loc[(ppi_df_notna['leftentityid'] == '5132') | (ppi_df_notna['rightentityid'] == '5132')]

Unnamed: 0,kuid,lefttype,leftentityid,leftorganid,lefttissueid,leftcellid,association,righttype,rightentityid,rightorganid,righttissueid,rightcellid,associationcontext,associationinsource,speciesid,referenceid,evidencescore,level
165472,12269531,GP,156,,,,Undirected_Link,GP,5132,,,,,Interact,SP00000445,57206,{'Manual_Curation': 1.0},0
211517,12315576,GP,5132,,,,Undirected_Link,GP,5705,,,,,Interact,SP00000445,63758,{'Manual_Curation': 1.0},0
292828,16503360,GP,1406,,,,Directed_Link,GP,5132,,,,,,SP00000445,69007,{'Discrete_Level': 'High'},0
297822,16508354,GP,1870,,,,Directed_Link,GP,5132,,,,,,SP00000445,69013,{'Discrete_Level': 'High'},0
414894,16625426,GP,8092,,,,Directed_Link,GP,5132,,,,,,SP00000445,69012,{'Discrete_Level': 'High'},0


In [276]:
import json

with open('kegg/ko00001.json', 'r') as f:
    json_data = json.load(f)

In [279]:
get_gene_id('pDc')

'5132'

In [283]:
first_list = json_data['children'] # metabolism # Carbohydrate metabolism # Glycolysis

total_gene = []

for i in range(len(first_list)):
    for j in range(len(first_list[i]['children'][j])):
        for k in range(len(first_list[i]['children'][j]['children'])):
            if 'children' in first_list[i]['children'][j]['children'][k]:
                for l in range(len(first_list[i]['children'][j]['children'][k]['children'])):
                    #print(first_list[i]['children'][j]['children'][k]['children'][l])
                    ref = first_list[i]['children'][j]['children'][k]['children'][l]['name']
                    print(ref)
                    #ref2 = ref.split('  ')
                    #ref3 = ref2[1].split(';')
                    #total_gene.append(ref3[0])

K00844  HK; hexokinase [EC:2.7.1.1]
K12407  GCK; glucokinase [EC:2.7.1.2]
K00845  glk; glucokinase [EC:2.7.1.2]
K25026  glk; glucokinase [EC:2.7.1.2]
K01810  GPI, pgi; glucose-6-phosphate isomerase [EC:5.3.1.9]
K06859  pgi1; glucose-6-phosphate isomerase, archaeal [EC:5.3.1.9]
K13810  tal-pgi; transaldolase / glucose-6-phosphate isomerase [EC:2.2.1.2 5.3.1.9]
K15916  pgi-pmi; glucose/mannose-6-phosphate isomerase [EC:5.3.1.9 5.3.1.8]
K24182  PFK9; 6-phosphofructokinase [EC:2.7.1.11]
K00850  pfkA, PFK; 6-phosphofructokinase 1 [EC:2.7.1.11]
K16370  pfkB; 6-phosphofructokinase 2 [EC:2.7.1.11]
K21071  pfk, pfp; ATP-dependent phosphofructokinase / diphosphate-dependent phosphofructokinase [EC:2.7.1.11 2.7.1.90]
K00918  pfkC; ADP-dependent phosphofructokinase/glucokinase [EC:2.7.1.146 2.7.1.147]
K00895  pfp, PFP; diphosphate-dependent phosphofructokinase [EC:2.7.1.90]
K03841  FBP, fbp; fructose-1,6-bisphosphatase I [EC:3.1.3.11]
K02446  glpX; fructose-1,6-bisphosphatase II [EC:3.1.3.11]
K115

In [282]:
total_gene

['HK',
 'GCK',
 'glk',
 'glk',
 'GPI, pgi',
 'pgi1',
 'tal-pgi',
 'pgi-pmi',
 'PFK9',
 'pfkA, PFK',
 'pfkB',
 'pfk, pfp',
 'pfkC',
 'pfp, PFP',
 'FBP, fbp',
 'glpX',
 'glpX-SEBP',
 'fbp-SEBP',
 'fbp3',
 'ALDO',
 'fbaB',
 'FBA, fbaA',
 'K01622',
 'K16305',
 'K16306',
 'TPI, tpiA',
 'GAPDH, gapA',
 'GAPDHS',
 'gap2',
 'PGK, pgk',
 'PGAM, gpmA',
 'gpmB',
 'gpmI',
 'apgM',
 'ENO, eno',
 'PK, pyk',
 'PKLR',
 'ppdK',
 'pps, ppsA',
 'aceE',
 'PDHA, pdhA',
 'PDHB, pdhB',
 'DLAT, aceF, pdhC',
 'DLD, lpd, pdhD',
 'PDHX',
 'porA',
 'porB',
 'porC, porG',
 'vorG, porG',
 'porD',
 'por, nifJ',
 'korA, oorA, oforA',
 'korB, oorB, oforB',
 'LDH, ldh',
 'PDC, pdc',
 'ADH1_7',
 'ADH4',
 'frmA, ADH5, adhC',
 'ADH6',
 'adhE',
 'adhP',
 'yiaY',
 'ADH1',
 'E1.1.1.1, adh',
 'AKR1A1, adh',
 'ahr',
 'yahK',
 'mdh1, mxaF',
 'mdh2, mxaI',
 'exaA',
 'eutG',
 'adhA',
 'adhB',
 'ALDH',
 'ALDH7A1',
 'ALDH9A1',
 'ALDH3',
 'aldB',
 'ACSS1_2, acs',
 'AAE7, ACN1',
 'acdA',
 'acdB',
 'acdAB',
 'galM, GALM',
 'agp',
 'yi

In [130]:
total_gene_final = [] # total genes in KEGG pathway
for gene in total_gene:
    for element in gene.split(', '):
        total_gene_final.append(element)

In [145]:
ind = 0

for gene in list(set(total_gene_final)):
    if gene.upper() in ppi_g.nodes:
        ind += 1

In [146]:
print("The number of total genes in KEGG network : " + str(len(list(set(total_gene_final)))))

The number of total genes in KEGG network : 20629


In [147]:
print("The number of genes that exist both in CODA & KEGG : " + str(ind) + ' / ' + str(len(ppi_g.nodes)))

The number of genes that exist both in CODA & KEGG : 7452 / 22488


How many CODA interactions exist that are in KEGG pathway?

In [148]:
for gene in list(set(total_gene_final))[:20]:
    print(gene.upper())

QOXB
RNF110
BMOX
PATJ
MAP7D1
HIF1
MALT
IPMK
GGUB
SPT4
CHMT
E3.2.1.59
BMYC
FOLD
STZF
MAPKBP1
PTK2
SUN3
ZBTB26
NARJ
