In [106]:
# !pip install pandas

### Get isolate ids

In [2]:
import pandas as pd

df_idb_bacteria = pd.read_csv("../data/bacteria/IDB_interesting_bacteria.csv")
df_idb_bacteria.head()

Unnamed: 0,ID,bacteria,isolate,link,Tax ID
0,90,Bacteria;Firmicutes;Clostridia;Clostridiales;L...,,https://www.genome.jp/dbget-bin/www_bfind_sub?...,
1,90,Bacteria;Firmicutes;Clostridia;Clostridiales;L...,T02583,https://www.genome.jp/entry/gn:T02583,751585.0
2,90,Bacteria;Firmicutes;Clostridia;Clostridiales;L...,T02584,https://www.genome.jp/entry/gn:T02584,717962.0
3,90,Bacteria;Firmicutes;Clostridia;Clostridiales;L...,T07118,https://www.genome.jp/entry/gn:T07118,410072.0
4,90,Bacteria;Firmicutes;Clostridia;Clostridiales;L...,T07943,https://www.genome.jp/entry/gn:T07943,411474.0


In [3]:
# get isolate ids
isolate_list = df_idb_bacteria.isolate.dropna().tolist()
print(len(isolate_list))
print(isolate_list)

23
['T02583', 'T02584', 'T07118', 'T07943', 'T01387', 'T02600', 'T02601', 'T02606', 'T03158', 'T05998', 'T08490', 'T08757', 'T00835', 'T07161', 'T08965', 'T03807', 'T05838', 'T06636', 'T09392', 'T07161', 'T08965', 'T04880', 'T08068']


### Get mappings kegg geneid - uniprotid

In [4]:
# helper function for spliting api response
# cce:Ccel_0001\nup:B8I3R2\n
# cce:Ccel_0002\tup:B8I3R3\n
        
def split_ids(s):
    mappings = {}
    lines = s.split("\n")
    for l in lines:
        ids = l.split("\t")
        if len(ids) == 2:
            mappings[ids[0]] = ids[1][3:]
        else:
            pass
#             print("Error:", ids)
    return mappings

In [6]:
import requests

base_id_list = []

# Retrieve mapped uniprot ids by isolate & gene
for i in isolate_list:    
    # URL example: https://rest.kegg.jp/conv/uniprot/T00835
    url = 'https://rest.kegg.jp/conv/uniprot/' + i
    print(url)
    response = requests.get(url)
    id_list = split_ids(response.text)
    base_id_list.append(id_list)
print(len(base_id_list))

https://rest.kegg.jp/conv/uniprot/T02583
https://rest.kegg.jp/conv/uniprot/T02584
https://rest.kegg.jp/conv/uniprot/T07118
https://rest.kegg.jp/conv/uniprot/T07943
https://rest.kegg.jp/conv/uniprot/T01387
https://rest.kegg.jp/conv/uniprot/T02600
https://rest.kegg.jp/conv/uniprot/T02601
https://rest.kegg.jp/conv/uniprot/T02606
https://rest.kegg.jp/conv/uniprot/T03158
https://rest.kegg.jp/conv/uniprot/T05998
https://rest.kegg.jp/conv/uniprot/T08490
https://rest.kegg.jp/conv/uniprot/T08757
https://rest.kegg.jp/conv/uniprot/T00835
https://rest.kegg.jp/conv/uniprot/T07161
https://rest.kegg.jp/conv/uniprot/T08965
https://rest.kegg.jp/conv/uniprot/T03807
https://rest.kegg.jp/conv/uniprot/T05838
https://rest.kegg.jp/conv/uniprot/T06636
https://rest.kegg.jp/conv/uniprot/T09392
https://rest.kegg.jp/conv/uniprot/T07161
https://rest.kegg.jp/conv/uniprot/T08965
https://rest.kegg.jp/conv/uniprot/T04880
https://rest.kegg.jp/conv/uniprot/T08068
23


In [11]:
# Merge all the geneid - uniprotid lists for filling in spreadsheet

def merge_dict(dict1, dict2):
    res = dict1 | dict2
    return res

combined_id_list = base_id_list[0]
for i in range(1, len(base_id_list) - 1):
    combined_id_list = merge_dict(combined_id_list, base_id_list[i])

len(combined_id_list)

30759

In [12]:
# Convert API output response to Pandas DataFrame

def split_to_pd(s, isolate):
    lines = s.split("\n")
    all_lines = [aline.split("\t") for aline in lines]
    df_gene_list = pd.DataFrame(all_lines, columns=['A', 'B', 'C', 'D'])
    df_gene_list["isolate"] = isolate
    return df_gene_list

### Get a list of genes with isolate id with Kegg REST API

In [13]:
import requests

all_genes_list = []

# for i in isolate_list[0:2]:
for i in isolate_list:
    url = 'https://rest.kegg.jp/list/' + i
    gene_list = requests.get(url)
    print(url, gene_list)
    all_genes_list.append(split_to_pd(gene_list.text, i))
#print(response_API.status_code)

https://rest.kegg.jp/list/T02583 <Response [200]>
https://rest.kegg.jp/list/T02584 <Response [200]>
https://rest.kegg.jp/list/T07118 <Response [200]>
https://rest.kegg.jp/list/T07943 <Response [200]>
https://rest.kegg.jp/list/T01387 <Response [200]>
https://rest.kegg.jp/list/T02600 <Response [200]>
https://rest.kegg.jp/list/T02601 <Response [200]>
https://rest.kegg.jp/list/T02606 <Response [200]>
https://rest.kegg.jp/list/T03158 <Response [200]>
https://rest.kegg.jp/list/T05998 <Response [200]>
https://rest.kegg.jp/list/T08490 <Response [200]>
https://rest.kegg.jp/list/T08757 <Response [200]>
https://rest.kegg.jp/list/T00835 <Response [200]>
https://rest.kegg.jp/list/T07161 <Response [200]>
https://rest.kegg.jp/list/T08965 <Response [200]>
https://rest.kegg.jp/list/T03807 <Response [200]>
https://rest.kegg.jp/list/T05838 <Response [200]>
https://rest.kegg.jp/list/T06636 <Response [200]>
https://rest.kegg.jp/list/T09392 <Response [200]>
https://rest.kegg.jp/list/T07161 <Response [200]>


In [15]:
all_genes_list[0].head()

Unnamed: 0,A,B,C,D,isolate
0,coo:CCU_T_29320,tRNA,60..130,CCU_T_29320,T02583
1,coo:CCU_00100,CDS,241..318,hypothetical protein,T02583
2,coo:CCU_00110,CDS,432..1544,DNA repair exonuclease,T02583
3,coo:CCU_00120,CDS,1547..3220,hypothetical protein,T02583
4,coo:CCU_00140,CDS,5775..5873,hypothetical protein,T02583


In [16]:
# Combine all the genes list
all_genes_df = pd.concat(all_genes_list, axis=0)
# set an empty column for uniprod_id
all_genes_df["uniprod_id"] = ""
all_genes_df.head()

Unnamed: 0,A,B,C,D,isolate,uniprod_id
0,coo:CCU_T_29320,tRNA,60..130,CCU_T_29320,T02583,
1,coo:CCU_00100,CDS,241..318,hypothetical protein,T02583,
2,coo:CCU_00110,CDS,432..1544,DNA repair exonuclease,T02583,
3,coo:CCU_00120,CDS,1547..3220,hypothetical protein,T02583,
4,coo:CCU_00140,CDS,5775..5873,hypothetical protein,T02583,


In [17]:
# Fill in uniprod_id column referencing combined_id_list
for ind, row in all_genes_df.iterrows():
    row["uniprod_id"] = combined_id_list.get(row["A"])

In [18]:
all_genes_df.head(10)

Unnamed: 0,A,B,C,D,isolate,uniprod_id
0,coo:CCU_T_29320,tRNA,60..130,CCU_T_29320,T02583,
1,coo:CCU_00100,CDS,241..318,hypothetical protein,T02583,D5HGS1
2,coo:CCU_00110,CDS,432..1544,DNA repair exonuclease,T02583,D5HGS2
3,coo:CCU_00120,CDS,1547..3220,hypothetical protein,T02583,D5HGS3
4,coo:CCU_00140,CDS,5775..5873,hypothetical protein,T02583,D5HGS4
5,coo:CCU_00150,CDS,complement(5914..7851),ATPase components of ABC transporters with dup...,T02583,D5HGS5
6,coo:CCU_00160,CDS,8090..9886,Uncharacterized protein conserved in bacteria,T02583,D5HGS6
7,coo:CCU_00170,CDS,9905..10954,Membrane-associated lipoprotein involved in th...,T02583,D5HGS7
8,coo:CCU_00180,CDS,10964..11332,Uncharacterized protein conserved in bacteria,T02583,D5HGS8
9,coo:CCU_00190,CDS,11322..11879,Predicted membrane protein,T02583,D5HGS9


In [19]:
all_genes_df.to_csv("../data/gene_id_with_uniprodid.csv", index=False)

In [20]:
all_genes_df.uniprod_id.to_csv("../data/uniprotid.csv", index=False)

In [21]:
len(list(all_genes_df.A))

67319

In [22]:
len(list(set(list(all_genes_df.A))))

58987

In [8]:
def find_intersect(data_1, data_2):
    intersect = list(set(data_1) & set(data_2))
    return intersect

def find_union(data_1, data_2):
    union = list(set(data_1) | set(data_2))
    return union

In [9]:
base = list()
for i in range(len(all_genes_list)):
    base = find_union(base, all_genes_list[i].D.tolist())
len(base)

11690

In [11]:
base = all_genes_list[0].D.tolist()

for i in range(1, len(all_genes_list) -1):
    temp_intersection = find_intersect(all_genes_list[i].D.tolist(), all_genes_list[i+1].D.tolist())

    #     print(temp_intersection)
    base = find_intersect(base, temp_intersection)
    print(base)

    print(i, i+1, len(base))

['dephospho-CoA kinase', 'UDP-N-acetylmuramoyl-tripeptide--D-alanyl-D-alanine ligase', 'methylglyoxal synthase', 'shikimate dehydrogenase', 'argininosuccinate synthase', 'peptide deformylase', 'transcriptional regulator', 'adenine phosphoribosyltransferase', 'acetylglutamate kinase', 'anti-sigma F factor', 'trigger factor', 'amidophosphoribosyltransferase', 'glutamate-5-semialdehyde dehydrogenase', "deoxyuridine 5'-triphosphate nucleotidohydrolase", 'DNA gyrase subunit B', 'ACT domain-containing protein', 'methionine adenosyltransferase', 'guanylate kinase', 'hypothetical protein', None, 'thioredoxin', 'D-tyrosyl-tRNA(Tyr) deacylase', 'CTP synthase', 'N-acetylmuramoyl-L-alanine amidase', 'SAM-dependent methyltransferase', 'dihydroxy-acid dehydratase', '2,3-bisphosphoglycerate-independent phosphoglycerate mutase', 'UDP-N-acetylmuramate--L-alanine ligase', 'adenylosuccinate lyase', 'cysteine desulfurase', 'threonine synthase', 'glucose-6-phosphate isomerase', '6,7-dimethyl-8-ribityllumaz

In [18]:
import pandas

df = pd.read_csv('../data/bacteria/13059_2022_2643_MOESM7_ESM.csv')
bac_genes = df.bacterial_gene_names
bac_genes = list(set(bac_genes))
bac_genes

["['uxuA YPK_2778' 'uxuA Tola_2884' nan]",
 "['gap gapC CA_C0709' 'gapA gap MG301' nan]",
 "['aroE YE3884' 'aroE EcSMS35_3577' nan]",
 "['glpK LCA_0649' 'glpK BcerKBAB4_0948' nan]",
 "['FAEPRAA2165_01342' 'CRH10_03100' nan]",
 "['ppaC BAA_2888' 'ppaC BPUM_3708' nan]",
 "['nusA c3926' 'nusA Z4530 ECs4050' nan]",
 "['astD EcE24377A_1968' 'astD CPS_0634' nan]",
 "['trhP1 yrrO BSU27340' 'SAMN05878482_101721' nan]",
 "['pgcA gtaC gtaE yhxB BSU09310' 'SC09_Contig19orf00130' nan]",
 "['SAMN02799616_04514' 'DN390_09215' nan]",
 "['pntA b1603 JW1595' 'A1U5_01956' nan]",
 "['fklB VME_16270' 'fklB XSR1_90091' nan]",
 "['atpD BMA10229_A1587' 'atpD Bcep1808_0115' nan]",
 "['metE BCB4264_A4108' 'metE BCAH820_4021' nan]",
 "['gltX Xfasm12_2028' 'gltX YPDSF_2092' nan]",
 "['KS08_01620' 'D9V42_07860' nan]",
 "['eno SUB0655' 'eno EAT1b_0860' nan]",
 "['fib efb MW1040' 'fib efb SAS1091' nan]",
 "['AT03_04030' 'alcB YPH_3898' nan]",
 "['clpB BC_1168' 'clpB CPE1428' nan]",
 "['HV084_25475' 'HV269_27150' na

In [40]:
base_list = []
for bac in bac_genes:
    s = bac.strip('[]')[1:-5]
    base_list = base_list + s.split("' '")
    base_list = list(set(base_list))

    print(len(base_list), base_list) 

66 ['astD EcE24377A_1968', 'rpoN glnF ntrA STM3320', 'pgcA gtaC gtaE yhxB BSU09310', 'clpB CPE1428', 'SC09_Contig19orf00130', 'nrdB BUsg_172', 'CRH10_03100', 'glpK BcerKBAB4_0948', 'A1U5_01956', 'nrdB STM2278', 'FEZ36_06495', 'fklB VME_16270', 'groL4 groEL4 RA0395 SMa0744', 'DPV99_00370', 'nusA Z4530 ECs4050', 'fib efb SAS1091', 'SAMN05878482_101721', 'fib efb MW1040', 'alcB YPH_3898', 'rlmN ECUMN_2837', 'astD CPS_0634', 'clpB BC_1168', 'eno SUB0655', 'AT03_04030', 'metE BCAH820_4021', 'uxuA Tola_2884', 'gap gapC CA_C0709', 'nusA c3926', 'metE BCB4264_A4108', 'gltX Xfasm12_2028', 'HV084_25475', 'uvrC Shew_1825', 'DOL88_03480', 'SAMN02799616_04514', 'groL4 groEL4 BBta_5285', 'HV269_27150', 'ppaC BPUM_3708', 'uxuA YPK_2778', 'atpD Bcep1808_0115', 'secA Rmag_0013', 'CQA85_03380', 'ppaC BAA_2888', 'pntA b1603 JW1595', 'D9V42_07860', 'rpoN ntrA', 'eno Psyc_1636', 'uvrC ECIAI39_1142', 'gapA gap MG301', 'DN390_09215', 'trhP1 yrrO BSU27340', 'KS08_01620', 'FAEPRAA2165_01342', 'aroE YE3884', 'e

In [41]:
base_list

['astD EcE24377A_1968',
 'rpoN glnF ntrA STM3320',
 'pgcA gtaC gtaE yhxB BSU09310',
 'clpB CPE1428',
 'SC09_Contig19orf00130',
 'nrdB BUsg_172',
 'CRH10_03100',
 'glpK BcerKBAB4_0948',
 'A1U5_01956',
 'nrdB STM2278',
 'FEZ36_06495',
 'fklB VME_16270',
 'groL4 groEL4 RA0395 SMa0744',
 'DPV99_00370',
 'nusA Z4530 ECs4050',
 'fib efb SAS1091',
 'SAMN05878482_101721',
 'fib efb MW1040',
 'alcB YPH_3898',
 'rlmN ECUMN_2837',
 'astD CPS_0634',
 'clpB BC_1168',
 'eno SUB0655',
 'AT03_04030',
 'metE BCAH820_4021',
 'uxuA Tola_2884',
 'gap gapC CA_C0709',
 'nusA c3926',
 'metE BCB4264_A4108',
 'gltX Xfasm12_2028',
 'HV084_25475',
 'uvrC Shew_1825',
 'DOL88_03480',
 'SAMN02799616_04514',
 'groL4 groEL4 BBta_5285',
 'HV269_27150',
 'ppaC BPUM_3708',
 'uxuA YPK_2778',
 'atpD Bcep1808_0115',
 'secA Rmag_0013',
 'CQA85_03380',
 'ppaC BAA_2888',
 'pntA b1603 JW1595',
 'D9V42_07860',
 'rpoN ntrA',
 'eno Psyc_1636',
 'uvrC ECIAI39_1142',
 'gapA gap MG301',
 'DN390_09215',
 'trhP1 yrrO BSU27340',
 'KS08