# Stage 3: Preprocessing from Bacterial Proteomics
These stage is to format the proteomics data to annotate the gene derivate name and other swissprot information.
- __Author__ : Cancer Molecular Dynamics Laboratory
- __Version__: 1.0

* Volcano plot 
* Up-genes list 
* Down-genes list# Work with MetaCyc

In [1]:
import pandas as pd
df = pd.read_csv("../data/Bacteria.tsv", sep='\t', skiprows=1)
df

Unnamed: 0,ID,proteína,bacteria,pvalue,log2FoldChange
0,SEQF1078_00060,Elongation factor Tu,Fusobacterium nucleatum,1.123520e-04,6.000000e+09
1,SEQF1160_00952,Elongation factor Tu,Fusobacterium periodonticum,1.123520e-04,6.000000e+09
2,SEQF2067_00702,Elongation factor Tu,Fusobacterium nucleatum,1.123520e-04,6.000000e+09
3,SEQF2555_02146,Elongation factor Tu,Fusobacterium periodonticum,1.123520e-04,6.000000e+09
4,SEQF2821_00166,Elongation factor Tu,Fusobacterium nucleatum,1.123520e-04,6.000000e+09
...,...,...,...,...,...
1873,SEQF2812_01596,NAD-specific glutamate dehydrogenase,Fusobacterium nucleatum,8.080000e-11,8.000000e+08
1874,SEQF2816_00970,NAD-specific glutamate dehydrogenase,Fusobacterium nucleatum,8.350000e-10,7.000000e+08
1875,SEQF2843_00390,NAD-specific glutamate dehydrogenase,Fusobacterium hwasookii,8.350000e-10,7.000000e+08
1876,SEQF3018_01721,Enolase,Fusobacterium nucleatum,8.990000e-05,4.000000e+09


In [2]:
import pandas as pd
#https://www.homd.org/ftp//genomes/PROKKA/V10.1/tsv/ALL_genomes.tsv # 1.5Gb
# For production works replace data/SEQF10000.1.tsv for data/ALL_genomes.tsv
#For github and test we used https://www.homd.org/ftp//genomes/PROKKA/V10.1/tsv/SEQF10000.1.tsv
df2 = pd.read_csv("../data/HOMD/SEQF10000.1.tsv", sep='\t', skiprows=0)
df2

Unnamed: 0,locus_tag,ftype,length_bp,gene,EC_number,COG,product
0,SEQF10000.1_00001,CDS,1314,,,,hypothetical protein
1,SEQF10000.1_00002,CDS,1560,carA_1,2.8.3.23,COG4670,Caffeate CoA-transferase
2,SEQF10000.1_00003,CDS,786,,4.2.1.150,COG1024,Crotonyl-CoA hydratase
3,SEQF10000.1_00004,CDS,456,phaJ_1,4.2.1.119,COG2030,(R)-specific enoyl-CoA hydratase
4,SEQF10000.1_00005,CDS,1038,carE_1,1.3.1.108,COG2025,Caffeyl-CoA reductase-Etf complex subunit CarE
...,...,...,...,...,...,...,...
23951613,SEQF9999.1_02047,CDS,273,,,,hypothetical protein
23951614,SEQF9999.1_02048,CDS,393,,,,hypothetical protein
23951615,SEQF9999.1_02049,CDS,132,,,,hypothetical protein
23951616,SEQF9999.1_02050,CDS,579,,,,hypothetical protein


In [3]:
df2.dropna(subset=['locus_tag'], inplace=True)

In [4]:
IDs = df['ID'].tolist()
IDs = [(x[:8]+'.1'+x[8:]) for x in IDs]
df['NEW_IDS']= IDs
IDs[:3]

['SEQF1078.1_00060', 'SEQF1160.1_00952', 'SEQF2067.1_00702']

In [5]:
df3 = df2.loc[df2['locus_tag'].isin(IDs)]

In [6]:
# camino 1: Usar el producto para buscar los homoloogos en Ecoli 511145
# descarta ya que sólo hay 425 nombres únicos
print(len(df3['product'].unique()))
df3

425


Unnamed: 0,locus_tag,ftype,length_bp,gene,EC_number,COG,product
653349,SEQF1025.1_02508,CDS,810,punA,2.4.2.1,,Purine nucleoside phosphorylase 1
668065,SEQF1032.1_02443,CDS,1068,,1.-.-.-,COG0667,putative oxidoreductase
678159,SEQF1035.1_02385,CDS,1971,htpG,,COG0326,Chaperone protein HtpG
756984,SEQF1074.1_01232,CDS,1113,hypD,,COG0409,Hydrogenase maturation factor HypD
757938,SEQF1075.1_00055,CDS,1575,nikA_1,,COG0747,Nickel-binding protein NikA
...,...,...,...,...,...,...,...
5895014,SEQF3674.1_01362,CDS,1263,proA,1.2.1.41,COG0014,Gamma-glutamyl phosphate reductase
5895060,SEQF3674.1_01408,CDS,525,,,,hypothetical protein
5896285,SEQF3675.1_00774,CDS,771,,,,putative ABC transporter ATP-binding protein
5896335,SEQF3675.1_00824,CDS,1224,rodA,2.4.1.129,COG0772,Peptidoglycan glycosyltransferase RodA


In [7]:
df31 = df.merge(right=df3, left_on='NEW_IDS', right_on='locus_tag')
df31

Unnamed: 0,ID,proteína,bacteria,pvalue,log2FoldChange,NEW_IDS,locus_tag,ftype,length_bp,gene,EC_number,COG,product
0,SEQF1078_00060,Elongation factor Tu,Fusobacterium nucleatum,1.123520e-04,6.000000e+09,SEQF1078.1_00060,SEQF1078.1_00060,CDS,1185,tufA,,COG0050,Elongation factor Tu
1,SEQF1160_00952,Elongation factor Tu,Fusobacterium periodonticum,1.123520e-04,6.000000e+09,SEQF1160.1_00952,SEQF1160.1_00952,CDS,2082,fusA_1,,COG0480,Elongation factor G
2,SEQF2067_00702,Elongation factor Tu,Fusobacterium nucleatum,1.123520e-04,6.000000e+09,SEQF2067.1_00702,SEQF2067.1_00702,CDS,1185,tufA,,COG0050,Elongation factor Tu
3,SEQF2555_02146,Elongation factor Tu,Fusobacterium periodonticum,1.123520e-04,6.000000e+09,SEQF2555.1_02146,SEQF2555.1_02146,CDS,1185,tufA,,COG0050,Elongation factor Tu
4,SEQF2821_00166,Elongation factor Tu,Fusobacterium nucleatum,1.123520e-04,6.000000e+09,SEQF2821.1_00166,SEQF2821.1_00166,CDS,1185,tufA,,COG0050,Elongation factor Tu
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1619,SEQF2812_01596,NAD-specific glutamate dehydrogenase,Fusobacterium nucleatum,8.080000e-11,8.000000e+08,SEQF2812.1_01596,SEQF2812.1_01596,CDS,468,,,,hypothetical protein
1620,SEQF2816_00970,NAD-specific glutamate dehydrogenase,Fusobacterium nucleatum,8.350000e-10,7.000000e+08,SEQF2816.1_00970,SEQF2816.1_00970,CDS,774,,,COG0327,GTP cyclohydrolase 1 type 2
1621,SEQF2843_00390,NAD-specific glutamate dehydrogenase,Fusobacterium hwasookii,8.350000e-10,7.000000e+08,SEQF2843.1_00390,SEQF2843.1_00390,CDS,1278,,1.4.1.2,,NAD-specific glutamate dehydrogenase
1622,SEQF3018_01721,Enolase,Fusobacterium nucleatum,8.990000e-05,4.000000e+09,SEQF3018.1_01721,SEQF3018.1_01721,tRNA,75,,,,tRNA-Gln(ttg)


In [8]:
new_genes = []
for gen in df31['gene']:
    try:gen=gen.split("_")[0]
    except: pass
    new_genes.append(gen)
df31['New Genes'] = new_genes

In [9]:
len(df31['gene'])

1624

In [10]:
genes = df31['gene'].dropna().unique().tolist()
print(len(genes))
print(genes[:60])

424
['tufA', 'fusA_1', 'rpsG', 'yaaA', 'eno', 'kdsD', 'punA', 'deoC_1', 'deoA', 'mglA_6', 'ttr', 'yqeN', 'pyrDA', 'yknX', 'atpH', 'atpB', 'iscU', 'atpA', 'atpC', 'yxeN', 'trmD', 'ogt', 'yfeW', 'mgsR', 'artQ', 'proA', 'rpsT', 'rnfC_1', 'nusA', 'ahpC', 'mltG', 'tpl', 'creD', 'murR', 'rpsO', 'pal_2', 'slyA_2', 'pal_1', 'pal_3', 'pgi', 'pal_5', 'pal_9', 'yjgH', 'pgrR_1', 'prpL', 'ywnA', 'yfkM', 'yraA_2', 'fusA_2', 'rpsL', 'lysS', 'miaB', 'abgT_1', 'fldA_1', 'yfiC_1', 'carD_1', 'bcd', 'carE_2', 'fixB', 'mutS_2']


In [11]:
# All gene curate from the proteomics are considered up, beacuse the number the bacteria in control is very scare
new_genes = []
for gen in genes:
    try:gen=gen.split("_")[0]
    except: pass
    new_genes.append(gen)
new_genes = list(set(new_genes))
print(len(genes))
print(len(new_genes))
print(new_genes)

424
363
['fda', 'rqcH', 'argO', 'nusG', 'hgdC', 'proC', 'resA', 'fabD', 'lysO', 'atoE', 'ldh', 'murR', 'ptsH', 'asnA', 'valS', 'lon1', 'psiE', 'ahcY', 'hemH', 'metF', 'cobB', 'miaB', 'tdcB', 'yaaA', 'gcdA', 'yknY', 'slyA', 'rplA', 'lptF', 'aroC', 'hbpA', 'frr', 'potB', 'kce', 'crt', 'hbd', 'eutD', 'slyD', 'fpuC', 'hypD', 'lemA', 'prpL', 'yfkN', 'mutS2', 'kdd', 'atpH', 'hypE', 'iolG', 'rpmF', 'atoC', 'rodA', 'glcK', 'yclM', 'zraP', 'gctA', 'yutF', 'mdh', 'clpC', 'ackA', 'glmE', 'oleD', 'yclN', 'lepB', 'gctB', 'etfA', 'rpsP', 'glpR', 'accA', 'hutU', 'lplJ', 'shlB', 'hcp', 'aroB', 'purB', 'rsgA', 'mglB', 'fdhF', 'pyrH', 'pilQ', 'tabA', 'clpX', 'ugpC', 'lpxK', 'thiI', 'queE', 'viaA', 'nadB', 'fmt', 'fabG', 'kdsC', 'yigL', 'apbE', 'pstB3', 'ypdA', 'fabH', 'mlaA', 'garB', 'btuB', 'gcdC', 'atoD', 'pykF', 'yjgH', 'ctfB', 'rplK', 'acdA', 'rbr3A', 'fixB', 'ywrO', 'pyrD', 'rsxE', 'yjjP', 'birA', 'tufA', 'dnaE', 'tenA', 'gsiC', 'sctC', 'iclR', 'znuA', 'pgcA', 'tyrS', 'tnaA1', 'leuC', 'atpB', 'lgt'