In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# Collect the paths of all DESeq2 result tables (*.csv) in the RSEM results directory.
deseq2_table_path = "../../results/rsem/"
# os.listdir() returns entries in arbitrary, platform-dependent order. Sorting makes the
# order of `files` (and therefore the column order of every table merged from it)
# reproducible across machines and re-runs.
files = [
    deseq2_table_path + file
    for file in sorted(os.listdir(deseq2_table_path))
    if file.endswith(".csv")
]

# Curvibacter genome file parsing for WP to AEP conversion and transcript descriptions

In [3]:
# Load the Curvibacter GFF annotation as a headerless, tab-separated table;
# columns are therefore labelled 0..8 until they are renamed.
gff = pd.read_table(
    "../../data/curvibacter_genome/GCF_002163715.1_ASM216371v1_genomic.gff",
    sep="\t",
    header=None,
    skiprows=9,  # first 9 lines are skipped — presumably the GFF header/pragma block; verify against the file
)

In [4]:
# Preview the first rows of the raw GFF table (columns still carry positional integer labels).
gff.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,NZ_CP015698.1,RefSeq,region,1.0,4367400.0,.,+,.,ID=NZ_CP015698.1:1..4367400;Dbxref=taxon:18449...
1,NZ_CP015698.1,RefSeq,gene,108.0,431.0,.,-,.,ID=gene-AEP_RS00005;Name=AEP_RS00005;gbkey=Gen...
2,NZ_CP015698.1,Protein Homology,CDS,108.0,431.0,.,-,0,ID=cds-WP_087493495.1;Parent=gene-AEP_RS00005;...
3,NZ_CP015698.1,RefSeq,gene,563.0,2617.0,.,-,.,ID=gene-AEP_RS00010;Name=AEP_RS00010;gbkey=Gen...
4,NZ_CP015698.1,Protein Homology,CDS,563.0,2617.0,.,-,0,ID=cds-WP_087493496.1;Parent=gene-AEP_RS00010;...


In [5]:
# Label the nine positional GFF columns, then discard every row that has a missing value
# in any of them.
columns = [
    "seqname",
    "source",
    "feature",
    "start",
    "end",
    "score",
    "strand",
    "frame",
    "attribute",
]
gff.columns = columns
gff = gff.dropna()
# Show the first rows with the new column labels.
gff.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,NZ_CP015698.1,RefSeq,region,1.0,4367400.0,.,+,.,ID=NZ_CP015698.1:1..4367400;Dbxref=taxon:18449...
1,NZ_CP015698.1,RefSeq,gene,108.0,431.0,.,-,.,ID=gene-AEP_RS00005;Name=AEP_RS00005;gbkey=Gen...
2,NZ_CP015698.1,Protein Homology,CDS,108.0,431.0,.,-,0,ID=cds-WP_087493495.1;Parent=gene-AEP_RS00005;...
3,NZ_CP015698.1,RefSeq,gene,563.0,2617.0,.,-,.,ID=gene-AEP_RS00010;Name=AEP_RS00010;gbkey=Gen...
4,NZ_CP015698.1,Protein Homology,CDS,563.0,2617.0,.,-,0,ID=cds-WP_087493496.1;Parent=gene-AEP_RS00010;...


In [6]:
# Fill parallel lists with per-CDS genomic information taken from the GFF attribute column.
# Every CDS row contributes exactly one entry to each list, so the lists always stay
# index-aligned; fields that are absent from a row are recorded as "unknown".

def _parse_gff_attributes(attribute):
    """Return the ``key=value`` pairs of a ';'-separated GFF attribute string as a dict.

    Only the first occurrence of a key is kept and fields without '=' are ignored.
    Values are split on the first '=' only, so a value that itself contains '=' is kept whole.
    """
    parsed = {}
    for field in attribute.split(";"):
        key, separator, value = field.partition("=")
        if separator:
            parsed.setdefault(key, value)
    return parsed


def _strip_prefix(value, prefix):
    """Return `value` without a leading `prefix`; values lacking the prefix are returned unchanged."""
    return value[len(prefix):] if value.startswith(prefix) else value


locus_tag = []
wp_number = []
description = []
ontology_terms = []
ontology_process = []
# locus_tag -> old_locus_tag, collected from 'gene' rows in the same pass over the table.
# This replaces a full rescan of the GFF for every CDS with a single dictionary lookup.
old_tag_by_locus = {}

for feature, attribute in zip(gff.feature, gff.attribute):
    if feature == 'CDS':
        attributes = _parse_gff_attributes(attribute)
        # "ID=cds-<accession>" -> "<accession>"
        wp_number.append(_strip_prefix(attributes.get("ID", "unknown"), "cds-"))
        # "Parent=gene-<locus tag>" -> "<locus tag>"
        locus_tag.append(_strip_prefix(attributes.get("Parent", "unknown"), "gene-"))
        description.append(attributes.get("product", "unknown"))
        ontology_terms.append(attributes.get("Ontology_term", "unknown"))
        ontology_process.append(attributes.get("go_process", "unknown"))
    elif feature == 'gene':
        attributes = _parse_gff_attributes(attribute)
        if "locus_tag" in attributes and "old_locus_tag" in attributes:
            # Keep the first gene row seen for a locus tag so that each tag maps to one value.
            old_tag_by_locus.setdefault(attributes["locus_tag"], attributes["old_locus_tag"])

# One old locus tag per CDS, in the same order as `locus_tag`; "unknown" when no 'gene'
# row with a matching locus_tag provides an old_locus_tag.
old_locus_tag = [old_tag_by_locus.get(tag, "unknown") for tag in locus_tag]

In [7]:
# Assemble the annotation table from the parallel lists built from the GFF.
# A dict of columns is used instead of DataFrame(list_of_lists).transpose(): pandas then
# raises if the lists differ in length instead of silently padding the shorter ones with
# None, which would shift annotations onto the wrong genes without any error.
curvibacter_genes_df = pd.DataFrame({
    'locus_tag': locus_tag,
    'gene_id': old_locus_tag,  # the old locus tag is stored under the name 'gene_id'
    'wp_number': wp_number,
    'description': description,
    'GO': ontology_terms,
    'GO_process': ontology_process,
})
curvibacter_genes_df.head()

Unnamed: 0,locus_tag,gene_id,wp_number,description,GO,GO_process
0,AEP_RS00005,AEP_00001,WP_087493495.1,response regulator transcription factor,"GO:0000160,GO:0006355,GO:0003677",phosphorelay signal transduction system|000016...
1,AEP_RS00010,AEP_00002,WP_087493496.1,sodium-translocating pyrophosphatase,"GO:1902600,GO:0009678",proton transmembrane transport|1902600||IEA
2,AEP_RS00015,AEP_00003,WP_087493497.1,inorganic diphosphatase,"GO:0006796,GO:0004427",phosphate-containing compound metabolic proces...
3,AEP_RS00020,AEP_00004,WP_087493498.1,alpha/beta fold hydrolase,unknown,unknown
4,AEP_RS00025,AEP_00005,WP_087493499.1,chemotaxis protein CheW,"GO:0006935,GO:0007165","chemotaxis|0006935||IEA,signal transduction|00..."


In [8]:
# Show every annotated gene whose product description mentions "ribosome" or "tRNA".
product_text = curvibacter_genes_df.description
mentions_ribosome = product_text.str.contains("ribosome")
mentions_trna = product_text.str.contains("tRNA")
curvibacter_genes_df[mentions_ribosome | mentions_trna]

Unnamed: 0,locus_tag,gene_id,wp_number,description,GO,GO_process
22,AEP_RS00115,AEP_00023,WP_087493516.1,leucyl/phenylalanyl-tRNA--protein transferase,"GO:0006508,GO:0008914",proteolysis|0006508||IEA
83,AEP_RS00440,AEP_00087,WP_087493572.1,tRNA dihydrouridine(20/20a) synthase DusA,"GO:0002943,GO:0017150,GO:0050660",tRNA dihydrouridine synthesis|0002943||IEA
135,AEP_RS00715,AEP_00140,WP_087493622.1,cysteine--tRNA ligase,"GO:0006423,GO:0004817,GO:0005737",cysteinyl-tRNA aminoacylation|0006423||IEA
138,AEP_RS00730,AEP_00143,WP_087493625.1,tRNA lysidine(34) synthetase TilS,"GO:0006400,GO:0016879",tRNA modification|0006400||IEA
214,AEP_RS01115,AEP_00219,WP_087493689.1,histidine--tRNA ligase,"GO:0006427,GO:0004821,GO:0005737",histidyl-tRNA aminoacylation|0006427||IEA
...,...,...,...,...,...,...
3810,AEP_RS19390,AEP_03887,WP_087496911.1,ribosome maturation factor RimM,"GO:0006364,GO:0003723",rRNA processing|0006364||IEA
3811,AEP_RS19395,AEP_03888,WP_087496912.1,tRNA (guanosine(37)-N1)-methyltransferase TrmD,"GO:0008033,GO:0052906",tRNA processing|0008033||IEA
3815,AEP_RS19415,AEP_03892,WP_232459880.1,ribosome small subunit-dependent GTPase A,"GO:0006412,GO:0005525,GO:0043022",translation|0006412||IEA
3877,AEP_RS19730,unknown,WP_087496970.1,alternative ribosome rescue aminoacyl-tRNA hyd...,"GO:0006415,GO:0003747",translational termination|0006415||IEA


In [9]:
# Write the annotation table built from the GFF to a CSV file in the results directory.
curvibacter_genes_df.to_csv("../../results/curvibacter_genome_annotation.csv")

# Combining Transcriptomic Data Into One DataFrame

In [10]:
# Read every result table and keep gene_id, log2 fold change and adjusted p-value,
# with the two value columns suffixed by the comparison name taken from the file name.
new_cols = ['gene_id','baseMean','log2FoldChange','lfcSE','stat','pvalue','padj']
dataframes = []
for file in files:
    # Comparison name = file name up to its first dot, without the directory part.
    comparison = file.split("/")[-1].split(".")[0]

    table = pd.read_csv(file)
    table.columns = new_cols
    # Identifiers are stored as "...gene:<id>"; keep only the part after "gene:".
    table["gene_id"] = table["gene_id"].map(lambda raw_id: raw_id.split("gene:")[1])

    table = table[['gene_id','log2FoldChange','padj']].rename(
        columns={
            'log2FoldChange': 'log2FoldChange_' + comparison,
            'padj': 'padj_' + comparison,
        }
    )
    dataframes.append(table)

In [11]:
# Outer-join all per-comparison tables on gene_id so that a gene missing from one table
# is still kept (with NaN in that comparison's columns).
# Starting from the first table and folding in the rest also works when only a single
# result file exists; indexing dataframes[1] directly would raise IndexError in that case.
merged_df = dataframes[0]
for df in dataframes[1:]:
    merged_df = merged_df.merge(df, on='gene_id', how='outer')

In [12]:
# Display the merged table: one row per gene_id, one log2FoldChange/padj column pair per comparison file.
merged_df

Unnamed: 0,gene_id,log2FoldChange_hydra_mono_culture_kiel_vs_liquid_mono_culture_kiel,padj_hydra_mono_culture_kiel_vs_liquid_mono_culture_kiel,log2FoldChange_hydra_mono_culture_kiel_vs_liquid_mono_culture_orgint,padj_hydra_mono_culture_kiel_vs_liquid_mono_culture_orgint,log2FoldChange_hydra_mono_culture_kiel_vs_metatranscriptome,padj_hydra_mono_culture_kiel_vs_metatranscriptome,log2FoldChange_hydra_mono_culture_kiel_vs_plate_mono_culture_kiel,padj_hydra_mono_culture_kiel_vs_plate_mono_culture_kiel,log2FoldChange_liquid_mono_culture_kiel_vs_hydra_mono_culture_kiel,...,log2FoldChange_metatranscriptome_vs_plate_mono_culture_kiel,padj_metatranscriptome_vs_plate_mono_culture_kiel,log2FoldChange_plate_mono_culture_kiel_vs_hydra_mono_culture_kiel,padj_plate_mono_culture_kiel_vs_hydra_mono_culture_kiel,log2FoldChange_plate_mono_culture_kiel_vs_liquid_mono_culture_kiel,padj_plate_mono_culture_kiel_vs_liquid_mono_culture_kiel,log2FoldChange_plate_mono_culture_kiel_vs_liquid_mono_culture_orgint,padj_plate_mono_culture_kiel_vs_liquid_mono_culture_orgint,log2FoldChange_plate_mono_culture_kiel_vs_metatranscriptome,padj_plate_mono_culture_kiel_vs_metatranscriptome
0,AEP_01743,4.373407,5.827452e-55,4.461056,6.116649e-59,1.398843,0.000042,5.154867,1.049175e-70,-4.373407,...,3.756024,2.547303e-39,-5.154867,1.049175e-70,-0.781460,4.838106e-03,-0.693811,1.133548e-02,-3.756024,2.547303e-39
1,AEP_02259,-5.261922,2.357786e-46,-3.001852,4.911117e-13,0.778973,0.236414,-3.181580,3.027295e-17,5.261922,...,-3.960553,4.703554e-27,3.181580,3.027295e-17,-2.080342,6.124982e-13,0.179728,6.493026e-01,3.960553,4.703554e-27
2,AEP_02257,-4.149283,3.392644e-42,-2.852487,2.812422e-17,1.420790,0.001175,-2.523102,6.440080e-16,4.149283,...,-3.943892,3.569554e-37,2.523102,6.440080e-16,-1.626181,3.876025e-13,-0.329385,2.470772e-01,3.943892,3.569554e-37
3,AEP_02254,-5.273761,1.628763e-41,-3.296966,9.684379e-14,1.125705,0.074813,-3.615456,1.045789e-19,5.273761,...,-4.741161,4.029891e-32,3.615456,1.045789e-19,-1.658305,2.152335e-07,0.318490,4.400311e-01,4.741161,4.029891e-32
4,AEP_02913,5.766353,2.054287e-41,6.310431,9.503446e-39,-0.061134,0.953910,5.926282,1.603792e-43,-5.766353,...,5.987416,3.887785e-36,-5.926282,1.603792e-43,-0.159928,7.623927e-01,0.384149,4.893791e-01,-5.987416,3.887785e-36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4089,AEP_02517,0.001443,9.992370e-01,-0.680606,4.187229e-01,-0.193328,0.903380,-0.922715,2.458594e-01,-0.001443,...,-0.729386,4.296084e-02,0.922715,2.458594e-01,0.924158,8.515256e-08,0.242109,1.506058e-01,0.729386,4.296084e-02
4090,AEP_01601,-0.000489,9.994374e-01,0.307626,4.871476e-01,0.119784,0.872550,1.789316,2.110391e-07,0.000489,...,1.669531,1.252692e-07,-1.789316,2.110391e-07,-1.789804,8.447990e-13,-1.481689,2.784427e-07,-1.669531,1.252692e-07
4091,AEP_02226,-0.000765,9.994374e-01,-0.056077,9.489221e-01,-1.329869,0.129158,0.890718,2.046241e-01,0.000765,...,2.220586,3.075194e-15,-0.890718,2.046241e-01,-0.891483,2.465516e-04,-0.946794,1.895991e-04,-2.220586,3.075194e-15
4092,AEP_01375,-0.000436,9.994833e-01,0.011112,9.906639e-01,0.311802,0.830107,0.116953,9.015209e-01,0.000436,...,-0.194848,7.236964e-01,-0.116953,9.015209e-01,-0.117389,7.338954e-01,-0.105842,7.750188e-01,0.194848,7.236964e-01


## Merging DataFrames and Marking Only Up-/Down-Regulated Genes

In [13]:
# Read every result table again, keep only significant genes and reduce their
# log2 fold change to a ternary call: -1 (down), 0 (no strong change), 1 (up).
new_cols = ['gene_id','baseMean','log2FoldChange','lfcSE','stat','pvalue','padj']
padj_cutoff = 0.05  # keep rows with padj <= this value
lfc_cutoff = 1      # |log2FoldChange| >= this value counts as up-/down-regulated
dataframes = []
for file in files:
    df = pd.read_csv(file)
    df.columns = new_cols
    # Identifiers are stored as "...gene:<id>"; keep only the part after "gene:".
    df.gene_id = df.gene_id.apply(lambda x: x.split("gene:")[1])
    # Take an explicit copy of the filtered rows: assigning into a slice of another frame
    # triggers SettingWithCopyWarning and is not guaranteed to modify the data.
    df = df[df.padj <= padj_cutoff].copy()
    lfc = df.log2FoldChange
    # Classify all values in one step from the original fold changes.
    # Values matching no condition (i.e. NaN) stay NaN.
    df['log2FoldChange'] = np.select(
        [lfc <= -lfc_cutoff, lfc >= lfc_cutoff, (lfc > -lfc_cutoff) & (lfc < lfc_cutoff)],
        [-1.0, 1.0, 0.0],
        default=np.nan,
    )
    # Comparison name = file name up to its first dot, without the directory part.
    filename = file.split("/")[-1]
    filename = filename.split(".")[0]
    df = df[['gene_id','log2FoldChange']]
    cols = ['gene_id','log2FoldChange_'+filename]
    df.columns = cols
    dataframes.append(df)

In [14]:
# Attach the regulation calls of every comparison to the annotation table.
# Outer joins on gene_id keep genes that appear on only one side of a merge.
merged_df = curvibacter_genes_df.merge(dataframes[0], how='outer', on='gene_id')
for regulation_table in dataframes[1:]:
    merged_df = merged_df.merge(regulation_table, how='outer', on='gene_id')

In [15]:
# Replace every missing value left by the outer merges with 0, so genes without a
# significant result in a comparison get the call 0.
# NOTE(review): fillna(0) applies to all columns, so annotation columns of genes that
# are absent from the annotation table also become 0 — confirm this is intended.
merged_df = merged_df.fillna(0)

In [16]:
# Write the combined annotation / regulation table to a CSV file in the results directory.
merged_df.to_csv("../../results/curvibacter_full_diff_table_rsem.csv")

In [17]:
# Write the same combined table as an Excel workbook in the results directory.
merged_df.to_excel("../../results/curvibacter_full_diff_excel_rsem.xlsx")

In [18]:
# Regulation-call columns, listed explicitly so the matrix columns have a fixed order.
fold_change_columns = [
    'log2FoldChange_hydra_mono_culture_kiel_vs_plate_mono_culture_kiel',
    'log2FoldChange_plate_mono_culture_kiel_vs_liquid_mono_culture_orgint',
    'log2FoldChange_plate_mono_culture_kiel_vs_liquid_mono_culture_kiel',
    'log2FoldChange_hydra_mono_culture_kiel_vs_liquid_mono_culture_orgint',
    'log2FoldChange_metatranscriptome_vs_liquid_mono_culture_orgint',
    'log2FoldChange_liquid_mono_culture_orgint_vs_metatranscriptome',
    'log2FoldChange_hydra_mono_culture_kiel_vs_metatranscriptome',
    'log2FoldChange_plate_mono_culture_kiel_vs_hydra_mono_culture_kiel',
    'log2FoldChange_metatranscriptome_vs_liquid_mono_culture_kiel',
    'log2FoldChange_plate_mono_culture_kiel_vs_metatranscriptome',
    'log2FoldChange_liquid_mono_culture_kiel_vs_liquid_mono_culture_orgint',
    'log2FoldChange_liquid_mono_culture_kiel_vs_plate_mono_culture_kiel',
    'log2FoldChange_metatranscriptome_vs_plate_mono_culture_kiel',
    'log2FoldChange_liquid_mono_culture_kiel_vs_hydra_mono_culture_kiel',
    'log2FoldChange_liquid_mono_culture_orgint_vs_plate_mono_culture_kiel',
    'log2FoldChange_hydra_mono_culture_kiel_vs_liquid_mono_culture_kiel',
    'log2FoldChange_liquid_mono_culture_orgint_vs_liquid_mono_culture_kiel',
    'log2FoldChange_liquid_mono_culture_kiel_vs_metatranscriptome',
    'log2FoldChange_metatranscriptome_vs_hydra_mono_culture_kiel',
    'log2FoldChange_liquid_mono_culture_orgint_vs_hydra_mono_culture_kiel',
]
# Genes x comparisons matrix of the regulation calls as a plain NumPy array.
np.array(merged_df[fold_change_columns])

array([[-1.,  1.,  0., ...,  1., -1.,  0.],
       [ 0.,  1.,  1., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])