In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [None]:
# Collect the paths of all CSV result tables in the RSEM results directory.
deseq2_table_path = "../../results/rsem/"
# os.listdir returns entries in arbitrary, filesystem-dependent order; sorting
# makes the order of `files` (and of everything derived from it, such as the
# column order of the merged tables) reproducible across machines and re-runs.
# Paths are built by plain concatenation so they keep "/" as separator, which
# the later `file.split("/")` calls rely on.
files = [
    deseq2_table_path + file
    for file in sorted(os.listdir(deseq2_table_path))
    if file.endswith(".csv")
]

# Curvibacter genome file parsing for WP to AEP conversion and transcript descriptions

In [None]:
# Read the genome annotation as a header-less, tab-separated table, skipping
# the first 9 lines of the file.
gff_path = "../../data/curvibacter_genome/GCF_002163715.1_ASM216371v1_genomic.gff"
gff = pd.read_csv(gff_path, sep="\t", skiprows=9, header=None)

In [None]:
# Preview the first rows of the freshly loaded annotation table.
gff.head()

In [None]:
# Label the nine columns of the annotation table, then discard every row that
# has a missing value in any column.
columns = [
    "seqname", "source", "feature", "start", "end",
    "score", "strand", "frame", "attribute",
]
gff.columns = columns
gff = gff.dropna(axis=0, how="any")
gff.head()

In [None]:
# Fill five parallel lists with the annotation of every CDS row:
#   wp_number         text after "cds-" in the ID attribute
#   locus_tag         text after "=gene-" in the Parent attribute
#   description       value of the product attribute
#   ontology_terms    value of the Ontology_term attribute
#   ontology_process  value of the go_process attribute
# Each CDS row contributes exactly one entry to every list ("unknown" when the
# attribute is absent), so index i always refers to the same CDS in all lists.
locus_tag = []
wp_number = []
description = []
ontology_terms = []
ontology_process = []
for feature, transcript in zip(gff.feature, gff.attribute):
    if feature != 'CDS':
        continue

    # Start from placeholders; overwrite with whatever the row provides.
    cds_wp_number = "unknown"
    cds_locus_tag = "unknown"
    cds_description = "unknown"
    cds_ontology_terms = "unknown"
    cds_ontology_process = "unknown"

    for att in transcript.split(";"):
        if att.startswith("ID"):
            cds_wp_number = att.split("cds-")[1]
        elif att.startswith("Parent"):
            cds_locus_tag = att.split("=gene-")[1]
        elif att.startswith("product="):
            # maxsplit=1: a product name that itself contains "=" is kept whole
            cds_description = att.split("=", 1)[1]
        elif att.startswith("Ontology_term="):
            cds_ontology_terms = att.split("Ontology_term=")[1]
        elif att.startswith("go_process="):
            cds_ontology_process = att.split("go_process=")[1]

    wp_number.append(cds_wp_number)
    locus_tag.append(cds_locus_tag)
    description.append(cds_description)
    ontology_terms.append(cds_ontology_terms)
    ontology_process.append(cds_ontology_process)

            
# Look up the old_locus_tag of every entry in `locus_tag`.
# The gene rows are indexed once (locus_tag -> old_locus_tag) instead of
# rescanning the whole annotation table for each tag, and every tag yields
# exactly one entry, so `old_locus_tag` always has the same length and order
# as `locus_tag`. Tags without a matching gene row, or whose gene row has no
# old_locus_tag attribute, are reported as "unknown".
old_tag_by_locus_tag = {}
for feature, transcript in zip(gff.feature, gff.attribute):
    if feature != 'gene':
        continue
    gene_tag = None
    gene_old_tag = None
    for att in transcript.split(";"):
        if att.startswith('locus_tag='):
            gene_tag = att.split("locus_tag=")[1]
        elif att.startswith('old_locus_tag') and gene_old_tag is None:
            gene_old_tag = att.split("old_locus_tag=")[1]
    if gene_tag is not None and gene_old_tag is not None:
        # keep the first gene row seen for a given locus tag
        old_tag_by_locus_tag.setdefault(gene_tag, gene_old_tag)

old_locus_tag = [old_tag_by_locus_tag.get(tag, "unknown") for tag in locus_tag]

In [None]:
# Assemble the per-CDS annotation table from the parallel lists.
# The lists are only meaningful if they are aligned; building the frame from a
# list of rows and transposing would silently pad shorter lists with None and
# shift annotations onto the wrong genes, so unequal lengths are rejected.
column_names = ['locus_tag', 'gene_id', 'wp_number', 'description', 'GO', 'GO_process']
data_columns = [locus_tag, old_locus_tag, wp_number, description, ontology_terms, ontology_process]

column_lengths = [len(values) for values in data_columns]
if len(set(column_lengths)) > 1:
    raise ValueError(
        "annotation lists are misaligned: {}".format(dict(zip(column_names, column_lengths)))
    )

curvibacter_genes_df = pd.DataFrame(dict(zip(column_names, data_columns)), columns=column_names)
curvibacter_genes_df.head()

In [None]:
# Show the genes whose description mentions "ribosome" or "tRNA".
gene_descriptions = curvibacter_genes_df.description
mentions_ribosome = gene_descriptions.str.contains("ribosome")
mentions_trna = gene_descriptions.str.contains("tRNA")
curvibacter_genes_df[mentions_ribosome | mentions_trna]

In [None]:
# Persist the annotation table (the DataFrame index is written as well).
annotation_csv_path = "../../results/curvibacter_genome_annotation.csv"
curvibacter_genes_df.to_csv(annotation_csv_path)

# Combining Transcriptomic Data Into One DataFrame

In [None]:
# Load every result table listed in `files` and reduce it to three columns:
# the gene id, the log2 fold change and the adjusted p-value. The two value
# columns are suffixed with the table's file name (directory removed, cut at
# the first dot) so that the tables can later be merged side by side.
new_cols = [
    'gene_id', 'baseMean', 'log2FoldChange', 'lfcSE', 'stat', 'pvalue', 'padj',
]
dataframes = []
for file in files:
    filename = file.split("/")[-1].split(".")[0]
    cols = ['gene_id', 'log2FoldChange_' + filename, 'padj_' + filename]

    df = pd.read_csv(file)
    df.columns = new_cols
    # keep only what follows the "gene:" prefix of each identifier
    df['gene_id'] = df['gene_id'].map(lambda raw_id: raw_id.split("gene:")[1])
    df = df[['gene_id', 'log2FoldChange', 'padj']].rename(
        columns={'log2FoldChange': cols[1], 'padj': cols[2]}
    )
    dataframes.append(df)

In [None]:
# Outer-join all per-comparison tables on gene_id, so a gene is kept even when
# it is missing from some of the tables.
# Seeding with the first table and folding in the rest also works when only a
# single result table exists (indexing dataframes[1] would raise IndexError).
merged_df = dataframes[0]
for df in dataframes[1:]:
    merged_df = merged_df.merge(df, on='gene_id', how='outer')

In [None]:
# Display the merged table of all comparisons.
merged_df

## Merging DataFrames and Marking Only Up-/Down-Regulated Genes

In [None]:
# Load every result table listed in `files`, keep only the genes with an
# adjusted p-value <= 0.05 and replace their log2 fold change by a flag:
#   -1  log2FoldChange <= -1
#    1  log2FoldChange >=  1
#    0  -1 < log2FoldChange < 1
# Each table ends up with two columns: gene_id and log2FoldChange_<file name>.
new_cols = ['gene_id','baseMean','log2FoldChange','lfcSE','stat','pvalue','padj']
dataframes = []
for file in files:
    df = pd.read_csv(file)
    df.columns = new_cols
    df.gene_id = df.gene_id.apply(lambda x: x.split("gene:")[1])
    # Explicit copy: the .loc assignments below must modify an independent
    # frame, not a slice of the table just read (avoids SettingWithCopyWarning).
    df = df[df.padj <= 0.05].copy()
    df.loc[df.log2FoldChange <= -1,'log2FoldChange'] = -1
    df.loc[df.log2FoldChange >= 1,'log2FoldChange'] = 1
    # -1 and 1 written above lie outside this open interval and stay untouched
    df.loc[(df.log2FoldChange > -1) & (df.log2FoldChange < 1),'log2FoldChange'] = 0
    filename = file.split("/")[-1]
    filename = filename.split(".")[0]
    df = df[['gene_id','log2FoldChange']]
    cols = ['gene_id','log2FoldChange_'+filename]
    df.columns = cols
    dataframes.append(df)

In [None]:
# Outer-join the genome annotation with every per-comparison table on gene_id.
# Starting from the annotation and folding in all tables avoids special-casing
# the first one, so the cell works for any number of tables (including none).
merged_df = curvibacter_genes_df
for df in dataframes:
    merged_df = merged_df.merge(df, on='gene_id', how='outer')

In [None]:
# Replace every missing value in the merged table (all columns) with 0.
# NOTE(review): 0 is also the flag for significant genes with |log2FC| < 1, so
# after this step the two cases can no longer be told apart - confirm intended.
merged_df = merged_df.fillna(value=0)

In [None]:
# Write the merged table to CSV (the DataFrame index is written as well).
full_diff_csv_path = "../../results/curvibacter_full_diff_table_rsem.csv"
merged_df.to_csv(full_diff_csv_path)

In [None]:
# Write the same merged table as an Excel workbook.
full_diff_excel_path = "../../results/curvibacter_full_diff_excel_rsem.xlsx"
merged_df.to_excel(full_diff_excel_path)

In [None]:
# Fold-change flag columns, one per pairwise comparison, in display order.
fold_change_columns = [
    'log2FoldChange_hydra_mono_culture_kiel_vs_plate_mono_culture_kiel',
    'log2FoldChange_plate_mono_culture_kiel_vs_liquid_mono_culture_orgint',
    'log2FoldChange_plate_mono_culture_kiel_vs_liquid_mono_culture_kiel',
    'log2FoldChange_hydra_mono_culture_kiel_vs_liquid_mono_culture_orgint',
    'log2FoldChange_metatranscriptome_vs_liquid_mono_culture_orgint',
    'log2FoldChange_liquid_mono_culture_orgint_vs_metatranscriptome',
    'log2FoldChange_hydra_mono_culture_kiel_vs_metatranscriptome',
    'log2FoldChange_plate_mono_culture_kiel_vs_hydra_mono_culture_kiel',
    'log2FoldChange_metatranscriptome_vs_liquid_mono_culture_kiel',
    'log2FoldChange_plate_mono_culture_kiel_vs_metatranscriptome',
    'log2FoldChange_liquid_mono_culture_kiel_vs_liquid_mono_culture_orgint',
    'log2FoldChange_liquid_mono_culture_kiel_vs_plate_mono_culture_kiel',
    'log2FoldChange_metatranscriptome_vs_plate_mono_culture_kiel',
    'log2FoldChange_liquid_mono_culture_kiel_vs_hydra_mono_culture_kiel',
    'log2FoldChange_liquid_mono_culture_orgint_vs_plate_mono_culture_kiel',
    'log2FoldChange_hydra_mono_culture_kiel_vs_liquid_mono_culture_kiel',
    'log2FoldChange_liquid_mono_culture_orgint_vs_liquid_mono_culture_kiel',
    'log2FoldChange_liquid_mono_culture_kiel_vs_metatranscriptome',
    'log2FoldChange_metatranscriptome_vs_hydra_mono_culture_kiel',
    'log2FoldChange_liquid_mono_culture_orgint_vs_hydra_mono_culture_kiel',
]
# Genes x comparisons matrix of the selected columns as a plain NumPy array.
np.array(merged_df[fold_change_columns])