In [71]:
import subprocess
import textwrap
from os.path import isfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.cm import ScalarMappable

In [10]:
# Locations are relative to this notebook's directory. The directory strings
# keep their trailing "/" because file names are appended to them by plain
# string concatenation.
samples_path = "../../data/rsem/samples.csv"
data_path = "../../results/rsem/"
result_path = "../../results/gene_enrichment_analysis/goatools/"

# Sample sheet; its `treatment` column supplies the condition names used to
# look up the pairwise "<a>_vs_<b>.CSV" comparison tables.
samples_dataframe = pd.read_csv(samples_path)

# Preparation of input files for goatools

In [11]:
# Collect the paths of all pairwise comparison tables that exist on disk.
# Every ordered pair of distinct treatments is tried, so both "a_vs_b" and
# "b_vs_a" are kept when both files are present.
treatments = samples_dataframe.treatment.unique()
candidate_paths = (
    data_path + first + "_vs_" + second + ".CSV"
    for first in treatments
    for second in treatments
    if first != second
)
dataframes = [path for path in candidate_paths if isfile(path)]
dataframes

['../../results/rsem/liquid_mono_culture_orgint_vs_metatranscriptome.CSV',
 '../../results/rsem/liquid_mono_culture_orgint_vs_plate_mono_culture_kiel.CSV',
 '../../results/rsem/liquid_mono_culture_orgint_vs_liquid_mono_culture_kiel.CSV',
 '../../results/rsem/liquid_mono_culture_orgint_vs_hydra_mono_culture_kiel.CSV',
 '../../results/rsem/metatranscriptome_vs_liquid_mono_culture_orgint.CSV',
 '../../results/rsem/metatranscriptome_vs_plate_mono_culture_kiel.CSV',
 '../../results/rsem/metatranscriptome_vs_liquid_mono_culture_kiel.CSV',
 '../../results/rsem/metatranscriptome_vs_hydra_mono_culture_kiel.CSV',
 '../../results/rsem/plate_mono_culture_kiel_vs_liquid_mono_culture_orgint.CSV',
 '../../results/rsem/plate_mono_culture_kiel_vs_metatranscriptome.CSV',
 '../../results/rsem/plate_mono_culture_kiel_vs_liquid_mono_culture_kiel.CSV',
 '../../results/rsem/plate_mono_culture_kiel_vs_hydra_mono_culture_kiel.CSV',
 '../../results/rsem/liquid_mono_culture_kiel_vs_liquid_mono_culture_orgint.CSV

In [15]:
# Genome annotation table; the cells below read its `gene_id` and `GO` columns.
annotation_csv = "../../results/curvibacter_genome_annotation.csv"
curvibacter_genes_df = pd.read_csv(annotation_csv)

# Prefix every id with "gene:".
# NOTE(review): presumably this matches the ids in the comparison tables - confirm.
curvibacter_genes_df["gene_id"] = curvibacter_genes_df["gene_id"].map(
    lambda raw_id: "gene:" + raw_id
)
curvibacter_genes_df.head()

Unnamed: 0.1,Unnamed: 0,locus_tag,gene_id,wp_number,description,GO,GO_process
0,0,AEP_RS00005,gene:AEP_00001,WP_087493495.1,response regulator transcription factor,"GO:0000160,GO:0006355,GO:0003677",phosphorelay signal transduction system|000016...
1,1,AEP_RS00010,gene:AEP_00002,WP_087493496.1,sodium-translocating pyrophosphatase,"GO:1902600,GO:0009678",proton transmembrane transport|1902600||IEA
2,2,AEP_RS00015,gene:AEP_00003,WP_087493497.1,inorganic diphosphatase,"GO:0006796,GO:0004427",phosphate-containing compound metabolic proces...
3,3,AEP_RS00020,gene:AEP_00004,WP_087493498.1,alpha/beta fold hydrolase,unknown,unknown
4,4,AEP_RS00025,gene:AEP_00005,WP_087493499.1,chemotaxis protein CheW,"GO:0006935,GO:0007165","chemotaxis|0006935||IEA,signal transduction|00..."


In [34]:
# writing population and associations file
#
# populations.txt  : one GO-annotated gene id per line.
# associations.txt : "<gene_id>\t<GO id>;<GO id>;..." per line, which is the
#                    layout passed to find_enrichment.py as --annofmt id2gos.
#
# Genes whose GO cell is the placeholder "unknown" (or is missing) are left
# out of both files. `go_gene_ids` keeps the written ids, in file order, so
# later cells can restrict the study gene lists to annotated genes.
#
# The gene id and its GO string are read from the same row in a single pass,
# instead of re-filtering the whole table for every gene.
go_gene_ids = []
with open(result_path + "associations.txt", "w") as associations_file, \
        open(result_path + "populations.txt", "w") as populations_file:
    for gene_id, go_terms in zip(curvibacter_genes_df["gene_id"], curvibacter_genes_df["GO"]):
        # A missing cell is read by pandas as NaN, which has no .split().
        if pd.isna(go_terms) or go_terms == "unknown":
            continue

        go_gene_ids.append(gene_id)
        populations_file.write(gene_id + "\n")
        # The table separates GO ids with ","; the association file uses ";".
        associations_file.write(gene_id + "\t" + ";".join(go_terms.split(",")) + "\n")

In [57]:
# writing sample files for up/down regulated genes
#
# For every comparison table, write one gene list for the down-regulated and
# one for the up-regulated genes. Each list is written only when it holds more
# than 5 genes. `go_files` records the written paths (down before up).
go_files = []
for comparison_csv in dataframes:
    print("[+] Working with: {}".format(comparison_csv))

    comparison_table = pd.read_csv(comparison_csv)
    comparison_table.columns = ["gene_id","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj"]

    # Significant genes (padj <= 0.05) that also have GO annotation.
    significant = comparison_table[comparison_table["padj"] <= 0.05]
    annotated = significant[significant["gene_id"].isin(go_gene_ids)]

    # At least a two-fold change in either direction.
    downregulated_genes = annotated[annotated["log2FoldChange"] <= -1.0]
    upregulated_genes = annotated[annotated["log2FoldChange"] >= 1.0]

    print("\t[*] Length of downregulated genes: {}".format(len(downregulated_genes)))
    print("\t[*] Length of upregulated genes: {}".format(len(upregulated_genes)))

    # "<a>_vs_<b>" taken from ".../<a>_vs_<b>.CSV"
    sample = comparison_csv.rsplit("/", 1)[-1].partition(".CSV")[0]
    gene_lists = (
        (downregulated_genes, result_path + sample + "_downregulated_genes_goatools.txt"),
        (upregulated_genes, result_path + sample + "_upregulated_genes_goatools.txt"),
    )

    for regulated_genes, target_path in gene_lists:
        if len(regulated_genes) > 5:
            go_files.append(target_path)
            with open(target_path, "w") as gene_list_file:
                gene_list_file.writelines(
                    gene_id + "\n" for gene_id in regulated_genes["gene_id"]
                )

[+] Working with: ../../results/rsem/liquid_mono_culture_orgint_vs_metatranscriptome.CSV
	[*] Length of downregulated genes: 325
	[*] Length of upregulated genes: 344
[+] Working with: ../../results/rsem/liquid_mono_culture_orgint_vs_plate_mono_culture_kiel.CSV
	[*] Length of downregulated genes: 344
	[*] Length of upregulated genes: 331
[+] Working with: ../../results/rsem/liquid_mono_culture_orgint_vs_liquid_mono_culture_kiel.CSV
	[*] Length of downregulated genes: 221
	[*] Length of upregulated genes: 270
[+] Working with: ../../results/rsem/liquid_mono_culture_orgint_vs_hydra_mono_culture_kiel.CSV
	[*] Length of downregulated genes: 340
	[*] Length of upregulated genes: 531
[+] Working with: ../../results/rsem/metatranscriptome_vs_liquid_mono_culture_orgint.CSV
	[*] Length of downregulated genes: 344
	[*] Length of upregulated genes: 325
[+] Working with: ../../results/rsem/metatranscriptome_vs_plate_mono_culture_kiel.CSV
	[*] Length of downregulated genes: 501
	[*] Length of upreg

In [73]:
# Run goatools' find_enrichment.py once per study gene list.
#
# The command is started through subprocess with an argument list rather than
# an IPython "!" shell line, so that
#   * paths are passed verbatim (no shell word-splitting or interpolation),
#   * the population / association / ontology paths are derived from
#     `result_path` instead of being repeated as literals,
#   * a non-zero exit status is reported instead of always printing "DONE".
#
# `goafiles` collects the expected output table of every run, whether or not
# the tool wrote it; the plotting cell checks for the file before reading it.
population_file = result_path + "populations.txt"
association_file = result_path + "associations.txt"
obo_file = result_path + "go-basic.obo"

goafiles = []
for samplefile in go_files:
    print("[+] Working with {}".format(samplefile))
    outfile = result_path + samplefile.split("/")[-1].split(".txt")[0] + "_output.table"

    completed = subprocess.run(
        [
            "find_enrichment.py", samplefile, population_file, association_file,
            "--annofmt", "id2gos",
            "--alpha", "0.05",
            "--pval", "0.05",
            "--obo", obo_file,
            "--method", "fdr_bh",
            "--outfile", outfile,
            "--obsolete", "replace",
        ],
        stdout=subprocess.DEVNULL,   # same effect as the former "> /dev/null"
        stderr=subprocess.PIPE,      # kept so it can be shown if the run fails
        universal_newlines=True,     # decode stderr as text
    )

    goafiles.append(outfile)
    if completed.returncode != 0:
        print("[-] find_enrichment.py exited with status {}".format(completed.returncode))
        if completed.stderr:
            print(completed.stderr)
        continue
    print("[*] DONE")

[+] Working with ../../results/gene_enrichment_analysis/goatools/liquid_mono_culture_orgint_vs_metatranscriptome_downregulated_genes_goatools.txt
[*] DONE
[+] Working with ../../results/gene_enrichment_analysis/goatools/liquid_mono_culture_orgint_vs_metatranscriptome_upregulated_genes_goatools.txt
[*] DONE
[+] Working with ../../results/gene_enrichment_analysis/goatools/liquid_mono_culture_orgint_vs_plate_mono_culture_kiel_downregulated_genes_goatools.txt
[*] DONE
[+] Working with ../../results/gene_enrichment_analysis/goatools/liquid_mono_culture_orgint_vs_plate_mono_culture_kiel_upregulated_genes_goatools.txt
[*] DONE
[+] Working with ../../results/gene_enrichment_analysis/goatools/liquid_mono_culture_orgint_vs_liquid_mono_culture_kiel_downregulated_genes_goatools.txt
[*] DONE
[+] Working with ../../results/gene_enrichment_analysis/goatools/liquid_mono_culture_orgint_vs_liquid_mono_culture_kiel_upregulated_genes_goatools.txt
[*] DONE
[+] Working with ../../results/gene_enrichment_ana

[*] DONE
[+] Working with ../../results/gene_enrichment_analysis/goatools/liquid_mono_culture_kiel_vs_liquid_mono_culture_orgint_downregulated_genes_goatools.txt
[*] DONE
[+] Working with ../../results/gene_enrichment_analysis/goatools/liquid_mono_culture_kiel_vs_liquid_mono_culture_orgint_upregulated_genes_goatools.txt
[*] DONE
[+] Working with ../../results/gene_enrichment_analysis/goatools/liquid_mono_culture_kiel_vs_metatranscriptome_downregulated_genes_goatools.txt
[*] DONE
[+] Working with ../../results/gene_enrichment_analysis/goatools/liquid_mono_culture_kiel_vs_metatranscriptome_upregulated_genes_goatools.txt
[*] DONE
[+] Working with ../../results/gene_enrichment_analysis/goatools/liquid_mono_culture_kiel_vs_plate_mono_culture_kiel_downregulated_genes_goatools.txt
[*] DONE
[+] Working with ../../results/gene_enrichment_analysis/goatools/liquid_mono_culture_kiel_vs_plate_mono_culture_kiel_upregulated_genes_goatools.txt
[*] DONE
[+] Working with ../../results/gene_enrichment_an

In [77]:
def plot_goa(goafile_enriched:pd.DataFrame,savep:str, filename:str):
    """Draw a two-panel GO enrichment figure and save it as ``<savep><filename>.jpg``.

    Left panel: horizontal bars with ``study_count`` per GO term name.
    Right panel: ``study_count`` divided by the numerator of ``ratio_in_pop``;
    the marker area is the ``ratio_in_study`` fraction times 1000.
    Bars and markers are coloured with the ``RdBu_r`` colormap by
    ``p_fdr_bh`` divided by the largest ``p_fdr_bh`` in the table.

    Parameters
    ----------
    goafile_enriched : pd.DataFrame
        Table with the columns ``name``, ``study_count``, ``p_fdr_bh`` and the
        ``"a/b"`` formatted string columns ``ratio_in_study`` and
        ``ratio_in_pop``. The frame is only read, never modified.
    savep : str
        Output location prefix. It is concatenated directly with ``filename``,
        so a directory must end with its path separator.
    filename : str
        Output file name without extension.

    Returns
    -------
    None
        An empty table produces a message and no figure.
    """
    print("[*] Producing plot for {}".format(filename))

    # Nothing to draw; max() of an empty p-value column would raise below.
    if len(goafile_enriched) == 0:
        print("[!] No GO terms to plot for {} - skipping".format(filename))
        return

    # Parse the "a/b" strings into local arrays instead of adding helper
    # columns to the caller's frame.
    study_ratio = np.array([
        int(ratio.split("/")[0]) / int(ratio.split("/")[1])
        for ratio in goafile_enriched["ratio_in_study"]
    ])
    amount_in_pop = np.array([
        int(ratio.split("/")[0]) for ratio in goafile_enriched["ratio_in_pop"]
    ])

    # Wrap long GO term names so the y tick labels stay narrow.
    categories = [
        textwrap.fill(term, width=30) if len(term) >= 30 else term
        for term in goafile_enriched["name"]
    ]

    values = list(goafile_enriched["study_count"])
    scatter_values = np.array(goafile_enriched["study_count"]) / amount_in_pop

    # Scale p-values by their maximum for the colormap. If every p-value is 0
    # the division would give NaN, so all terms get the low end instead.
    p_values = np.asarray(goafile_enriched["p_fdr_bh"], dtype=float)
    max_p = p_values.max()
    norm_p_values = p_values / max_p if max_p > 0 else np.zeros_like(p_values)
    colors = plt.cm.RdBu_r(norm_p_values)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 16), sharey=True)
    try:
        # Plot horizontal bar plot on ax1
        ax1.barh(categories, values, color=colors, edgecolor="black")
        ax1.set_xlabel('Count')

        # `colors` already holds RGBA values, so no cmap is passed here
        # (matplotlib ignores cmap in that case and warns about it).
        ax2.scatter(scatter_values, categories, c=colors,
                    label='Gene Ratio (compared to Study)', s=study_ratio * 1000,
                    edgecolor="black")

        ax2.set_xlabel('Count in Study / Count in Pop')
        ax1.set_ylabel('GO Categories')
        ax1.invert_yaxis()
        fig.subplots_adjust(left=0.2, wspace=0.1)

        # The two ticks sit at the scaled min/max p-value and are labelled
        # with the corresponding unscaled p-values.
        cbar = fig.colorbar(ScalarMappable(cmap='RdBu_r'), ax=[ax1, ax2], pad = 0.005)
        cbar.set_label('P-values')
        cbar.set_ticks([norm_p_values.min(), norm_p_values.max()])
        cbar.set_ticklabels([f'{p_values.min():.4f}', f'{p_values.max():.4f}'])

        cbar.ax.set_position([0.85, 0.15, 0.03, 0.7])

        fig.savefig(savep + filename + ".jpg", dpi=400)
    finally:
        # Close this figure even if drawing or saving failed; the function is
        # called in a loop and open figures would otherwise pile up.
        plt.close(fig)
    print("[*] DONE")

In [78]:
# Plot every enrichment table that exists on disk; expected tables that are
# missing are skipped.
existing_tables = (table_path for table_path in goafiles if isfile(table_path))
for table_path in existing_tables:
    # ".../<sample>_<direction>_genes_goatools_output.table" -> "<sample>_<direction>_genes"
    figure_name = table_path.rsplit("/", 1)[-1].partition("_goatools_output.table")[0]
    plot_goa(pd.read_table(table_path), result_path, figure_name)

[*] Producing plot for liquid_mono_culture_orgint_vs_metatranscriptome_downregulated_genes
[*] DONE
[*] Producing plot for liquid_mono_culture_orgint_vs_metatranscriptome_upregulated_genes
[*] DONE
[*] Producing plot for liquid_mono_culture_orgint_vs_plate_mono_culture_kiel_downregulated_genes
[*] DONE
[*] Producing plot for liquid_mono_culture_orgint_vs_plate_mono_culture_kiel_upregulated_genes
[*] DONE
[*] Producing plot for liquid_mono_culture_orgint_vs_liquid_mono_culture_kiel_upregulated_genes
[*] DONE
[*] Producing plot for liquid_mono_culture_orgint_vs_hydra_mono_culture_kiel_downregulated_genes
[*] DONE
[*] Producing plot for liquid_mono_culture_orgint_vs_hydra_mono_culture_kiel_upregulated_genes
[*] DONE
[*] Producing plot for metatranscriptome_vs_liquid_mono_culture_orgint_downregulated_genes
[*] DONE
[*] Producing plot for metatranscriptome_vs_liquid_mono_culture_orgint_upregulated_genes
[*] DONE
[*] Producing plot for metatranscriptome_vs_plate_mono_culture_kiel_downregulat