In [1]:
# Import packages 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import statsmodels.stats.multitest as smm

# 1 Import Data

In [12]:
# Import DEG data 
deg_na = pd.read_csv('11_identify_DEG/deg_na.csv')
deg_nr = pd.read_csv('11_identify_DEG/deg_nr.csv')
deg_ar = pd.read_csv('11_identify_DEG/deg_ar.csv')

deg={'deg_na': deg_na, 'deg_nr': deg_nr, 'deg_ar': deg_ar}

# Import GO data 
# Geneid to GO term
goterm = pd.read_csv('14_functional_annotation/ccar2swissprotGO_long.txt', sep='\t') 
# GO term to description
godesc = pd.read_csv('14_functional_annotation/go_data.txt', sep='\t')
godesc.set_index('id', inplace=True)


In [7]:
deg_na

Unnamed: 0.1,Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,regulation,condition
0,ccar_ub01-g23276,435.231278,1.199775,0.195821,4.193960,2.741266e-05,1.632904e-03,up,Normoxia to Anoxia
1,ccar_ub01-g23394,1608.129409,0.702181,0.089437,3.618972,2.957758e-04,1.205587e-02,up,Normoxia to Anoxia
2,ccar_ub01-g23440,439.783173,1.514115,0.169605,6.695587,2.148082e-11,4.912875e-09,up,Normoxia to Anoxia
3,ccar_ub01-g23466,381.122909,1.140354,0.187884,4.054848,5.016682e-05,2.741491e-03,up,Normoxia to Anoxia
4,ccar_ub01-g23597,260.407447,1.823130,0.208857,6.916779,4.620292e-12,1.172510e-09,up,Normoxia to Anoxia
...,...,...,...,...,...,...,...,...,...
1260,ccar_ub25-g45323,243.975530,-0.930533,0.171984,-3.209732,1.328590e-03,4.068231e-02,up,Normoxia to Anoxia
1261,ccar_ub25-g45369,1307.334970,-1.050800,0.150805,-4.458009,8.272431e-06,5.750504e-04,up,Normoxia to Anoxia
1262,ccar_ub25-g45370,450.518438,-1.353741,0.175860,-5.545470,2.931661e-08,3.935543e-06,up,Normoxia to Anoxia
1263,scaffold_44-g45718,350.751116,-1.286276,0.143563,-6.323114,2.563447e-10,5.025305e-08,up,Normoxia to Anoxia


# 2 GO go_na Analysis Output 

In [13]:
go_files = sorted(Path("14_functional_annotation").glob('deg_*_GOterms.csv'))

# total number of genes in the background dataset 
total_num_genes = pd.read_csv('12_gene_length_distribution/total_number_of_genes.csv', sep=',')

# create empty dict for df's
go_enriched = []


for f in go_files: 
    go_terms = pd.read_csv(f, sep=',')
    # strip emtpy space around GO terms 
    go_terms['category'] = go_terms['category'].str.strip()
    # since the term and ontology column are empty, I filled them with information from the godesc dataframe
    go_terms['term'] = godesc.loc[go_terms['category'], 'name'].to_numpy()
    go_terms['ontology'] = godesc.loc[go_terms['category'], 'namespace'].to_numpy()
    # calculate expected number in each category 
    go_terms['expectedDEInCat'] = (go_terms['numInCat']/len(total_num_genes))*len(deg[f.stem[:-8]])
    # calculate fold enrichment
    go_terms['foldEnrichment'] = (go_terms['numDEInCat']/go_terms['expectedDEInCat'])
    # calculate negative fold change from the FDR corrected values
    go_terms['-log10(FDR)'] = -np.log10(go_terms['over_represented_pvalue'])
    go_terms['padj'] = smm.multipletests(go_terms['over_represented_pvalue'], method='bonferroni')[1]
    go_terms = go_terms.sort_values(by='padj')
    go_terms = go_terms[go_terms['padj'] <= 0.05]#.iloc[-10:]
    if 'na' in f.name: 
        go_terms['comparison'] = 'Normoxia to Anoxia'
    elif 'nr' in f.name:
        go_terms['comparison'] = 'Normoxia to Reoxygenation'
    elif 'ar' in f.name:
        go_terms['comparison'] = 'Anoxia to Reoxygenation'
    
    # save to list 
    go_enriched.append(go_terms)
# merge the dataframes and add column with name of comparison
go = pd.concat(go_enriched)
go['logpadj'] = np.log10(go['padj'])



In [14]:
len(deg_ar)

462

# 3 Plot

In [15]:
go

Unnamed: 0,category,over_represented_pvalue,under_represented_pvalue,numDEInCat,numInCat,term,ontology,expectedDEInCat,foldEnrichment,-log10(FDR),padj,comparison,logpadj
1,GO:0050767,9.082899e-15,1.0,16,103,regulation of neurogenesis,BP,1.042022,15.354768,14.041776,1.737104e-10,Anoxia to Reoxygenation,-9.760174
2,GO:0072282,1.255152e-13,1.0,8,13,metanephric nephron tubule morphogenesis,BP,0.131517,60.828505,12.901304,2.400478e-09,Anoxia to Reoxygenation,-8.619702
3,GO:2000978,1.255152e-13,1.0,8,13,negative regulation of forebrain neuron differ...,BP,0.131517,60.828505,12.901304,2.400478e-09,Anoxia to Reoxygenation,-8.619702
4,GO:0072086,2.893861e-13,1.0,8,14,specification of loop of Henle identity,BP,0.141634,56.483612,12.538522,5.534509e-09,Anoxia to Reoxygenation,-8.256921
5,GO:0045608,6.183131e-13,1.0,8,15,negative regulation of inner ear auditory rece...,BP,0.151751,52.718038,12.208792,1.182524e-08,Anoxia to Reoxygenation,-7.927190
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22,GO:0048514,1.620713e-06,1.0,9,85,blood vessel morphogenesis,BP,1.079554,8.336775,5.790294,3.099615e-02,Normoxia to Reoxygenation,-1.508692
23,GO:0072049,2.026044e-06,1.0,5,17,comma-shaped body morphogenesis,BP,0.215911,23.157708,5.693351,3.874809e-02,Normoxia to Reoxygenation,-1.411750
24,GO:0005354,2.098545e-06,1.0,3,3,galactose transmembrane transporter activity,MF,0.038102,78.736207,5.678082,4.013467e-02,Normoxia to Reoxygenation,-1.396480
25,GO:0021781,2.197018e-06,1.0,5,17,glial cell fate commitment,BP,0.215911,23.157708,5.658166,4.201798e-02,Normoxia to Reoxygenation,-1.376565


In [None]:
# create a Facetgrid 3x3
g = sns.FacetGrid(go, col='comparison', row = 'ontology', sharey=False, sharex = False)
g.map_dataframe(sns.scatterplot, 'foldEnrichment','term', size= 'numDEInCat', size_norm=(0, 1))
g.add_legend(title='')




In [None]:


g.set_titles(col_template="{col_name}")
g.map_dataframe(add_deg_number)

# save as png
g.savefig('11_identify_DEG/vulcano_plot_DEG.png', dpi=350, bbox_inches='tight')