In [1]:
# Import packages 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import statsmodels.stats.multitest as smm
import json

In [2]:
# Import GO data 
# Geneid to GO term
goterm = pd.read_csv('11_GO_enrichment_analysis/ccar2swissprotGO_long.txt', sep='\t') 
# GO term to description
godesc = pd.read_csv('11_GO_enrichment_analysis/go_data.txt', sep='\t')
godesc.set_index('id', inplace=True)

In [3]:
# import DMR (for calculating the expectedDEInCat value) 
# I need the length of the DMRs 
with open(f'/home/maggy/WholeGenomeBisulphiteSequencing/GO_analysis/1_prepare_data_for_GO_analysis/dmr_unique_geneids.json', 'r') as file:
    dmr_geneids = json.load(file)

In [4]:
go_files = sorted(Path("11_GO_enrichment_analysis").glob('*_GOterms.csv'))

# total number of genes in the background dataset 
total_num_genes = pd.read_csv('/home/maggy/WholeGenomeBisulphiteSequencing/RNAseq_pipeline/12_gene_length_distribution/total_number_of_genes.csv', sep=',')

# create empty dict for df's
go_enriched = []


for f in go_files: 
    go_terms = pd.read_csv(f, sep=',')
    # strip emtpy space around GO terms 
    go_terms['category'] = go_terms['category'].str.strip()
    # since the term and ontology column are empty, I filled them with information from the godesc dataframe
    go_terms['term'] = godesc.loc[go_terms['category'], 'name'].to_numpy()
    go_terms['ontology'] = godesc.loc[go_terms['category'], 'namespace'].to_numpy()
    # calculate expected number in each category 
    go_terms['expectedDEInCat'] = (go_terms['numInCat']/len(total_num_genes))*len(dmr_geneids[f.stem[:4]])
    # calculate fold enrichment
    go_terms['foldEnrichment'] = (go_terms['numDEInCat']/go_terms['expectedDEInCat'])
    # calculate negative fold change from the FDR corrected values
    go_terms['-log10(FDR)'] = -np.log10(go_terms['over_represented_pvalue'])
    go_terms['padj'] = smm.multipletests(go_terms['over_represented_pvalue'], method='bonferroni')[1]
    go_terms = go_terms.sort_values(by='padj')
    #go_terms = go_terms[go_terms['padj'] <= 0.05]#.iloc[-10:]
    if 'NvsA' in f.name: 
        go_terms['comparison'] = 'Normoxia to Anoxia'
    elif 'NvsR' in f.name:
        go_terms['comparison'] = 'Normoxia to Reoxygenation'
    elif 'AvsR' in f.name:
        go_terms['comparison'] = 'Anoxia to Reoxygenation'
    
    # save to list 
    go_enriched.append(go_terms)
# merge the dataframes and add column with name of comparison
go = pd.concat(go_enriched)
go['logpadj'] = np.log10(go['padj'])

In [10]:
dmr_test = go.loc[go['over_represented_pvalue']<= 0.05]

In [11]:
dmr_test

Unnamed: 0,category,over_represented_pvalue,under_represented_pvalue,numDEInCat,numInCat,term,ontology,expectedDEInCat,foldEnrichment,-log10(FDR),padj,comparison,logpadj
1,GO:0036042,0.000632,1.000000,1,2,long-chain fatty acyl-CoA binding,MF,0.000657,1522.233333,3.199491,1.0,Anoxia to Reoxygenation,0.0
53,GO:0120293,0.009760,0.999957,1,33,dynein axonemal particle,CC,0.010839,92.256566,2.010551,1.0,Anoxia to Reoxygenation,0.0
52,GO:0021954,0.009663,0.999958,1,32,central nervous system neuron development,BP,0.010511,95.139583,2.014872,1.0,Anoxia to Reoxygenation,0.0
51,GO:0016242,0.009480,0.999960,1,30,negative regulation of macroautophagy,BP,0.009854,101.482222,2.023179,1.0,Anoxia to Reoxygenation,0.0
50,GO:0007155,0.009372,0.999960,1,30,cell adhesion,BP,0.009854,101.482222,2.028177,1.0,Anoxia to Reoxygenation,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,GO:0070034,0.016985,0.999865,1,29,telomerase RNA binding,MF,0.017781,56.240148,1.769943,1.0,Normoxia to Reoxygenation,0.0
98,GO:0007141,0.016978,0.999865,1,29,male meiosis I,BP,0.017781,56.240148,1.770111,1.0,Normoxia to Reoxygenation,0.0
97,GO:0019725,0.016439,0.999874,1,28,cellular homeostasis,BP,0.017168,58.248724,1.784128,1.0,Normoxia to Reoxygenation,0.0
96,GO:0043069,0.016427,0.999874,1,28,negative regulation of programmed cell death,BP,0.017168,58.248724,1.784441,1.0,Normoxia to Reoxygenation,0.0


In [6]:
dmr_test_na = dmr_test.loc[dmr_test['comparison'] == 'Normoxia to Anoxia']	
dmr_test_nr = dmr_test.loc[dmr_test['comparison'] == 'Normoxia to Reoxygenation']
dmr_test_ar = dmr_test.loc[dmr_test['comparison'] == 'Anoxia to Reoxygenation']

# select only two columns
dmr_test_na = dmr_test_na[['category', 'over_represented_pvalue']]
dmr_test_nr = dmr_test_nr[['category', 'over_represented_pvalue']]
dmr_test_ar = dmr_test_ar[['category', 'over_represented_pvalue']]

In [7]:
dmr_test_na.to_csv('11_GO_enrichment_analysis/2_GO_NvsA.csv', index=False, sep='\t')
dmr_test_nr.to_csv('11_GO_enrichment_analysis/2_GO_NvsR.csv', index=False, sep='\t')
dmr_test_ar.to_csv('11_GO_enrichment_analysis/2_GO_AvsR.csv', index=False, sep='\t')