In [1]:
import pandas as pd
import numpy as np
import os

In [4]:
# Inputs: the MAG enrichment tables live under the Chennai dataset folder.
DATASET = os.path.join("..", "data", "Chennai_data")
MAG = os.path.join(DATASET, "MAG_enrichment_analysis")

# Outputs: filtered tables are written here; create the folder if it is missing.
ENRICHMENT = os.path.join("..", "results", "MAG_enrichment_analysis")
os.makedirs(ENRICHMENT, exist_ok=True)

In [5]:
# Load the tab-separated COG function enrichment tables, one per species.
Cutibacterium_acnes_df = pd.read_csv(
    os.path.join(MAG, "COG_function_enrichment_Cutibacterium_acnes.txt"),
    sep="\t",
)
Stutzerimonas_stutzeri_df = pd.read_csv(
    os.path.join(MAG, "COG_function_enrichment.txt"),
    sep="\t",
)

# Preview the first rows of the C. acnes table.
Cutibacterium_acnes_df.head()

Unnamed: 0,COG20_FUNCTION,enrichment_score,unadjusted_p_value,adjusted_q_value,associated_groups,accession,gene_clusters_ids,p_Reference,p_Chennai,N_Reference,N_Chennai
0,Alanine dehydrogenase (includes sporulation pr...,44.10896,3.10596e-11,4.149562e-08,Reference,COG0686,GC_00000037,1.0,0.1667,39,18
1,Ribosomal protein L36 (RpmJ) (PDB:1DFE) (PUBME...,40.209305,2.281589e-10,1.524102e-07,Chennai,COG0257,"GC_00002317, GC_00002387",0.0,0.7778,39,18
2,Translation initiation factor IF-1 (InfA) (PDB...,36.488698,1.53554e-09,4.102964e-07,Chennai,COG0361,GC_00002259,0.0,0.7222,39,18
3,Beta-glucosidase/6-phospho-beta-glucosidase/be...,36.488698,1.53554e-09,4.102964e-07,Reference,COG2723,GC_00002100,1.0,0.2778,39,18
4,Phosphotransferase system cellobiose-specific ...,36.488698,1.53554e-09,4.102964e-07,Reference,COG1455,GC_00002117,1.0,0.2778,39,18


In [6]:
def enrichment_analysis(df, min_Log2FC, max_q_value=0.05):
    """Score each row's Chennai-vs-Reference log2 fold change and keep the hits.

    For each group a pseudocount-smoothed proportion is computed as
    ``(N * p + 1) / (N + 1)``, so a raw proportion of 0 never reaches
    ``log2(0)``. ``Log2FC`` is ``log2(pseudo_p_Chennai) - log2(pseudo_p_Reference)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``N_Reference``, ``p_Reference``,
        ``N_Chennai``, ``p_Chennai`` and ``adjusted_q_value``.
    min_Log2FC : float
        Rows are kept only when ``Log2FC`` is strictly greater than this.
    max_q_value : float, default 0.05
        Rows are kept only when ``adjusted_q_value`` is strictly smaller
        than this.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added columns ``pseudo_p_Reference``,
        ``pseudo_p_Chennai``, ``log2_Reference``, ``log2_Chennai`` and
        ``Log2FC``, restricted to the rows passing both thresholds. Index
        labels of the kept rows are preserved. ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If any required input column is missing from ``df``.
    """
    required = ("N_Reference", "p_Reference", "N_Chennai", "p_Chennai", "adjusted_q_value")
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"enrichment_analysis: missing required column(s): {missing}")

    # Add-one smoothing of the per-group proportions (avoids log2(0) = -inf).
    pseudo_reference = (df["N_Reference"] * df["p_Reference"] + 1) / (df["N_Reference"] + 1)
    pseudo_chennai = (df["N_Chennai"] * df["p_Chennai"] + 1) / (df["N_Chennai"] + 1)
    log2_reference = np.log2(pseudo_reference)
    log2_chennai = np.log2(pseudo_chennai)

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    scored = df.assign(
        pseudo_p_Reference=pseudo_reference,
        pseudo_p_Chennai=pseudo_chennai,
        log2_Reference=log2_reference,
        log2_Chennai=log2_chennai,
        Log2FC=log2_chennai - log2_reference,
    )

    # Both comparisons are strict; NaN values compare False and are dropped.
    keep = (scored["adjusted_q_value"] < max_q_value) & (scored["Log2FC"] > min_Log2FC)
    return scored.loc[keep].copy()

In [7]:
# The same Log2FC cut-off is applied to both species.
MIN_LOG2FC = 2.97

Cutibacterium_acnes_df = enrichment_analysis(Cutibacterium_acnes_df, min_Log2FC=MIN_LOG2FC)
Stutzerimonas_stutzeri_df = enrichment_analysis(Stutzerimonas_stutzeri_df, min_Log2FC=MIN_LOG2FC)

# Write the filtered tables to the results folder without the row index.
Cutibacterium_acnes_df.to_csv(
    os.path.join(ENRICHMENT, "Table_2_Cutibacterium_acnes.csv"),
    index=False,
)
Stutzerimonas_stutzeri_df.to_csv(
    os.path.join(ENRICHMENT, "Table_3_Stutzerimonas_stutzeri.csv"),
    index=False,
)


In [8]:
# Preview the first five rows of the filtered C. acnes table.
Cutibacterium_acnes_df.head(n=5)

Unnamed: 0,COG20_FUNCTION,enrichment_score,unadjusted_p_value,adjusted_q_value,associated_groups,accession,gene_clusters_ids,p_Reference,p_Chennai,N_Reference,N_Chennai,pseudo_p_Reference,pseudo_p_Chennai,log2_Reference,log2_Chennai,Log2FC
1,Ribosomal protein L36 (RpmJ) (PDB:1DFE) (PUBME...,40.209305,2.281589e-10,1.524102e-07,Chennai,COG0257,"GC_00002317, GC_00002387",0.0,0.7778,39,18,0.025,0.789495,-5.321928,-0.340998,4.98093
2,Translation initiation factor IF-1 (InfA) (PDB...,36.488698,1.53554e-09,4.102964e-07,Chennai,COG0361,GC_00002259,0.0,0.7222,39,18,0.025,0.736821,-5.321928,-0.440614,4.881314
5,Transposase InsO and inactivated derivatives (...,35.930653,2.044666e-09,4.552789e-07,Chennai,COG2801,"GC_00002265, GC_00002691, GC_00002814, GC_0000...",0.0256,0.7778,39,18,0.04996,0.789495,-4.323083,-0.340998,3.982084
110,"Acyl-CoA thioesterase PaaI, contains HGG motif...",11.876367,0.0005685073,0.006721467,Chennai,COG2050,"GC_00002687, GC_00002810",0.0,0.2778,39,18,0.025,0.315811,-5.321928,-1.662869,3.659059
111,"Transposase and inactivated derivatives, IS30 ...",11.876367,0.0005685073,0.006721467,Chennai,COG2826,"GC_00002688, GC_00002742",0.0,0.2778,39,18,0.025,0.315811,-5.321928,-1.662869,3.659059


In [9]:
# Preview the first five rows of the filtered S. stutzeri table.
Stutzerimonas_stutzeri_df.head(n=5)

Unnamed: 0,COG20_FUNCTION,enrichment_score,unadjusted_p_value,adjusted_q_value,associated_groups,accession,gene_clusters_ids,p_Reference,p_Chennai,N_Reference,N_Chennai,pseudo_p_Reference,pseudo_p_Chennai,log2_Reference,log2_Chennai,Log2FC
10,Dephospho-CoA kinase (CoaE) (PDB:1JJV),26.840008,2.210141e-07,5e-05,Chennai,COG0237,GC_00003871,0.0,0.9091,20,11,0.047619,0.916675,-4.392317,-0.125518,4.2668
15,Protein translocase subunit SecG (SecG) (PDB:2...,23.057852,1.571992e-06,0.000194,Chennai,COG1314,GC_00003907,0.0,0.8182,20,11,0.047619,0.83335,-4.392317,-0.263006,4.129312
68,Ribosomal protein L36 (RpmJ) (PDB:1DFE) (PUBME...,15.800646,7.037849e-05,0.002516,Chennai,COG0257,GC_00003895,0.05,0.7273,20,11,0.095238,0.750025,-3.392317,-0.414989,2.977328
222,Ribosomal protein S27E (RPS27A) (PDB:1QXF),8.350259,0.003856323,0.042471,Chennai,COG2051,GC_00004419,0.0,0.3636,20,11,0.047619,0.416633,-4.392317,-1.26315,3.129168
223,TolB amino-terminal domain (function unknown) ...,8.350259,0.003856323,0.042471,Chennai,COG5616,GC_00004349,0.0,0.3636,20,11,0.047619,0.416633,-4.392317,-1.26315,3.129168
