In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

%matplotlib inline 
sns.set_theme(style="whitegrid") 

#Pathways
INPUT_FILE = "/home/marcos/PRJEB59406/tabela_mestra_mag_sialidase_taxonomy.xlsx"
OUTPUT_DIR = "/home/marcos/PRJEB59406/mag_annotation/plots"

os.makedirs(OUTPUT_DIR, exist_ok=True)


In [None]:
def parse_taxonomy(row):
    """Extract phylum and genus from a ';'-separated lineage string.

    The string is expected to look like 'd__Bacteria;p__Firmicutes;...',
    i.e. each rank is written as '<rank letter>__<name>'.

    Parameters
    ----------
    row : str or missing value
        One entry of the 'classification' column.

    Returns
    -------
    pd.Series
        Two values, [phylum, genus]. 'Unknown' is used when the entry is
        missing or not a string, when the rank is absent, or when the rank
        is present with an empty name (e.g. 'g__').
    """
    # Missing cells (NaN/None) and any other non-string value carry no lineage.
    if not isinstance(row, str):
        return pd.Series(['Unknown', 'Unknown'])

    tax_dict = {}
    for part in row.split(';'):
        # Tolerate whitespace around the separators ('...; p__Firmicutes').
        rank, sep, name = part.strip().partition('__')
        if sep:
            tax_dict[rank] = name.strip()

    # `or` maps both a missing rank (None) and an empty name ('') to 'Unknown',
    # so Phylum and Genus are treated the same way.
    return pd.Series([tax_dict.get('p') or 'Unknown', tax_dict.get('g') or 'Unknown'])


try:
    df = pd.read_excel(INPUT_FILE)

    # Add the two taxonomy columns used by the plotting cells.
    df[['Phylum', 'Genus']] = df['classification'].apply(parse_taxonomy)

    print(f"MAGs: {len(df)}")

    display(df[['mag_id', 'Phylum', 'Genus', 'has_sialidase']].head())

except Exception as e:
    print(f"Erro: {e}")
    # Re-raise: continuing would leave `df` undefined (or half-built) and the
    # following cells would fail with an unrelated NameError/KeyError.
    raise

In [None]:
# Bar chart of the number of MAGs per phylum, most frequent phylum first.
fig, ax = plt.subplots(figsize=(10, 8))

# value_counts() sorts by descending frequency, which fixes the bar order.
order = df['Phylum'].value_counts().index

# NOTE(review): seaborn >= 0.13 is documented to warn when `palette` is passed
# without `hue` — confirm against the installed version before changing this.
sns.countplot(data=df, y='Phylum', order=order, palette='viridis', ax=ax)

ax.set_title("Diversity of Assembled MAGs (Phylum Level)", fontsize=16)
ax.set_xlabel("Number of MAGs")
ax.set_ylabel("Phylum")

# Save to disk, then show inline.
fig.tight_layout()
fig.savefig(f"{OUTPUT_DIR}/mag_distribution_phylum.png", dpi=300)
plt.show()

In [None]:
# Keep only the 12 most frequent phyla so the comparative plot stays readable.
top_phyla = df['Phylum'].value_counts().index[:12]
df_filtered = df.loc[df['Phylum'].isin(top_phyla)]

# Fixed colour per sialidase status: red = YES, gray = NO, light gray = No Data.
status_colors = {'YES': '#e74c3c', 'NO': '#95a5a6', 'No Data': '#ecf0f1'}

fig, ax = plt.subplots(figsize=(12, 8))

# One group of bars per phylum, split by sialidase status.
sns.countplot(
    data=df_filtered,
    y='Phylum',
    hue='has_sialidase',
    order=top_phyla,
    palette=status_colors,
    ax=ax,
)

ax.set_title("Sialidase Presence Across Major Bacterial Phyla", fontsize=16)
ax.set_xlabel("Number of MAGs")
ax.set_ylabel("Phylum")
ax.legend(title="Sialidase Status", loc='lower right')

# Save to disk, then show inline.
fig.tight_layout()
fig.savefig(f"{OUTPUT_DIR}/mag_sialidase_presence.png", dpi=300)
plt.show()

# Extra: list the genera that have at least one sialidase-positive MAG.
# This uses the full table, not only the top-12 phyla plotted above.
print("List of Sialidase-positive Genera:")
is_positive = df['has_sialidase'] == 'YES'
sialidase_positives = df.loc[is_positive, 'Genus'].unique()
print(sialidase_positives)