# In [None]:
import pandas as pd
import numpy as np
from scipy.stats import hypergeom
import matplotlib.pyplot as plt
import os

# ── 1. Load data
# Per-clan enrichment table; its 'Total' column is read below.
enrichment_file = "Figure_3B_data.csv"
clan_df = pd.read_csv(enrichment_file)

# Dataset-wide counts: all domains (X) and the subset that passed FDR (S).
total_domains = 11533
selected_domains = 1288

# ── 2. Calculate Expected values ──
# Expected = Total × (S / X): each clan's size scaled by the overall
# selected fraction.
selected_fraction = selected_domains / total_domains
clan_df['Expected'] = clan_df['Total'] * selected_fraction

# ── 3. Compute hypergeometric P-values ──
# One-sided test per clan: P(X >= Observed), where X is hypergeometric with
# population size total_domains, selected_domains marked items in the
# population, and the clan's 'Total' as the number of draws.
# sf(k) is P(X > k), so Observed - 1 is passed to include the observed count.
# hypergeom.sf broadcasts over arrays, so all clans are evaluated in a single
# call instead of a row-by-row iterrows() loop.
p_values = hypergeom.sf(
    clan_df['Observed'].to_numpy() - 1,
    total_domains,
    selected_domains,
    clan_df['Total'].to_numpy(),
)
clan_df['P-value'] = p_values

# ── 4. Save updated enrichment dataframe ──
# Written without the index column, to the same path the table was read from,
# so the input CSV is overwritten with the added columns.
updated_csv_path = "Figure_3B_data.csv"
clan_df.to_csv(updated_csv_path, index=False)
print("Updated enrichment CSV saved.")

# ── 5. Load Fig3c data ──
# Read the enrichment table back from disk (the Figure 3B CSV).
fig3c_file = "Figure_3B_data.csv"
fig3c_df = pd.read_csv(fig3c_file)

# Keep only clans whose P-value is at most 0.05; copy so the new columns
# below are added without touching fig3c_df.
significance_mask = fig3c_df['P-value'].le(0.05)
clans_significant = fig3c_df.loc[significance_mask].copy()

# Add log10-transformed versions of the observed and expected counts.
for count_column in ('Observed', 'Expected'):
    clans_significant[f'{count_column}_log10'] = np.log10(clans_significant[count_column])

# ── 6. Plot Observed vs Expected domains (log10) ──
plt.figure(figsize=(6, 8))

# One line per series: log10 count on the x-axis, clan on the y-axis.
# Each entry is (data column, legend label, line colour).
series_specs = (
    ('Observed_log10', 'Observed (log10)', 'red'),
    ('Expected_log10', 'Expected (log10)', 'blue'),
)
for data_column, legend_label, line_colour in series_specs:
    plt.plot(
        clans_significant[data_column],
        clans_significant['Clans'],
        label=legend_label,
        color=line_colour,
        linewidth=1,
    )

# Shared text styling for the axis labels and tick labels.
axis_font = {'fontsize': 8, 'family': 'Arial'}
plt.xlabel('Domains count (log10)', **axis_font)
plt.ylabel('Clans', **axis_font)
plt.xticks(**axis_font)
plt.yticks(**axis_font)

plt.legend(fontsize=8)
plt.grid(True)
plt.tight_layout()


# ── 7. Save figure
# Write the current pyplot figure once per listed format, then display it.
export_formats = ['pdf']
for fmt in export_formats:
    output_path = 'Figure_3B.' + fmt
    plt.savefig(output_path, dpi=1200, format=fmt)
    print("Saved: " + output_path)

plt.show()

