In [None]:
import pandas as pd
from scipy.stats import pointbiserialr
from statsmodels.stats.multitest import fdrcorrection

# Input workbook (path is relative to the current working directory).
binary_file = "ALL_binary_encoded_domains.xlsx"

# Read the workbook into a DataFrame with the openpyxl engine, then show a
# quick sanity check: confirmation message, dimensions and the first rows.
df_binary = pd.read_excel(binary_file, engine="openpyxl")
print("✅ File loaded successfully!")
print("Shape:", df_binary.shape)
print("First 5 rows:")
display(df_binary.head(5))

# ---- Point-biserial correlation analysis ----
print("Starting analysis...")
print(f"Data shape: {df_binary.shape}")

# Outcome variable: the PIRACopyNumber column.
pcn = df_binary['PIRACopyNumber']

# Predictor columns, selected by position: 0-based indices 2..11535, i.e. the
# 3rd through 11536th columns of the sheet.
# NOTE(review): presumably these are the 0/1-encoded domain indicators and the
# first two columns are identifiers/metadata - confirm against the workbook.
domains = df_binary.iloc[:, 2:11536]

# 1. Correlate every domain column with PIRACopyNumber, reporting progress
total_domains = domains.shape[1]
print(f"\nCalculating point-biserial correlations for {total_domains} domains...")

results = []
for i, (domain, indicator) in enumerate(domains.items(), start=1):
    # pointbiserialr returns the correlation coefficient and its p-value.
    r, p = pointbiserialr(indicator, pcn)
    results.append({"Domain": domain, "Correlation": r, "p_value": p})

    # Progress line every 500 domains, plus one for the final domain.
    if i == total_domains or i % 500 == 0:
        print(f"Processed {i}/{total_domains} domains ({(i/total_domains)*100:.1f}%)")

# 2. Create DataFrame from results
# Explicit column names keep the frame well-formed (and the 'p_value' lookup
# below valid) even if the results list is empty.
results_df = pd.DataFrame(results, columns=["Domain", "Correlation", "p_value"])

# 3. Apply FDR correction (fdrcorrection's default Benjamini-Hochberg method)
# Only domains with a defined p-value take part in the correction.
# pointbiserialr yields NaN when a domain cannot be tested (for example a
# constant column); fdrcorrection does not skip NaNs, and a single one
# propagates through its cumulative minimum so that every q-value becomes NaN.
# Untestable domains therefore keep q_value = NaN and never pass the filter.
results_df['q_value'] = float('nan')
testable = results_df['p_value'].notna()
if testable.any():
    results_df.loc[testable, 'q_value'] = fdrcorrection(
        results_df.loc[testable, 'p_value'].to_numpy(dtype=float)
    )[1]

# 4. Filter significant domains (q < 0.05), largest |correlation| first
significant_domains = results_df[results_df['q_value'] < 0.05].sort_values(
    'Correlation',
    key=abs,
    ascending=False,
)

# 5. Write both result tables to CSV (row index omitted)
raw_output_path = "point_biserial_raw_results.csv"
sig_output_path = "significant_domains_post_fdr.csv"

for table, destination in (
    (results_df, raw_output_path),            # every analysed domain
    (significant_domains, sig_output_path),   # domains passing the q < 0.05 filter
):
    table.to_csv(destination, index=False)

# 6. Print summary
print("\n=== Results Summary ===")
print(f"Total domains analyzed: {len(results_df)}")
print(f"Significant domains (q < 0.05): {len(significant_domains)}")

# Report each direction from its own subset. Taking max()/min() over all
# significant correlations would label a negative value as the "strongest
# positive" (or vice versa) when every significant correlation has the same
# sign, and would print "nan" when nothing is significant.
sig_corr = significant_domains['Correlation']
positive_corr = sig_corr[sig_corr > 0]
negative_corr = sig_corr[sig_corr < 0]
strongest_positive = "n/a" if positive_corr.empty else f"{positive_corr.max():.3f}"
strongest_negative = "n/a" if negative_corr.empty else f"{negative_corr.min():.3f}"
print(f"Strongest positive correlation: {strongest_positive}")
print(f"Strongest negative correlation: {strongest_negative}")

print(f"\nRaw results saved to: {raw_output_path}")
print(f"Significant domains saved to: {sig_output_path}")

# Show top 5 significant domains (the table is ordered by |correlation|)
print("\nTop 5 most significant domains:")
display(significant_domains.head())
