# Script: Beta Diversity with PCA-enhanced db-RDA

## Purpose

This script performs beta diversity analysis corrected by a null model, applies PCA-reduced db-RDA, and visualizes the relationship between microbial communities and environmental factors.

## Key Features

- Bray-Curtis distance and SES computation
- PCA-reduced environmental variables for db-RDA
- R² and adjusted R² calculation
- Group-wise diversity visualization
- PCoA and KDE-based ellipses
- Environmental influence plots

## Outputs

- `db_rda_results.txt`: SES, p-value, R², R²_adj
- `Within_Group_Beta_Diversity.png`
- `Variation_Contribution_Fixed.png`
- `PCoA_Scatter.png`
- `PCoA_With_Ellipses.png`
- `Environmental_Factors_vs_Beta_Diversity.png`

## Dependencies

- pandas, numpy, matplotlib, seaborn
- scikit-bio, scipy, statsmodels, scikit-learn


## 🧪 PCA-Enhanced db-RDA Analysis on Beta Diversity

This script analyzes microbial beta diversity using null-model correction and performs db-RDA using PCA-reduced environmental variables.

### Features

- Bray-Curtis beta diversity matrix
- Null model-based SES calculation
- db-RDA with PCA dimension reduction
- Multiple visualizations including boxplots, scatter, KDE ellipses, and environmental overlays

### Usage

```bash
python scripts/beta_diversity_pca_dbrda_analysis.py
```


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from skbio.diversity import beta_diversity
from skbio.stats.ordination import pcoa
from skbio.stats.distance import DistanceMatrix
from scipy.stats import ttest_1samp
import statsmodels.api as sm
import seaborn as sns

# Input tables and the folder that receives every result file
relative_abundance_file = r'.../grelative.xls.csv'
env_factors_file = r'.../Enviroment elements B.csv'
output_folder = r'.../BetaDiversityResults'
os.makedirs(output_folder, exist_ok=True)

# Read both tables; the first CSV column becomes the row index of each
otu_df, env_factors = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (relative_abundance_file, env_factors_file)
)

# Keep only the samples present both as OTU-table columns and as
# rows of the environmental table, in the same order for both
common_samples = otu_df.columns.intersection(env_factors.index)
otu_df = otu_df.loc[:, common_samples]
env_factors = env_factors.loc[common_samples, :]

# Rescale every sample (column) so that its abundances sum to 1
sample_totals = otu_df.sum(axis=0)
otu_df = otu_df.div(sample_totals, axis=1)

# Null model-based beta diversity and SES
def calculate_beta_diversity_with_null_model(data, metric="braycurtis", num_iterations=100,
                                             random_state=None):
    """Compute beta diversity and compare its mean with a permutation null model.

    The test statistic is the mean pairwise distance between samples. The null
    distribution of that statistic is built by independently shuffling the
    values inside every column of ``data`` and recomputing the distances.

    Parameters
    ----------
    data : pandas.DataFrame
        Abundance table whose columns are samples (the table is transposed
        before being handed to ``beta_diversity`` and the column labels are
        used as sample IDs).
    metric : str, default "braycurtis"
        Distance metric name forwarded to ``beta_diversity``.
    num_iterations : int, default 100
        Number of randomised tables to generate. Must be at least 2 so that
        the spread of the null statistic can be estimated.
    random_state : int or None, default None
        Seed for a dedicated NumPy generator, for reproducible results.
        ``None`` keeps using NumPy's global random state.

    Returns
    -------
    observed_dist : skbio DistanceMatrix
        Distance matrix of the unshuffled data.
    ses : float
        Standardised effect size,
        ``(observed mean - mean of null means) / sd of null means``;
        ``nan`` when the null means have no spread.
    p_value : float
        Two-sided one-sample t-test p-value of the null means against the
        observed mean.

    Raises
    ------
    ValueError
        If ``num_iterations`` is smaller than 2.
    """
    if num_iterations < 2:
        raise ValueError("num_iterations must be at least 2 to estimate the null spread")

    observed_dist = beta_diversity(metric, data.values.T, ids=data.columns)
    # Only the unique off-diagonal pairs carry information; the zero diagonal
    # and the mirrored lower triangle are left out of the statistic.
    observed_mean = observed_dist.condensed_form().mean()

    if random_state is None:
        permute = np.random.permutation
    else:
        permute = np.random.default_rng(random_state).permutation

    null_means = np.empty(num_iterations)
    for iteration in range(num_iterations):
        # Hand a plain array to the shuffler so a copy is shuffled and the
        # caller's table is never modified.
        randomized_data = data.apply(lambda column: permute(column.to_numpy()), axis=0)
        null_dist = beta_diversity(metric, randomized_data.values.T,
                                   ids=randomized_data.columns)
        null_means[iteration] = null_dist.condensed_form().mean()

    null_mean = null_means.mean()
    # The SES must be scaled by the spread of the statistic that is actually
    # compared (the per-iteration mean distance), not by the spread of the
    # individual pairwise distances pooled over all iterations.
    null_std = null_means.std(ddof=1)
    ses = (observed_mean - null_mean) / null_std if null_std > 0 else np.nan

    _, p_value = ttest_1samp(null_means, observed_mean)
    return observed_dist, ses, p_value

# Observed distances plus the null-model SES and p-value (function defaults)
observed_dist, ses, p_value = calculate_beta_diversity_with_null_model(data=otu_df)
print(f"SES: {ses:.4f}, p-value: {p_value:.4f}")

# Wrap the square distance table in a DistanceMatrix keyed by sample ID,
# then ordinate it with PCoA
square_distances = observed_dist.to_data_frame().values
distance_matrix = DistanceMatrix(square_distances, ids=otu_df.columns)
pcoa_results = pcoa(distance_matrix)

# Distance-based redundancy analysis (db-RDA)
def perform_dbrda(distance_matrix, env_data, n_components=5, n_axes=5):
    """Regress the leading PCoA axes on PCA-reduced environmental predictors.

    Steps: ordinate ``distance_matrix`` with PCoA, dummy-encode and
    standardise ``env_data``, compress it with PCA, then fit one OLS model
    (with intercept) per retained PCoA axis. Prints the shape of the reduced
    predictor table.

    Parameters
    ----------
    distance_matrix : skbio DistanceMatrix
        Sample-by-sample distances passed to ``pcoa``.
    env_data : pandas.DataFrame
        Environmental table indexed by sample ID; only samples that also
        appear in the ordination are used.
    n_components : int, default 5
        Maximum number of principal components kept from the environmental
        table (capped at the number of encoded columns).
    n_axes : int, default 5
        Number of leading PCoA axes used as responses.

    Returns
    -------
    R2 : float
        Share of the retained axes' total sum of squares that is explained by
        the predictors, pooled over axes so that each axis counts in
        proportion to its variance. ``nan`` if the retained axes have no
        variance.
    R2_adj : float
        ``1 - (1 - R2) * (n - 1) / (n - p - 1)`` with ``n`` samples and ``p``
        principal components; ``nan`` when ``n - p - 1`` is not positive.
    """
    pcoa_axes = pcoa(distance_matrix).samples
    common_samples = pcoa_axes.index.intersection(env_data.index)
    n_samples = len(common_samples)
    pcoa_axes = pcoa_axes.loc[common_samples].iloc[:, :n_axes]
    env_data = env_data.loc[common_samples]

    # Dummy-encode categorical columns as floats (not booleans) and standardise
    env_data_dummies = pd.get_dummies(env_data, drop_first=True, dtype=float)
    env_scaled = pd.DataFrame(StandardScaler().fit_transform(env_data_dummies),
                              index=env_data_dummies.index,
                              columns=env_data_dummies.columns)

    # PCA for dimension reduction
    pca = PCA(n_components=min(n_components, env_scaled.shape[1]))
    env_data_reduced = pd.DataFrame(pca.fit_transform(env_scaled), index=env_scaled.index)
    print(f"Shape after PCA: {env_data_reduced.shape}")

    # Pool sums of squares over axes instead of averaging per-axis R² values:
    # a plain average lets a minor axis weigh as much as the dominant one and
    # turns into NaN as soon as one axis has zero variance.
    design = sm.add_constant(env_data_reduced)
    residual_ss = 0.0
    total_ss = 0.0
    for axis_name in pcoa_axes.columns:
        model = sm.OLS(pcoa_axes[axis_name], design).fit()
        residual_ss += model.ssr
        total_ss += model.centered_tss

    R2 = 1 - residual_ss / total_ss if total_ss > 0 else np.nan

    # The adjustment is undefined without residual degrees of freedom
    residual_df = n_samples - env_data_reduced.shape[1] - 1
    if residual_df > 0:
        R2_adj = 1 - (1 - R2) * (n_samples - 1) / residual_df
    else:
        R2_adj = np.nan
    return R2, R2_adj

# Run db-RDA and save results
# Only the db-RDA call is guarded: UnicodeEncodeError is a ValueError
# subclass, so reporting inside the same try block would mislabel an
# output-encoding failure as a db-RDA error.
try:
    R2, R2_adj = perform_dbrda(distance_matrix, env_factors)
except ValueError as e:
    print(f"db-RDA error: {e}")
else:
    print(f"R²: {R2:.4f}, Adjusted R²: {R2_adj:.4f}")
    # Explicit UTF-8 so the "²" labels are written identically on every platform
    results_path = os.path.join(output_folder, "db_rda_results.txt")
    with open(results_path, "w", encoding="utf-8") as f:
        f.write(
            f"SES: {ses:.4f}\n"
            f"p-value: {p_value:.4f}\n"
            f"R²: {R2:.4f}\n"
            f"Adjusted R²: {R2_adj:.4f}\n"
        )

# Plot: within-group beta diversity
# Reshape the square distance table to one row per ordered sample pair
observed_df_long = observed_dist.to_data_frame().stack().reset_index()
observed_df_long.columns = ['Sample1', 'Sample2', 'Distance']

# Label both members of every pair with their 'Depth' value
observed_df_long['Group1'] = observed_df_long['Sample1'].map(env_factors['Depth'])
observed_df_long['Group2'] = observed_df_long['Sample2'].map(env_factors['Depth'])

# Keep pairs from the same group, but drop each sample's comparison with
# itself: those distances are always 0 and would pull every box towards zero
# (and draw a box at 0 for groups that contain a single sample).
same_group = observed_df_long['Group1'] == observed_df_long['Group2']
self_pair = observed_df_long['Sample1'] == observed_df_long['Sample2']
observed_df_long = observed_df_long[same_group & ~self_pair]

plt.figure(figsize=(10, 6))
sns.boxplot(data=observed_df_long, x='Group1', y='Distance', palette='RdYlBu')
plt.ti
