# Script: Beta Diversity Null Model Analysis

## Purpose

This script performs beta diversity analysis for microbial community data, grouped by sample name prefixes (e.g., A, B, C). It calculates observed beta diversity, generates null distributions through randomization, computes standardized effect sizes (SES), and evaluates statistical significance using a one-sample t-test.

## Workflow

1. Load OTU (relative abundance) table from CSV.
2. Identify sample groups from column names.
3. For each group:
   - Calculate observed beta diversity (Bray-Curtis).
   - Generate null distributions by independently permuting the abundance values within each sample.
   - Compute SES and p-values.
   - Output results and generate SES visualizations.

## Output

- `*_Observed_BetaDiversity.csv`: Pairwise beta diversity matrix for each group.
- `*_SES_Distribution.png`: Bar plot of SES per group.
- `SES_Results.csv`: Summary table with SES and p-values.
- `SES_Distribution.png`: Group-wise SES comparison.

## Parameters

- **Metric**: `braycurtis` (can be changed to `jaccard`, etc.)
- **Iterations**: 100 (null model resampling)

## Dependencies

- Python: `numpy`, `pandas`, `matplotlib`, `scikit-bio`, `scipy`


## 🧪 Module: Beta Diversity Null Model Analysis

This Python script performs null model-based beta diversity analysis using Bray-Curtis distance. It estimates standardized effect sizes (SES) and tests whether observed beta diversity deviates significantly from a randomized null distribution.

### Features

- Calculates observed beta diversity within groups
- Randomizes OTU table to simulate null expectations
- Computes SES values and p-values
- Generates visualizations for group-wise comparisons

### Run the script

```bash
python scripts/beta_diversity_null_model.py
```

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from skbio.diversity import beta_diversity
from scipy.stats import ttest_1samp

# File paths
# NOTE(review): both paths start with '..' followed directly by a name, which
# looks like a lost path separator — confirm before running.
input_file = r'..grelative.xls.csv'
output_folder = r'..BetaDiversityResults/'
os.makedirs(output_folder, exist_ok=True)

# Read OTU table and use its first column as the row index
otu_df = pd.read_csv(input_file)
otu_df = otu_df.set_index(otu_df.columns[0])

# Extract group identifiers from sample names (e.g., A1, A2 → A)
samples = otu_df.columns
groups = pd.Series(samples.str.extract(r'^([A-Za-z]+)')[0].values, index=samples)

# Sample names that do not start with a letter produce NaN here. NaN never
# compares equal to itself, so such a "group" would end up as an empty table
# under a NaN key; report and skip those samples instead.
unassigned_samples = groups.index[groups.isna()]
if len(unassigned_samples) > 0:
    print(f"Skipping samples without a letter prefix: {list(unassigned_samples)}")
assigned_groups = groups.dropna()

# Organize data by group, keeping groups in order of first appearance
grouped_data = {
    group: otu_df.loc[:, assigned_groups.index[assigned_groups == group]]
    for group in assigned_groups.unique()
}

# Function to calculate null model corrected beta diversity and SES
def calculate_beta_diversity_with_stats(group_data, group_name, metric="braycurtis", num_iterations=100,
                                        random_state=None):
    """
    Compute observed beta diversity, a permutation null model, the standardized
    effect size (SES) and a one-sample t-test p-value for one group.

    The statistic is the mean pairwise dissimilarity between the group's samples.
    Each null iteration shuffles the values within every sample (column)
    independently and recomputes that mean.

    Parameters
    ----------
    group_data : pandas.DataFrame
        Abundance table for one group; rows are features, columns are samples
        (the table is transposed before being handed to ``beta_diversity``).
    group_name : hashable
        Label of the group. Accepted for interface compatibility; not used in
        the computation.
    metric : str, default "braycurtis"
        Metric name passed through to ``beta_diversity``.
    num_iterations : int, default 100
        Number of null-model randomizations.
    random_state : int, numpy.random.Generator or None, default None
        Seed or generator for the shuffling. When None, the global NumPy
        random state is used (so ``np.random.seed`` still controls it).

    Returns
    -------
    observed_df : pandas.DataFrame
        Square matrix of observed pairwise dissimilarities.
    ses : float
        ``(observed_mean - mean(null_means)) / std(null_means)``; NaN when it
        cannot be computed (fewer than two samples, fewer than two iterations,
        or a null distribution with no spread).
    p_value : float
        p-value of ``ttest_1samp(null_means, observed_mean)``; NaN in the same
        degenerate cases.
    """
    sample_ids = group_data.columns
    counts = group_data.to_numpy().T  # one row per sample

    # Calculate observed beta diversity
    observed_dist = beta_diversity(metric, counts=counts, ids=sample_ids)
    observed_df = observed_dist.to_data_frame()

    n_samples = counts.shape[0]
    if n_samples < 2 or num_iterations < 2:
        # No sample pair to average, or too few replicates to estimate a spread.
        return observed_df, np.nan, np.nan

    # Use each unordered pair once: the diagonal is always zero and the lower
    # triangle mirrors the upper one, so the full matrix would bias the mean.
    pair_index = np.triu_indices(n_samples, k=1)
    observed_mean = observed_df.to_numpy()[pair_index].mean()

    if random_state is None:
        permute = np.random.permutation
    else:
        permute = np.random.default_rng(random_state).permutation

    # Generate null distribution of the mean pairwise dissimilarity
    null_means = np.empty(num_iterations)
    for iteration in range(num_iterations):
        # permutation() returns a shuffled copy, so `counts` is left untouched
        shuffled_counts = np.array([permute(sample_counts) for sample_counts in counts])
        null_dist = beta_diversity(metric, counts=shuffled_counts, ids=sample_ids)
        null_means[iteration] = null_dist.to_data_frame().to_numpy()[pair_index].mean()

    # Calculate SES against the spread of the same statistic (per-iteration
    # means), which is also the sample the t-test below is run on.
    null_mean = null_means.mean()
    null_std = null_means.std(ddof=1)
    if null_std > 0:
        ses = (observed_mean - null_mean) / null_std
    else:
        ses = np.nan  # undefined when every null replicate gives the same mean

    # One-sample t-test of the null means against the observed mean
    _, p_value = ttest_1samp(null_means, observed_mean)
    return observed_df, ses, p_value

# Run the analysis group by group, collecting one summary row per group
results = {}

for group_name, group_data in grouped_data.items():
    print(f"Processing group: {group_name}")

    # Observed matrix plus SES / p-value from the null model
    observed_df, ses, p_value = calculate_beta_diversity_with_stats(group_data, group_name)
    print(f"Group {group_name}: SES = {ses}, p-value = {p_value}")

    # Write the observed pairwise matrix for this group
    observed_df.to_csv(os.path.join(output_folder, f"{group_name}_Observed_BetaDiversity.csv"))

    # Keep the summary statistics for the combined table
    results[group_name] = {"SES": ses, "p-value": p_value}

    # Single-bar figure showing this group's SES
    plt.figure()
    plt.bar(['SES'], [ses])
    plt.title(f'SES Value for {group_name}')
    plt.ylabel('Standardized Effect Size (SES)')
    plt.savefig(os.path.join(output_folder, f"{group_name}_SES_Distribution.png"))
    plt.close()

# Combined table: one row per group, columns "SES" and "p-value"
results_file = os.path.join(output_folder, 'SES_Results.csv')
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.to_csv(results_file)

# Bar chart comparing SES across all groups
plt.figure()
results_df['SES'].plot(kind='bar', legend=False)
plt.title('SES Values by Group')
plt.xlabel('Group')
plt.ylabel('Standardized Effect Size (SES)')
plt.savefig(os.path.join(output_folder, 'SES_Distribution.png'))
plt.show()

print(f"All results have been saved to: {results_file}")
