# Performing selection analysis on all chromosomes using variants saved in Zarr files
As a first step, we import the necessary libraries and define the paths to the Zarr files.

In [2]:
# Importing libraries
import allel
import zarr
import numcodecs
import pandas as pd

In [3]:
# Locations of the Zarr stores; every store sits in the same directory
zarr_dir = '/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/Zarr_files'
allChrs_zarr_path = zarr_dir + '/YmEthInd_allChrs_fltpass.zarr'
chrx_zarr_path = zarr_dir + '/YmEthInd_Chrx_BAP.zarr'
chr2_zarr_path = zarr_dir + '/YmEthInd_Chr2_BAP.zarr'
chr3_zarr_path = zarr_dir + '/YmEthInd_Chr3_BAP.zarr'

Opening zarr files

In [3]:
# Open each per-chromosome Zarr store read-only. The path variables defined in
# the previous cell are used so that the notebook does not depend on the
# kernel's current working directory.
chrx_callset = zarr.open_group(chrx_zarr_path, mode='r')
chr2_callset = zarr.open_group(chr2_zarr_path, mode='r')
chr3_callset = zarr.open_group(chr3_zarr_path, mode='r')

# Only the last expression of a cell is rendered, so a single group is shown
chr3_callset

<zarr.hierarchy.Group '/' read-only>

In [10]:
# chrx_callset.tree(expand=True)

In [12]:
# Wrap the GT call data of every chromosome in a GenotypeArray
# (GT is laid out as variants x samples x ploidy)
chrx_genotypes, chr2_genotypes, chr3_genotypes = (
    allel.GenotypeArray(callset['calldata/GT'])
    for callset in (chrx_callset, chr2_callset, chr3_callset)
)

# Sample names, read from the chrX call set; used to work out which
# column belongs to which population
samples = chrx_callset['samples'][:]
samples

array(['SRR15257906', 'SRR15257907', 'SRR15257908', 'SRR15257909',
       'SRR15257910', 'SRR15257911', 'SRR15257912', 'SRR15257913',
       'SRR15257914', 'SRR15257915', 'SRR15293885', 'SRR15293886',
       'SRR15293887', 'SRR15293888', 'SRR15293889', 'SRR15293890',
       'SRR15293891', 'SRR15293892', 'SRR15293893', 'SRR15293894',
       'X1296', 'X1307', 'X1402', 'X1403', 'X1404', 'X1408', 'X1409',
       'X1410', 'X1415', 'X1416', 'X1417', 'X1419', 'X1420', 'X1421',
       'X1423', 'X1424', 'X1425', 'X1580', 'X1581', 'X1583', 'X1585',
       'X1586', 'X1587', 'X1604', 'X1605', 'X1673', 'X1676', 'X1679',
       'X1680', 'X1735', 'X1736', 'X1738', 'X1740', 'X1742', 'X1743',
       'X1747'], dtype=object)

# Extract Population Data
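Garud's H statistics are computed on haplotypes, so individuals must first be grouped by population and (ideally phased) haplotype data extracted for each group. The cell below assumes predefined population groupings, given as the indices of each population's samples in the sample list above.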

In [13]:
# Column (sample) indices of the individuals that make up each population
India_indices = list(range(0, 20))
Ethiopia_indices = [*range(20, 28), *range(37, 40), *range(43, 56)]
Yemen_indices = [*range(28, 37), *range(40, 43)]

# Per-population genotype subsets: pick the population's samples along axis 1
(chrx_genotypes_India,
 chr2_genotypes_India,
 chr3_genotypes_India) = (
    gt.take(India_indices, axis=1)
    for gt in (chrx_genotypes, chr2_genotypes, chr3_genotypes)
)

(chrx_genotypes_Ethiopia,
 chr2_genotypes_Ethiopia,
 chr3_genotypes_Ethiopia) = (
    gt.take(Ethiopia_indices, axis=1)
    for gt in (chrx_genotypes, chr2_genotypes, chr3_genotypes)
)

(chrx_genotypes_Yemen,
 chr2_genotypes_Yemen,
 chr3_genotypes_Yemen) = (
    gt.take(Yemen_indices, axis=1)
    for gt in (chrx_genotypes, chr2_genotypes, chr3_genotypes)
)

# Convert Genotypes to Haplotypes
Garud's H statistics require haplotype data. Convert diploid genotypes into haplotypes.

In [14]:
# Turn every per-population genotype array into a haplotype array.
# NOTE(review): if the calls are unphased, the allele order within a sample is
# presumably arbitrary -- confirm the call set is phased before interpreting
# haplotype-based statistics.
(chrx_haplotypes_India,
 chr2_haplotypes_India,
 chr3_haplotypes_India) = (
    gt.to_haplotypes()
    for gt in (chrx_genotypes_India, chr2_genotypes_India, chr3_genotypes_India)
)

(chrx_haplotypes_Ethiopia,
 chr2_haplotypes_Ethiopia,
 chr3_haplotypes_Ethiopia) = (
    gt.to_haplotypes()
    for gt in (chrx_genotypes_Ethiopia, chr2_genotypes_Ethiopia, chr3_genotypes_Ethiopia)
)

(chrx_haplotypes_Yemen,
 chr2_haplotypes_Yemen,
 chr3_haplotypes_Yemen) = (
    gt.to_haplotypes()
    for gt in (chrx_genotypes_Yemen, chr2_genotypes_Yemen, chr3_genotypes_Yemen)
)

# Calculate Garud's H Statistics
Use `allel.garud_h` to calculate Garud's H statistics over a whole haplotype array (variants x haplotypes). To calculate the statistics in sliding windows along the genome, use `allel.moving_garud_h` instead. Reference: https://scikit-allel.readthedocs.io/en/stable/stats/selection.html#haplotype-diversity-garud-s-h-statistics

In [15]:
# Populations and chromosomes to analyse
populations = ["India", "Ethiopia", "Yemen"]
chromosomes = ["chrx", "chr2", "chr3"]

# Sliding-window parameters handed to allel.moving_garud_h as size/step.
# Both are expressed in number of variants (not base pairs).
window_size = 1000
step_size = 500

# Explicit (population, chromosome) -> haplotype array lookup table.
# This replaces fetching variables by a constructed name through locals(),
# which is fragile and stops working if this code is moved into a function.
haplotypes_by_pop_chrom = {
    ("India", "chrx"): chrx_haplotypes_India,
    ("India", "chr2"): chr2_haplotypes_India,
    ("India", "chr3"): chr3_haplotypes_India,
    ("Ethiopia", "chrx"): chrx_haplotypes_Ethiopia,
    ("Ethiopia", "chr2"): chr2_haplotypes_Ethiopia,
    ("Ethiopia", "chr3"): chr3_haplotypes_Ethiopia,
    ("Yemen", "chrx"): chrx_haplotypes_Yemen,
    ("Yemen", "chr2"): chr2_haplotypes_Yemen,
    ("Yemen", "chr3"): chr3_haplotypes_Yemen,
}

for pop in populations:
    for chrom in chromosomes:

        # Haplotypes of the current population on the current chromosome
        haplotypes_pop = haplotypes_by_pop_chrom[(pop, chrom)]

        # Garud's H statistics in sliding windows; each returned array holds
        # one value per window
        h1_values, h12_values, h123_values, h2_h1_values = allel.moving_garud_h(
            haplotypes_pop,
            size=window_size,
            step=step_size
        )

        # One row per window, one column per statistic
        df = pd.DataFrame({
            'H1': h1_values,
            'H12': h12_values,
            'H123': h123_values,
            'H2/H1': h2_h1_values
        })

        # One CSV per population/chromosome pair, written to the working directory
        df.to_csv(f"garuds_h_statistics_{pop}_{chrom}.csv", index=False)