### Finding the selection and fixation stats for genes found in windows under selection and fixation

In this Jupyter notebook we look at each gene that found in the genomic regions that had significantly high (above 99th percentile) Fst values and Garuds H12 statistics. Here, we will be calculating Hudson's Fst value, Garuds H12 and Tajima's D values for each gene separately using scikit-allel python library.

The following script will read a csv file to get the locaions of each gene and will calculate the stats.

In [1]:
# Importing necessary python libraries

import allel
import numpy as np
import zarr
import os
import csv
import pandas as pd


Saving paths to .ZARR files in list

In [2]:
# providing path to the zarr files
zarrPaths = ['/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/Zarr_files/YmEthInd_allChrs_fltpass.zarr',
            '/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/Zarr_files/YmEthInd_Chrx_BAP.zarr',
            '/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/Zarr_files/YmEthInd_Chr2_BAP.zarr',
            '/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/Zarr_files/YmEthInd_Chr3_BAP.zarr',]

zarrPaths[0]

'/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/Zarr_files/YmEthInd_allChrs_fltpass.zarr'

### Defining the indices of samples in each population

In [3]:
# Define populations (indices of individuals in each population)
India_indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]  # Replace with indices of population 1
Ethiopia_indices = [20, 21, 22, 23, 24, 25, 26, 27, 37, 38, 39, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55]  # Replace with indices of population 2
Yemen_indices = [28, 29, 30, 31, 32, 33, 34, 35, 36, 40, 41, 42]  # Replace with indices of population 3


### Defining a function to calculate Fst

In [None]:
def calculate_fst_h12_from_zarr(zarr_file, start, end, pop1_indices, pop2_indices):
    """
    Calculate Fst using the Weir & Cockerham method for a specific region.

    Parameters:
        zarr_file (str): Path to the .zarr file containing genotype data.
        start (int): Start position of the region (0-based).
        end (int): End position of the region (1-based).
        pop1_indices (list of int): Indices of samples belonging to population 1.
        pop2_indices (list of int): Indices of samples belonging to population 2.

    Returns:
        float: Average Fst value for the selected region.
    """

    # Load the genotype data from the Zarr file
    callset = zarr.open_group(zarr_file, mode='r')

    # Extract variant positions
    positions = allel.SortedIndex(callset['variants/POS'][:])

    # Identify variant indices within the given region
    region_mask = (positions >= start) & (positions <= end)
    variant_indices = np.where(region_mask)[0]

    if len(variant_indices) == 0:
        print(f"No variants found in the region {start}-{end}. Returning NaN.")
        return np.nan

    # Extract genotype data for the selected region
    try:
        genotype_array = allel.GenotypeArray(callset['calldata/GT'][:])  # Ensure 3D format
        genotype_region = genotype_array[variant_indices]  # Subset for selected region
    except Exception as e:
        print(f"Error loading genotype data: {e}")
        return np.nan

    # Validate genotype shape
    if genotype_region.ndim != 3 or genotype_region.shape[2] != 2:
        print(f"Unexpected genotype shape: {genotype_region.shape}. Returning NaN.")
        return np.nan

    # Compute allele counts for each population
    ac1 = genotype_region.count_alleles(subpop=pop1_indices)
    ac2 = genotype_region.count_alleles(subpop=pop2_indices)

    # Ensure non-empty allele counts
    if ac1.shape[0] == 0 or ac2.shape[0] == 0:
        print("Empty allele counts for the region. Returning NaN.")
        return [np.nan, "0"]

    # Calculate Hudson's Fst
    fst_hudson, _, _, _ = allel.average_hudson_fst(ac1, ac2, len(variant_indices)) # Can change between Hudson and WC 1984

    # Return average Fst value for the region
    #return np.nanmean(fst_wc)  # Use np.nanmean to avoid NaN issues
    #return [fst_hudson, str(len(variant_indices))]

    # **Calculate Garud's H12 statistic**
    try:
        # Extract haplotypes from the genotype array
        haplotypes = genotype_region.to_haplotypes()

        # Compute haplotype frequencies
        h12_results = allel.garud_h(haplotypes)

        # Extract the H12 statistic
        h12_stat = h12_results[1]  # H12 is the second value in the returned tuple
    except Exception as e:
        print(f"Error calculating H12: {e}")
        h12_stat = np.nan

    # Return Fst, H12, and the number of variants in the region
    return [fst_hudson, h12_stat, str(len(variant_indices))]


In [16]:
# Method testing block
# Calculating Fst value
stats_values = calculate_fst_h12_from_zarr('/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/Zarr_files/YmEthInd_Chr2_BAP.zarr', 67271604, 67277466, Ethiopia_indices, India_indices)
print(stats_values)


[0.5696539238514641, 0.6195790816326533, '159']


### Writing a for loop to iterate over the CSV files with gene details and calculate Fst values


In [19]:
# Giving path to the directory with CSV files
csv_dir = "/data/gunarathnai/Ans_GA/gatk_variants/YmEthSmlInd_variants/Zarr_files/Det_genes_fixedNselected_filtered"

# Getting the names of the csv files
csv_files = [file for file in os.listdir(csv_dir) if file.endswith('.csv')]

# Iterating over the CSV files
for file_name in csv_files:
    file_path = os.path.join(csv_dir, file_name)
    csvdf = pd.read_csv(file_path)
    pop1 = file_name.split("_")[1]
    pop2 = file_name.split("_")[3]
    chrname = file_name.split("_")[4]
    chrname = chrname.split(".")[0]
    print("Population 1 - " + pop1 + " Population 2 - " + pop2 + " Chromosome name - " + chrname)
    
    # Creating a new list to hold new row list with Fst values and number of variants in the gene
    new_rows_list = []
    
    # Getting column names of the data frame
    df_col_names = csvdf.columns
    
    # Iterating over the rows of the data frame
    for index, row in csvdf.iterrows():
        row_list = row.to_list()
        start_position = row_list[1]
        end_position = row_list[2]
        pop1_sample_indices = []  # Indices of Population 1
        pop2_sample_indices = []  # Indices of Population 2
        
        # Setting the population 1
        if pop1 == "Yemen":
            pop1_sample_indices = Yemen_indices
        elif pop1 == "Ethiopia":
            pop1_sample_indices = Ethiopia_indices
        elif pop1 == "India":
            pop1_sample_indices = India_indices
        elif pop1 == "YemenNEthiopia":
            pop1_sample_indices = Yemen_indices + Ethiopia_indices
        
        # Setting the population 2
        if pop2 == "Yemen":
            pop2_sample_indices = Yemen_indices
        elif pop2 == "Ethiopia":
            pop2_sample_indices = Ethiopia_indices
        elif pop2 == "India":
            pop2_sample_indices = India_indices
        elif pop2 == "YemenNEthiopia":
            pop2_sample_indices = Yemen_indices + Ethiopia_indices
        
        # Picking the correct zarr path based on the chromosome
        if chrname == "ChrX":
            zarr_path = zarrPaths[1]
        elif chrname == "Chr2":
            zarr_path = zarrPaths[2]
        elif chrname == "Chr3":
            zarr_path = zarrPaths[3]
        
        # Calculating Fst value
        stats_values = calculate_fst_h12_from_zarr(zarr_path, start_position, end_position, pop1_sample_indices, pop2_sample_indices)
        
        # adding the Fst value and number of variants to the row list
        if isinstance(stats_values, list) and len(stats_values) == 3:
            row_list = row_list + stats_values
        else:
            print(f"Unexpected return type from function: {type(stats_values)}. Fixing...")
            row_list = row_list + [np.nan, np.nan, "0"]  # Ensure consistent list size
        
        # adding the new row to new row list
        new_rows_list.append(row_list)
    
    # Creating a data frame from the new rows list
    new_df = pd.DataFrame(new_rows_list, columns=list(df_col_names) + ["Fst_value", "H12_value", "number_of_variants"])
    # print(new_df.head)
    
    # Saving the new data frame as 
    new_df.to_csv(file_name.split(".")[0] + "_WFstNH12.csv", index=False)
        




Population 1 - Ethiopia Population 2 - India Chromosome name - Chr3


  vj = np.array(vj)


Population 1 - Ethiopia Population 2 - Yemen Chromosome name - Chr2
Population 1 - YemenNEthiopia Population 2 - India Chromosome name - Chr3


  vj = np.array(vj)


Population 1 - Yemen Population 2 - India Chromosome name - ChrX
Population 1 - YemenNEthiopia Population 2 - India Chromosome name - Chr2


  vj = np.array(vj)


Population 1 - Ethiopia Population 2 - Yemen Chromosome name - Chr3


  vj = np.array(vj)


Population 1 - YemenNEthiopia Population 2 - India Chromosome name - ChrX
Population 1 - Yemen Population 2 - India Chromosome name - Chr3


  vj = np.array(vj)


Population 1 - YemenNEthiopia Population 2 - India Chromosome name - ChrX
Population 1 - YemenNEthiopia Population 2 - India Chromosome name - Chr3


  vj = np.array(vj)
  vj = np.array(vj)


Population 1 - Yemen Population 2 - India Chromosome name - Chr2


  vj = np.array(vj)


Population 1 - Ethiopia Population 2 - Yemen Chromosome name - ChrX
No variants found in the region 7933935-7939243. Returning NaN.
Unexpected return type from function: <class 'float'>. Fixing...
Population 1 - Ethiopia Population 2 - India Chromosome name - ChrX
Population 1 - YemenNEthiopia Population 2 - India Chromosome name - Chr2


  vj = np.array(vj)


Population 1 - Ethiopia Population 2 - India Chromosome name - Chr2


  vj = np.array(vj)


Population 1 - Ethiopia Population 2 - Yemen Chromosome name - Chr2


  vj = np.array(vj)


Population 1 - Ethiopia Population 2 - Yemen Chromosome name - ChrX
Population 1 - Ethiopia Population 2 - Yemen Chromosome name - Chr3


  vj = np.array(vj)
  vj = np.array(vj)
