## Finding the genes within the Garud's H1 and H12 windows

Here we are trying to find the genes that fall within the significant windows (based on the Garud's H statistic cut-off values) of the Garud's H statistics (H1 and H12).
When the Garud's H statistics were calculated using the scikit-allel package, we had to use the sliding-window method. The width of the window was given as a number of variants instead of a length in base pairs.
Therefore, we had to get the positions of the variants in the VCF files that were used as input to the Garud's H statistic calculation and find the starting and ending positions of the windows.

In [1]:
# Importing necessary libraries

import numpy as np
import pandas as pd
import os
import re

#### Importing the necessary data files

In [2]:
### Load the Garud's H statistics tables

# Folder that holds the H statistics CSV files
folder_path = "C:/Users/aaisu/Box/Carter Lab/Projects/Anstep_GA_Yemen/Selection_stats_12.15.2024/Garuds_Hstat/Hstat_data"

# Maps each CSV's base name (extension stripped) to the DataFrame read from it
dataframes = {}

for file_name in os.listdir(folder_path):
    # Anything that is not a CSV is ignored
    if not file_name.endswith(".csv"):
        continue

    # Key the table by the file name without its extension
    key = os.path.splitext(file_name)[0]
    dataframes[key] = pd.read_csv(os.path.join(folder_path, file_name))

# Preview: header line, first 5 rows, then a blank separator line per table
for key, df in dataframes.items():
    print(f"DataFrame from file {key}:", df.head(), "", sep="\n")


### Load the variant position tables
# Folder that holds the space-separated variant position .txt files
folder_path = "C:/Users/aaisu/Box/Carter Lab/Projects/Anstep_GA_Yemen/Selection_stats_12.15.2024/Garuds_Hstat/"

# Maps each .txt file's base name (extension stripped) to its DataFrame
variant_positions = {}

# The position files have no header row; these names are assigned on read
column_names = ['CHROM', 'POS', 'REF', 'ALT']

# Only the .txt files in the folder are position tables
position_files = [name for name in os.listdir(folder_path) if name.endswith(".txt")]

for file_name in position_files:
    file_path = os.path.join(folder_path, file_name)
    key = os.path.splitext(file_name)[0]
    variant_positions[key] = pd.read_csv(
        file_path,
        sep=' ',
        header=None,
        names=column_names,
    )

# Preview: header line, first 5 rows, then a blank separator line per table
for key, df in variant_positions.items():
    print(f"DataFrame from file {key}:", df.head(), "", sep="\n")

DataFrame from file garuds_h_statistics_Ethiopia_chr2:
         H1       H12      H123     H2/H1
0  0.073785  0.117188  0.156250  0.411765
1  0.073785  0.108507  0.144965  0.411765
2  0.091146  0.161458  0.208333  0.614286
3  0.096354  0.174479  0.223958  0.549550
4  0.072049  0.118924  0.157986  0.512048

DataFrame from file garuds_h_statistics_Ethiopia_chr3:
         H1       H12      H123     H2/H1
0  0.052951  0.078993  0.126736  0.704918
1  0.049479  0.071181  0.105903  0.780702
2  0.054688  0.072049  0.103299  0.801587
3  0.064236  0.094618  0.146701  0.668919
4  0.059028  0.083333  0.111979  0.639706

DataFrame from file garuds_h_statistics_Ethiopia_chrx:
         H1       H12      H123     H2/H1
0  0.026910  0.033854  0.039062  0.741935
1  0.032118  0.039931  0.050347  0.878378
2  0.034722  0.042535  0.058160  0.887500
3  0.038194  0.051215  0.072049  0.715909
4  0.034722  0.047743  0.061632  0.687500

DataFrame from file garuds_h_statistics_India_chr2:
        H1      H12     

### Getting the start and end positions of the variant windows on chromosomes

In this step we slide a window down the data frame that holds the chromosome positions of the variants for each chromosome. The window has a width of 1000 rows and a step size of 500 rows. These values were used because they are the window size and step size that were used to calculate the Garud's H statistics.

In [3]:
# Build, for every table in `variant_positions`, the genomic start/end position
# of each variant-count window.
# Output: `windowed_data[key]` is a DataFrame with columns Start_POS / End_POS,
# one row per window, in window order.
windowed_data = {}

# Window geometry, expressed in number of variants (rows), not base pairs
window_width = 1000
step_size = 500

for key, df in variant_positions.items():
    # Work on the POS column as one array instead of materialising a full
    # row for every window
    positions = df['POS'].to_numpy()

    # Row index of the first variant of every *complete* window. A trailing
    # window that would run past the last variant is dropped, so all windows
    # hold exactly `window_width` variants.
    # NOTE(review): this assumes the H statistics were computed with the same
    # equal-sized-window rule — confirm against the script that produced them.
    window_starts = np.arange(0, len(positions) - window_width + 1, step_size)

    # Row index of the last variant of each window (inclusive)
    window_ends = window_starts + window_width - 1

    # Building from a dict of columns keeps Start_POS / End_POS present even
    # when a chromosome has fewer than `window_width` variants (zero windows)
    windowed_data[key] = pd.DataFrame({
        'Start_POS': positions[window_starts],
        'End_POS': positions[window_ends],
    })

# Preview each windowed table
for key, df in windowed_data.items():
    print(f"Windowed DataFrame for {key}:")
    print(df.shape)  # (number of windows, number of columns)
    print(df.head())  # Display the first 5 rows
    print()


Windowed DataFrame for YmEthInd_NC_050201.1_BAFlt_Pos:
(1488, 2)
   Start_POS  End_POS
0         84   271694
1     165623   330041
2     271863   360050
3     330109   415933
4     360066   453627

Windowed DataFrame for YmEthInd_NC_050202.1_BAFlt_Pos:
(10313, 2)
   Start_POS  End_POS
0      23629   125717
1      79589   160068
2     125742   214186
3     160071   235200
4     214307   257352

Windowed DataFrame for YmEthInd_NC_050203.1_BAFlt_Pos:
(10479, 2)
   Start_POS  End_POS
0      26680   124405
1      77620   155685
2     124513   175623
3     155778   202569
4     175626   223603



### From the variant positions dictionary we need to add the start and end positions to the H statistics data frames.

In [4]:
# Attach the window start/end positions to the matching H statistics table.
# Output: `joined_data`, keyed "<hStat name>_joined_<varPos name>".
print(dataframes.keys())
print(windowed_data.keys())

# Explicit pairing of every H statistics table with the position table of the
# same chromosome (the same three position tables are reused for each population)
hStat_dfnames = pd.Series(['garuds_h_statistics_Ethiopia_chr2', 'garuds_h_statistics_Ethiopia_chr3', 'garuds_h_statistics_Ethiopia_chrx', 'garuds_h_statistics_India_chr2', 'garuds_h_statistics_India_chr3', 'garuds_h_statistics_India_chrx', 'garuds_h_statistics_Yemen_chr2', 'garuds_h_statistics_Yemen_chr3', 'garuds_h_statistics_Yemen_chrx'], name="hStat_dfs")
varPos_dfnames = pd.Series(['YmEthInd_NC_050202.1_BAFlt_Pos', 'YmEthInd_NC_050203.1_BAFlt_Pos', 'YmEthInd_NC_050201.1_BAFlt_Pos'] * 3, name="varPos_dfs")

hStat_N_varPos = pd.concat([hStat_dfnames, varPos_dfnames], axis=1)
# Show the pairing so it can be checked by eye (the bare `.head` attribute
# access that used to be here displayed nothing)
print(hStat_N_varPos)

# Initialize a dictionary to store the joined DataFrames
joined_data = {}

for hStat_key, varPos_key in zip(hStat_N_varPos['hStat_dfs'], hStat_N_varPos['varPos_dfs']):
    # Pairs whose inputs were not loaded are still skipped, but no longer silently
    if hStat_key not in dataframes or varPos_key not in windowed_data:
        print(f"Skipping {hStat_key} / {varPos_key}: table not loaded")
        continue

    df_hStat = dataframes[hStat_key]
    df_varPos = windowed_data[varPos_key]

    # The join below is purely positional (row i of the statistics is taken
    # to be window i). If the row counts differ, concat would pad with NaN
    # and pair statistics with the wrong windows, so stop instead.
    if len(df_hStat) != len(df_varPos):
        raise ValueError(
            f"{hStat_key} has {len(df_hStat)} rows but {varPos_key} has "
            f"{len(df_varPos)} windows; cannot align statistics with windows"
        )

    # Side-by-side, row-aligned concatenation (not a key-based join)
    joined_df = pd.concat([df_hStat, df_varPos], axis=1)

    # Store the joined DataFrame in the dictionary
    joined_data[f'{hStat_key}_joined_{varPos_key}'] = joined_df

# Preview each joined table
for key, df in joined_data.items():
    print(f"Joined DataFrame for {key}:")
    print(df.shape)
    print(df.head())  # Display the first 5 rows
    print(df.tail())  # Display the last 5 rows
    print()

dict_keys(['garuds_h_statistics_Ethiopia_chr2', 'garuds_h_statistics_Ethiopia_chr3', 'garuds_h_statistics_Ethiopia_chrx', 'garuds_h_statistics_India_chr2', 'garuds_h_statistics_India_chr3', 'garuds_h_statistics_India_chrx', 'garuds_h_statistics_Yemen_chr2', 'garuds_h_statistics_Yemen_chr3', 'garuds_h_statistics_Yemen_chrx'])
dict_keys(['YmEthInd_NC_050201.1_BAFlt_Pos', 'YmEthInd_NC_050202.1_BAFlt_Pos', 'YmEthInd_NC_050203.1_BAFlt_Pos'])
Joined DataFrame for garuds_h_statistics_Ethiopia_chr2_joined_YmEthInd_NC_050202.1_BAFlt_Pos:
(10313, 6)
         H1       H12      H123     H2/H1  Start_POS  End_POS
0  0.073785  0.117188  0.156250  0.411765      23629   125717
1  0.073785  0.108507  0.144965  0.411765      79589   160068
2  0.091146  0.161458  0.208333  0.614286     125742   214186
3  0.096354  0.174479  0.223958  0.549550     160071   235200
4  0.072049  0.118924  0.157986  0.512048     214307   257352
             H1       H12      H123     H2/H1  Start_POS   End_POS
10308  0.098090

### In this step we are categorizing the H1 and H12 values based on the percentile they belong to

The input will be the joined_data dictionary and this dictionary will be modified to hold the modified dataframes

In [5]:
# Function to calculate percentiles and categorize
def categorize_by_percentile(df, column_name, percentiles):
    """Label each value of ``df[column_name]`` with its percentile band.

    The cut-offs are the quantiles of the column itself at ``percentiles``.
    For the cut-offs ``[0.75, 0.90, 0.95, 0.99]`` the labels are
    ``"≤ 75%"``, ``"75-90%"``, ``"90-95%"``, ``"95-99%"`` and ``"> 99%"``;
    any other list of percentiles produces labels built the same way.
    A value equal to a cut-off belongs to the lower band.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to annotate. It is modified in place.
    column_name : str
        Numeric column to categorize.
    percentiles : list of float
        Quantile levels (fractions between 0 and 1) used as band edges.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with a new ``"<column_name>_category"``
        column. Missing (NaN) values are labelled ``"NA"`` rather than
        being counted in the top band.

    Raises
    ------
    ValueError
        If ``percentiles`` is empty.
    """
    thresholds = df[column_name].quantile(percentiles).to_dict()
    if not thresholds:
        raise ValueError("percentiles must contain at least one value")

    # Band edges in ascending order, whatever order the caller used
    ordered_levels = sorted(thresholds)
    cut_values = np.array([thresholds[level] for level in ordered_levels], dtype=float)

    # Band labels derived from the requested levels ('g' prints 75, not 75.0)
    level_text = [f"{level * 100:g}" for level in ordered_levels]
    labels = [f"≤ {level_text[0]}%"]
    labels += [f"{low}-{high}%" for low, high in zip(level_text, level_text[1:])]
    labels.append(f"> {level_text[-1]}%")

    # side="left" returns the first edge that is >= the value, i.e. the band
    # chosen by testing `value <= edge` from the lowest edge upwards
    values = df[column_name].to_numpy(dtype=float)
    band_index = np.searchsorted(cut_values, values, side="left")
    categories = np.array(labels, dtype=object)[band_index]

    # NaN compares False against every edge and would otherwise land in the
    # top band; flag it explicitly instead
    categories[np.isnan(values)] = "NA"

    # Create a new column for the category
    category_col = f"{column_name}_category"
    df[category_col] = categories

    print(thresholds)
    return df

# Percentile cut-offs that define the bands
percentiles = [0.75, 0.90, 0.95, 0.99]

# Add an H1 band column and then an H12 band column to every joined table,
# storing the result back under the same key
for key in joined_data:
    print(key)
    for stat_column in ("H1", "H12"):
        joined_data[key] = categorize_by_percentile(
            joined_data[key], stat_column, percentiles
        )

# Show every annotated table, followed by a blank separator line
for key, df in joined_data.items():
    print(f"DataFrame '{key}':", df, "", sep="\n")

garuds_h_statistics_Ethiopia_chr2_joined_YmEthInd_NC_050202.1_BAFlt_Pos
{0.75: 0.0798611111111111, 0.9: 0.1059027777777777, 0.95: 0.1293402777777778, 0.99: 0.23166666666666602}
{0.75: 0.1241319444444444, 0.9: 0.1727430555555555, 0.95: 0.2126736111111111, 0.99: 0.35666666666666597}
garuds_h_statistics_Ethiopia_chr3_joined_YmEthInd_NC_050203.1_BAFlt_Pos
{0.75: 0.0737847222222222, 0.9: 0.09375, 0.95: 0.1085069444444444, 0.99: 0.1597222222222222}
{0.75: 0.1128472222222222, 0.9: 0.1467013888888889, 0.95: 0.1744791666666667, 0.99: 0.25279513888888827}
garuds_h_statistics_Ethiopia_chrx_joined_YmEthInd_NC_050201.1_BAFlt_Pos
{0.75: 0.0850694444444444, 0.9: 0.1119791666666666, 0.95: 0.1354166666666667, 0.99: 0.1979166666666667}
{0.75: 0.1328125, 0.9: 0.1779513888888888, 0.95: 0.2135416666666666, 0.99: 0.2855902777777777}
garuds_h_statistics_India_chr2_joined_YmEthInd_NC_050202.1_BAFlt_Pos
{0.75: 0.02625, 0.9: 0.03125, 0.95: 0.03625, 0.99: 0.0525}
{0.75: 0.02875, 0.9: 0.0375, 0.95: 0.04375, 0.99:

### Tabulating the number of significant H1 and H12 windows for each chromosome for each population
The outputs were saved to two CSV files named H1_category_counts.csv and H12_category_counts.csv.

In [None]:
# Per-table category tallies: {table key: {category label: number of windows}}
H1_counts = {}
H12_counts = {}

# Fill both tallies in a single pass over the joined tables
for key, df in joined_data.items():
    for tally, category_column in ((H1_counts, 'H1_category'), (H12_counts, 'H12_category')):
        tally[key] = df[category_column].value_counts().to_dict()

# One column per table, one row per category; a category that is absent
# from a table is counted as 0
H1_counts_df = pd.DataFrame(H1_counts).fillna(0).astype(int)
H12_counts_df = pd.DataFrame(H12_counts).fillna(0).astype(int)

# Show both summary tables
print("H1 Category Counts:")
print(H1_counts_df)
print("\nH12 Category Counts:")
print(H12_counts_df)

# Write the summaries to the current working directory
H1_counts_df.to_csv("H1_category_counts.csv")
H12_counts_df.to_csv("H12_category_counts.csv")

### Finding the genes that fall within the significant windows of H1 and H12
To do this step we need the genomic positions of the An. stephensi protein coding genes. As the first step we are reading in the details of the An. stephensi PCGs from a text file

In [50]:
# Load the tab-separated table of protein-coding genes
pcg_filePath = "Anstep_PCGs_1.12.2025.tsv"

# read_table uses a tab as its default separator
PCG_det = pd.read_table(pcg_filePath)

Here we go through each data frame in the joined_data dictionary, find the genes that fall within each H statistics window, and save them in a new column of that data frame.

In [None]:
# For every joined table, list the genes that overlap each window and store
# them in a new 'Genes' column (a list of gene symbols per window).
# Output: `Hstat_W_genes`, keyed like `joined_data`. The tables are the same
# objects as in `joined_data`, so that dictionary gains the column too.
Hstat_W_genes = {}

# Process each DataFrame in the dictionary
for key, df in joined_data.items():
    # The accession is rebuilt from the 8th and 9th '_'-separated fields of
    # the key, e.g. "..._joined_YmEthInd_NC_050202.1_BAFlt_Pos" -> "NC_050202.1"
    key_fields = key.split("_")
    chrID = key_fields[7] + "_" + key_fields[8]

    # Filter PCG_det for the relevant chrID
    filtered_PCG_det = PCG_det[PCG_det['Accession'] == chrID]
    if filtered_PCG_det.empty:
        print(f"Warning: no genes found for accession {chrID} ({key})")

    # Normalise every gene to a (low, high) interval so the overlap test
    # does not depend on which of Begin/End is the smaller coordinate
    gene_intervals = pd.DataFrame({
        'gene_lo': filtered_PCG_det[['Begin', 'End']].min(axis=1),
        'gene_hi': filtered_PCG_det[['Begin', 'End']].max(axis=1),
        'Symbol': filtered_PCG_det['Symbol'],
    })

    # Every window paired with every gene on this chromosome; 'index' is the
    # window's row label in `df`.
    # NOTE: this holds (windows x genes) rows in memory at once.
    overlaps = pd.merge(
        df[['Start_POS', 'End_POS']].reset_index(),
        gene_intervals,
        how='cross',
    )

    # Two closed intervals overlap when each one starts no later than the
    # other ends. Unlike testing whether a gene endpoint lies inside the
    # window, this also keeps genes that span the whole window.
    overlaps = overlaps[
        (overlaps['gene_lo'] <= overlaps['End_POS'])
        & (overlaps['gene_hi'] >= overlaps['Start_POS'])
    ]

    # Collect the gene symbols of each window, in table order
    genes_by_window = {}
    for window_label, symbol in zip(overlaps['index'], overlaps['Symbol']):
        genes_by_window.setdefault(window_label, []).append(symbol)

    # Windows without any gene each get their own fresh empty list (a single
    # shared list object would be aliased across rows)
    df['Genes'] = pd.Series(
        [genes_by_window.get(window_label, []) for window_label in df.index],
        index=df.index,
        dtype=object,
    )

    # Store the updated DataFrame in the output dictionary
    Hstat_W_genes[key] = df


Saving the output 

In [None]:
# Destination folder for the per-table CSV files (created if missing)
output_folder = "Hstat_windows_with_genes"
os.makedirs(output_folder, exist_ok=True)

# Write one "<key>.csv" per table, leaving the row index out of the file
for key, df in Hstat_W_genes.items():
    df.to_csv(os.path.join(output_folder, key + ".csv"), index=False)

Categorizing the gene lists according to the H1 statistics values

In [None]:
# Destination folder for the per-category CSV files (created if missing)
output_folder = "Hstat_windows_with_genes"
os.makedirs(output_folder, exist_ok=True)

# {table key: {H1 category label: rows of that table in that category}}
by_H1_Category_Hstat_W_genes = {}

for key, df in Hstat_W_genes.items():
    subsets_for_table = {}

    # Categories are visited in order of first appearance in the table
    for category in df['H1_category'].unique():
        # Rows of this table that fall in the current H1 category
        subset_df = df.loc[df['H1_category'] == category]

        # Make the label safe for a file name: spaces -> '_', '%' -> 'pct',
        # then any of < > : " / \ | ? * -> '_'
        sanitized_category = category.replace(' ', '_').replace('%', 'pct')
        sanitized_category = re.sub(r'[<>:"/\\|?*]', '_', sanitized_category)

        # Write the subset next to the other outputs
        file_path = os.path.join(
            output_folder, f"{key}_H1_category_{sanitized_category}.csv"
        )
        subset_df.to_csv(file_path, index=False)

        subsets_for_table[category] = subset_df

    by_H1_Category_Hstat_W_genes[key] = subsets_for_table

# Preview the first rows of every subset
for key, category_dict in by_H1_Category_Hstat_W_genes.items():
    print(f"DataFrame: {key}")
    for category, subset_df in category_dict.items():
        print(f"Category: {category}")
        print(subset_df.head())


Categorizing the gene lists according to the H12 statistics values

In [None]:
# Destination sub-folder for the per-H12-category CSV files (created if missing)
output_folder = "Hstat_windows_with_genes/Separated_by_H12_Categories"
os.makedirs(output_folder, exist_ok=True)

# {table key: {H12 category label: rows of that table in that category}}
by_H12_Category_Hstat_W_genes = {}

for key, df in Hstat_W_genes.items():
    subsets_for_table = {}

    # Categories of the H12_category column, in order of first appearance
    for category in df['H12_category'].unique():
        # Rows of this table that fall in the current H12 category
        subset_df = df.loc[df['H12_category'] == category]

        # Make the label safe for a file name: spaces -> '_', '%' -> 'pct',
        # then any of < > : " / \ | ? * -> '_'
        sanitized_category = category.replace(' ', '_').replace('%', 'pct')
        sanitized_category = re.sub(r'[<>:"/\\|?*]', '_', sanitized_category)

        # Write the subset into the H12 sub-folder
        file_path = os.path.join(
            output_folder, f"{key}_H12_category_{sanitized_category}.csv"
        )
        subset_df.to_csv(file_path, index=False)

        subsets_for_table[category] = subset_df

    by_H12_Category_Hstat_W_genes[key] = subsets_for_table

# Preview the first rows of every subset
for key, category_dict in by_H12_Category_Hstat_W_genes.items():
    print(f"DataFrame: {key}")
    for category, subset_df in category_dict.items():
        print(f"Category: {category}")
        print(subset_df.head())


Saving the gene lists as text files

In [None]:
# Destination folder for the gene-list text files (created if missing)
output_folder = "H12_unique_genes_text_files"
os.makedirs(output_folder, exist_ok=True)

# One text file per (table, H12 category) pair
for outer_key, inner_dict in by_H12_Category_Hstat_W_genes.items():
    for inner_key, df in inner_dict.items():
        # Pool the per-window gene lists into one de-duplicated set
        unique_genes = set().union(*df['Genes'])

        # Replace < > : " / \ | ? * in both keys, join them with the fixed
        # suffix, then turn any remaining spaces into underscores
        name_parts = [
            re.sub(r'[<>:"/\\|?*]', '_', part) for part in (outer_key, inner_key)
        ]
        file_name = "_".join(name_parts + ["unique_genes.txt"]).replace(" ", "_")
        file_path = os.path.join(output_folder, file_name)

        # One gene per line, in alphabetical order
        with open(file_path, 'w') as f:
            f.writelines(f"{gene}\n" for gene in sorted(unique_genes))

        # Progress message
        print(f"Saved: {file_name}")

### Finding the genes shared between populations