# Finding genes within significant Fst windows
Here we use the Fst values obtained from the pairwise Fst analysis of GATK variants from the populations of Yemen, Ethiopia and India. The pairwise Fst values were calculated using the Weir and Cockerham (1984) method.
As the first step we will find the windows with significantly high Fst values. Then we will find the genes that fall within those windows.

In [1]:
# Importing libraries
import os
import pandas as pd
import numpy as np
import re

First we need to import the data files from "C:\Users\aaisu\Box\Carter Lab\Projects\Anstep_GA_Yemen\BA_Phased_Fst_12.14.2024" into a dictionary of pandas data frames.

In [2]:
import pandas as pd
import os

def read_tab_delimited_files(directory_path, dict_name):
    """
    Read every ``.fst`` file in a directory into a dictionary of DataFrames.

    Each file is parsed as tab-delimited text. The dictionary key is built from
    the second and third underscore-separated fields of the file name (for
    example ``..._allChrs_BA_Phased...`` becomes ``allChrs_BA``); two files that
    share those fields overwrite each other. The finished dictionary is bound
    to ``dict_name`` in this module's global namespace.

    Problems are reported with a printed message rather than raised: a file
    that cannot be parsed, or whose name has fewer than three
    underscore-separated fields, is skipped, and a directory that cannot be
    listed yields an empty dictionary. The global is assigned in every case so
    that a later lookup of ``dict_name`` does not fail with a KeyError.

    Args:
        directory_path (str): Path to the directory containing the ``.fst`` files.
        dict_name (str): Name of the global variable that receives the
            dictionary of DataFrames.

    Returns:
        None
    """
    data_frames = {}

    try:
        filenames = os.listdir(directory_path)
    except OSError as e:
        print(f"An error occurred: {e}")
        filenames = []

    for filename in filenames:
        # Only the windowed Fst result files are of interest
        if not filename.endswith(".fst"):
            continue

        # The DataFrame name comes from fields 1 and 2 of the file name
        name_parts = filename.split("_")
        if len(name_parts) < 3:
            print(f"An error occurred: cannot derive a DataFrame name from {filename}")
            continue
        dfname = name_parts[1] + "_" + name_parts[2]

        file_path = os.path.join(directory_path, filename)
        try:
            # pandas parser errors (ParserError, EmptyDataError) derive from ValueError
            df = pd.read_csv(file_path, sep="\t")
        except (OSError, ValueError) as e:
            # One unreadable file must not discard the files already loaded
            print(f"An error occurred: {e}")
            continue

        data_frames[dfname] = df
        print(f"Successfully read file: {filename}")

    # Assign the resulting dictionary to the given variable name in the global scope
    globals()[dict_name] = data_frames

for folders in ["Ethiopia_vs_India", "Ethiopia_vs_Yemen", "Yemen_vs_India", "YemenNEthiopia_vs_India"]:
    # Nothing is loaded unless this code is running as the main program
    if __name__ != "__main__":
        continue

    # The comparison folder name is used both as the input directory and as
    # the name of the global variable that receives its DataFrame dictionary
    directory = dictionary_name = folders
    read_tab_delimited_files(directory, dictionary_name)

    # Fetch the dictionary that the reader bound in the global namespace
    data_frames_dict = globals()[dictionary_name]

    # Preview the first rows of every DataFrame that was loaded
    for file_name, df in data_frames_dict.items():
        print(f"\nData from file: {file_name}")
        print(df.head())


Successfully read file: EthiopianSampleNames.txtIndianSampleNames.txtYmEthInd_allChrs_BA_Phased.vcf.gz.vcf.gz_Fst.windowed.weir.fst
Successfully read file: EthiopianSampleNames.txtIndianSampleNames.txtYmEthInd_NC_050201.1_BA_Phased.vcf.gz.vcf.gz_Fst.windowed.weir.fst
Successfully read file: EthiopianSampleNames.txtIndianSampleNames.txtYmEthInd_NC_050202.1_BA_Phased.vcf.gz.vcf.gz_Fst.windowed.weir.fst
Successfully read file: EthiopianSampleNames.txtIndianSampleNames.txtYmEthInd_NC_050203.1_BA_Phased.vcf.gz.vcf.gz_Fst.windowed.weir.fst

Data from file: allChrs_BA
         CHROM  BIN_START  BIN_END  N_VARIANTS  WEIGHTED_FST  MEAN_FST
0  NC_050201.1          1   100000         145      0.161076  0.094346
1  NC_050201.1      50001   150000         241      0.170661  0.113275
2  NC_050201.1     100001   200000         314      0.276383  0.149935
3  NC_050201.1     150001   250000         318      0.317534  0.149758
4  NC_050201.1     200001   300000         364      0.161103  0.092637

Data 

## Finding the genes that lie within each Fst window

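In this step we find the genes that lie within each Fst window. A gene is assigned to a window only when its whole span (Begin to End) falls inside the window on the same chromosome; genes that merely straddle a window boundary are not assigned to that window.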

In [3]:
# Load the tab-separated gene details table into a DataFrame

pcg_filePath = "Anstep_PCGs_1.12.2025.tsv"

PCG_det = pd.read_table(pcg_filePath)

In [4]:
def map_genes_to_fst_windows(fst_dicts, gene_df):
    """
    Add a 'Genes' column to every Fst DataFrame listing the genes inside each window.

    A gene is assigned to a window when its 'Accession' equals the window's
    'CHROM' and its whole span lies inside the window
    (``Begin >= BIN_START`` and ``End <= BIN_END``). Genes that only partly
    overlap a window are not listed for it. Symbols are listed in the order
    in which the genes appear in ``gene_df``.

    The DataFrames in ``fst_dicts`` are modified in place. A DataFrame that
    cannot be processed (for example because a window column is missing) is
    reported with a printed message and left without a 'Genes' column; the
    remaining DataFrames are still processed.

    Args:
        fst_dicts (dict): Dictionary of DataFrames containing Fst windows, with
            columns 'CHROM', 'BIN_START' and 'BIN_END'.
        gene_df (pd.DataFrame): DataFrame with gene details, including columns
            'Accession', 'Begin', 'End', and 'Symbol'.

    Returns:
        dict: The same dictionary, whose DataFrames now carry a 'Genes' column
        containing lists of gene symbols.

    Raises:
        ValueError: If ``gene_df`` lacks any of the required columns.
    """
    # 'Accession' is needed for the chromosome match, so validate it up front
    # instead of letting every DataFrame fail with a KeyError later on.
    required_columns = {'Accession', 'Begin', 'End', 'Symbol'}
    if not required_columns.issubset(gene_df.columns):
        raise ValueError(f"The gene DataFrame must contain the columns: {required_columns}")

    # Partition the genes by accession once so that each window is compared
    # only against the genes of its own chromosome.
    genes_by_chrom = {
        chrom: (
            chrom_genes['Begin'].to_numpy(),
            chrom_genes['End'].to_numpy(),
            chrom_genes['Symbol'].to_numpy(),
        )
        for chrom, chrom_genes in gene_df.groupby('Accession', sort=False)
    }

    def genes_in_window(chrom, bin_start, bin_end):
        """Return the symbols of the genes fully contained in one window."""
        chrom_entry = genes_by_chrom.get(chrom)
        if chrom_entry is None:
            return []
        begins, ends, symbols = chrom_entry
        return symbols[(begins >= bin_start) & (ends <= bin_end)].tolist()

    for fst_name, fst_df in fst_dicts.items():
        try:
            gene_lists = [
                genes_in_window(chrom, bin_start, bin_end)
                for chrom, bin_start, bin_end in zip(
                    fst_df['CHROM'], fst_df['BIN_START'], fst_df['BIN_END']
                )
            ]
            # An explicit object Series keeps one list per row, also when the
            # DataFrame has no rows at all.
            fst_df['Genes'] = pd.Series(gene_lists, index=fst_df.index, dtype=object)
            print(f"Processed Fst DataFrame: {fst_name}")
        except Exception as e:
            # Best effort: report the failing DataFrame and carry on with the rest
            print(f"An error occurred while processing {fst_name}: {e}")

    return fst_dicts

# Running the function on the data frames in dictionaries
# NOTE: the loop variable is not called `dict`, because at top level that would
# rebind the built-in `dict` type to a string for all code that runs afterwards.
for comparison_name in ["Ethiopia_vs_India", "Ethiopia_vs_Yemen", "Yemen_vs_India", "YemenNEthiopia_vs_India"]:
    if __name__ == "__main__":

        # Annotate every Fst DataFrame of this comparison with its genes and
        # store the result back under the same global name
        globals()[comparison_name] = map_genes_to_fst_windows(globals()[comparison_name], PCG_det)

        # Access the DataFrame dictionary dynamically
        data_frames_dict = globals()[comparison_name]

        # Display the first few rows of each DataFrame
        for file_name, df in data_frames_dict.items():
            print(f"\nData from file: {file_name}")
            print(df.head())


Processed Fst DataFrame: allChrs_BA
Processed Fst DataFrame: NC_050201.1
Processed Fst DataFrame: NC_050202.1
Processed Fst DataFrame: NC_050203.1

Data from file: allChrs_BA
         CHROM  BIN_START  BIN_END  N_VARIANTS  WEIGHTED_FST  MEAN_FST  \
0  NC_050201.1          1   100000         145      0.161076  0.094346   
1  NC_050201.1      50001   150000         241      0.170661  0.113275   
2  NC_050201.1     100001   200000         314      0.276383  0.149935   
3  NC_050201.1     150001   250000         318      0.317534  0.149758   
4  NC_050201.1     200001   300000         364      0.161103  0.092637   

                                               Genes  
0  [LOC118511281, LOC118515097, LOC118513555, LOC...  
1  [LOC118513625, LOC118513494, LOC118511019, LOC...  
2  [LOC118513329, LOC118512064, LOC118512341, LOC...  
3  [LOC118510517, LOC118512672, LOC118511740, LOC...  
4  [LOC118511740, LOC118511794, LOC118515414, LOC...  

Data from file: NC_050201.1
         CHROM  BIN_S

### Finding the weighted Fst windows with significant Fst values

In this step we compute the 75th, 90th, 95th and 99th percentiles of the weighted Fst values in each data frame, and add a column to each data frame that records which percentile band each window's Fst value falls into.

In [5]:
def add_percent_category(data_frames):
    """
    Label each row of every DataFrame with the percentile band of its 'WEIGHTED_FST' value.

    For each DataFrame the 75th, 90th, 95th and 99th percentiles of
    'WEIGHTED_FST' are computed and printed, and a new column
    'percent_category' is written in place. A value gets the label of the
    highest band whose lower bound it reaches; values that reach none of the
    bounds are labelled 'below_75_pct'. A DataFrame that cannot be processed
    is reported with a printed message and skipped.

    Args:
        data_frames (dict): Dictionary where values are pandas DataFrames.

    Returns:
        None
    """
    # Lower percentile bound of each band paired with its label, highest first
    bands = (
        (99, 'above_99_pct'),
        (95, 'bw_95N99_pct'),
        (90, 'bw_90N95_pct'),
        (75, 'bw_75N90_pct'),
    )

    for name, df in data_frames.items():
        try:
            fst_values = df['WEIGHTED_FST']

            # Thresholds for this DataFrame, keyed by percentile
            percentiles = {pct: fst_values.quantile(pct / 100) for pct in (75, 90, 95, 99)}
            print(percentiles)

            def categorize(value):
                # Walk the bands from the top; the first bound reached wins
                for pct, label in bands:
                    if value >= percentiles[pct]:
                        return label
                return 'below_75_pct'

            df['percent_category'] = fst_values.apply(categorize)
            print(f"Added 'percent_category' to DataFrame: {name}")

        except Exception as e:
            print(f"An error occurred while processing DataFrame {name}: {e}")


# Applying the function to assign the percentile category to each WEIGHTED_FST value
# NOTE: the loop variable is not called `dict`, because at top level that would
# rebind the built-in `dict` type to a string for all code that runs afterwards.
for comparison_name in ["Ethiopia_vs_India", "Ethiopia_vs_Yemen", "Yemen_vs_India", "YemenNEthiopia_vs_India"]:

    # Access the DataFrame dictionary dynamically
    print(comparison_name)
    data_frames_dict = globals()[comparison_name]
    add_percent_category(data_frames_dict)

Ethiopia_vs_India
{75: np.float64(0.149153), 90: np.float64(0.19013300000000002), 95: np.float64(0.22529059999999998), 99: np.float64(0.32260711999999997)}
Added 'percent_category' to DataFrame: allChrs_BA
{75: np.float64(0.200837), 90: np.float64(0.26663040000000005), 95: np.float64(0.3193257999999998), 99: np.float64(0.5334296799999999)}
Added 'percent_category' to DataFrame: NC_050201.1
{75: np.float64(0.14612325), 90: np.float64(0.1887789), 95: np.float64(0.22163499999999997), 99: np.float64(0.31356327999999994)}
Added 'percent_category' to DataFrame: NC_050202.1
{75: np.float64(0.137368), 90: np.float64(0.16716940000000002), 95: np.float64(0.19068915), 99: np.float64(0.2530899099999999)}
Added 'percent_category' to DataFrame: NC_050203.1
Ethiopia_vs_Yemen
{75: np.float64(0.08604115), 90: np.float64(0.11826420000000001), 95: np.float64(0.14395840000000001), 99: np.float64(0.21076234000000002)}
Added 'percent_category' to DataFrame: allChrs_BA
{75: np.float64(0.158885), 90: np.float

Dividing the dataframes based on the categories

In [None]:
def split_data_frames_by_category(data_frames):
    """
    Break each DataFrame into one sub-DataFrame per 'percent_category' value.

    DataFrames without a 'percent_category' column are reported and left out
    of the result; a DataFrame whose split fails is reported and left out as
    well.

    Args:
        data_frames (dict): Dictionary where values are pandas DataFrames.

    Returns:
        dict: A dictionary of dictionaries. Outer keys are the DataFrame names;
              each inner dictionary maps a 'percent_category' value to the rows
              carrying that value.
    """
    frames_by_category = {}

    for name, frame in data_frames.items():
        try:
            if 'percent_category' in frame.columns:
                # One entry per category, holding the rows of that category
                frames_by_category[name] = {
                    label: rows for label, rows in frame.groupby('percent_category')
                }
                print(f"Split DataFrame {name} into categories.")
            else:
                print(f"DataFrame {name} does not have 'percent_category' column. Skipping.")

        except Exception as e:
            print(f"An error occurred while splitting DataFrame {name}: {e}")

    return frames_by_category


# Running the function on the data frames in dictionaries
# NOTE: the loop variable is not called `dict`, because at top level that would
# rebind the built-in `dict` type to a string for all code that runs afterwards.
for comparison_name in ["Ethiopia_vs_India", "Ethiopia_vs_Yemen", "Yemen_vs_India", "YemenNEthiopia_vs_India"]:
    if __name__ == "__main__":
        # Name of the global variable that stores the split dictionary
        dictionary_name = comparison_name + "_splitOnPct"

        # Split every DataFrame of this comparison by percentile category
        globals()[dictionary_name] = split_data_frames_by_category(globals()[comparison_name])

        # Access the DataFrame dictionary dynamically
        data_frames_dict = globals()[dictionary_name]

        # Display the result structure. This stays inside the guard so it can
        # only report the dictionary that was produced just above.
        for df_name, categories in data_frames_dict.items():
            print(f"\nDataFrame: {df_name}")
            for category, df in categories.items():
                print(f"Category: {category}, Number of Rows: {len(df)}")

## Saving the lists of genes for each category of percentiles for each chromosome of each population in separate text files.

In [32]:
# Define the folder where text files will be saved
output_folder = "Fst_unique_genes_text_files"
os.makedirs(output_folder, exist_ok=True)

# Characters that are replaced with '_' when building output file names
unsafe_filename_chars = re.compile(r'[<>:"/\\|?*]')

# Write one gene list per comparison, DataFrame and percentile category.
# NOTE: the loop variable is not called `dict`, because at top level that would
# rebind the built-in `dict` type to a string for all code that runs afterwards.
for split_dict_name in ["Ethiopia_vs_India_splitOnPct", "Ethiopia_vs_Yemen_splitOnPct", "Yemen_vs_India_splitOnPct", "YemenNEthiopia_vs_India_splitOnPct"]:

    # Access the DataFrame dictionary dynamically
    data_frames_dict = globals()[split_dict_name]

    # Loop over the dictionary of dictionaries
    for outer_key, inner_dict in data_frames_dict.items():
        for inner_key, df in inner_dict.items():
            # Collect every gene that appears in any window of this category
            unique_genes = set()
            for gene_list in df['Genes']:
                unique_genes.update(gene_list)

            # Create a meaningful and sanitized file name
            sanitized_outer_key = unsafe_filename_chars.sub('_', outer_key)
            sanitized_inner_key = unsafe_filename_chars.sub('_', inner_key)
            file_name = f"{split_dict_name}_{sanitized_outer_key}_{sanitized_inner_key}_unique_genes.txt".replace(" ", "_")
            file_path = os.path.join(output_folder, file_name)

            # Save the genes one per line, sorted alphabetically; the explicit
            # encoding keeps the output independent of the platform default
            with open(file_path, 'w', encoding='utf-8') as f:
                f.writelines(f"{gene}\n" for gene in sorted(unique_genes))

            # Print a message to indicate progress
            print(f"Saved: {file_name}")

Saved: Ethiopia_vs_India_splitOnPct_allChrs_BA_above_99_pct_unique_genes.txt
Saved: Ethiopia_vs_India_splitOnPct_allChrs_BA_below_75_pct_unique_genes.txt
Saved: Ethiopia_vs_India_splitOnPct_allChrs_BA_bw_75N90_pct_unique_genes.txt
Saved: Ethiopia_vs_India_splitOnPct_allChrs_BA_bw_90N95_pct_unique_genes.txt
Saved: Ethiopia_vs_India_splitOnPct_allChrs_BA_bw_95N99_pct_unique_genes.txt
Saved: Ethiopia_vs_India_splitOnPct_NC_050201.1_above_99_pct_unique_genes.txt
Saved: Ethiopia_vs_India_splitOnPct_NC_050201.1_below_75_pct_unique_genes.txt
Saved: Ethiopia_vs_India_splitOnPct_NC_050201.1_bw_75N90_pct_unique_genes.txt
Saved: Ethiopia_vs_India_splitOnPct_NC_050201.1_bw_90N95_pct_unique_genes.txt
Saved: Ethiopia_vs_India_splitOnPct_NC_050201.1_bw_95N99_pct_unique_genes.txt
Saved: Ethiopia_vs_India_splitOnPct_NC_050202.1_above_99_pct_unique_genes.txt
Saved: Ethiopia_vs_India_splitOnPct_NC_050202.1_below_75_pct_unique_genes.txt
Saved: Ethiopia_vs_India_splitOnPct_NC_050202.1_bw_75N90_pct_unique_g