## Perform Fisher's Exact Tests for differences in allele frequencies between population groups

In [7]:
# Import modules and packages

import os
import sys
from scipy.stats import fisher_exact
import pandas as pd
import numpy as np
import itertools
from warnings import simplefilter
from statsmodels.stats.multitest import multipletests


In [2]:
# Notebook-wide constants and lookup tables

# Project root: the parent of the directory the notebook is run from
home_path = os.path.dirname(os.getcwd())
population_clusters = ["SUPER", "SUB"]

# Names of the genomic locations (genes) to analyse, read from the locations table
gene_location_df = pd.read_csv(os.path.join(home_path, "Data_descriptions", "locations.csv"))
genes = gene_location_df["location_name"]

# Distinct sub-population labels, in order of first appearance in the sample sheet
sample_info_df = pd.read_csv(os.path.join(home_path, "Data_descriptions", "samples.csv"))
sub_populations = sample_info_df["SUB"].unique().tolist()

# Sub-population -> regional group.
# SA = South Africa, CA = Central Africa, WA = West Africa, EA = East Africa,
# ACB = African Caribbean in Barbados, ASW = African American, KS = Khoi-San
regional_classification = {
    "ACB": "ACB",
    "ASW": "ASW",
    "GWD": "WA",
    "ESN": "WA",
    "MSL": "WA",
    "MbutiPygmy": "CA",
    "BiakaPygmy": "CA",
    "Mandenka": "WA",
    "Yoruba": "WA",
    "San": "KS",
    "BantuSouthAfrica": "SA",
    "BantuKenya": "EA",
    "YRI": "WA",
    "LWK": "EA",
}

def add_prefix_dataframe_col_names(dataframe, columns_list, prefix):
    """Return ``dataframe`` with ``prefix`` prepended to the names of selected columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns should be renamed. It is not modified; a renamed
        copy is returned.
    columns_list : iterable
        Column names to prefix. Names that are not columns of ``dataframe``
        have no effect, and repeated names are handled once.
    prefix : str
        Text placed in front of each selected column name.

    Returns
    -------
    pandas.DataFrame
        The frame with the selected columns renamed to ``prefix + name``.
    """
    prefixed_names = {name: prefix + "{}".format(name) for name in columns_list}
    return dataframe.rename(columns=prefixed_names)

In [3]:
# Import allele count data

# Read one allele-count table per (gene, sub-population) pair and stack them.
# Frames are collected in a list and concatenated once at the end, which avoids
# re-copying the growing table on every iteration.
allele_count_frames = []
for gene in genes:
    for sub_population in sub_populations:
        allele_count_path = os.path.join(home_path, "Data", "SUB", "ALL_{}.{}.acount".format(gene, sub_population))
        if not os.path.exists(allele_count_path):
            # No count file for this pair: skip it. Appending here would either
            # duplicate the previously loaded frame or fail on an undefined name.
            continue
        allele_count_df = pd.read_csv(allele_count_path, sep="\t")
        # Reference allele count = total observed alleles minus alternate allele count
        allele_count_df["REF_CTS"] = allele_count_df["OBS_CT"] - allele_count_df["ALT_CTS"]
        allele_count_df["GENE"] = gene
        allele_count_df["SUB_POP"] = sub_population
        allele_count_frames.append(allele_count_df.drop(columns="OBS_CT"))

if not allele_count_frames:
    raise FileNotFoundError(
        "No allele count files found in {}".format(os.path.join(home_path, "Data", "SUB"))
    )

# Each file keeps its own row labels (no ignore_index), as before
collated_count_data = pd.concat(allele_count_frames)

collated_count_data

Unnamed: 0,#CHROM,ID,REF,ALT,ALT_CTS,REF_CTS,GENE,SUB_POP
0,13,rs1185300901,C,CT,0,188,COL4A1,ACB
1,13,rs552586867,C,G,0,188,COL4A1,ACB
2,13,rs59409892,C,G,19,169,COL4A1,ACB
3,13,rs535182970,G,C,1,187,COL4A1,ACB
4,13,rs56406633,A,G,1,187,COL4A1,ACB
...,...,...,...,...,...,...,...,...
158,21,rs981058453,C,T,0,104,OLIG2,ASW
159,21,rs182058038,T,A,1,103,OLIG2,ASW
160,21,rs7278343,A,G,21,83,OLIG2,ASW
161,21,rs151281307,AT,A,1,103,OLIG2,ASW


## Reshape allele count data into a format suitable for Fisher's exact test

In [4]:
# Columns that together identify one variant
variant_key_columns = ["#CHROM", "ID", "REF", "ALT"]

# Wide table: one row per variant, one column per (count type, sub-population)
fishers_data = (
    collated_count_data[collated_count_data["ID"] != "."]  # drop rows whose variant ID is the "." placeholder
    .drop_duplicates(variant_key_columns + ["SUB_POP"])    # keep one row per variant per sub-population
    .pivot(index=variant_key_columns, columns="SUB_POP", values=["ALT_CTS", "REF_CTS"])
)

# Split alternate and reference counts into separate frames so that the
# sub-population columns of each can be given their own prefix
fishers_data_alt = add_prefix_dataframe_col_names(
    fishers_data[["ALT_CTS"]].droplevel(level=0, axis=1).reset_index(),
    sub_populations,
    "ALT_CT_",
)
fishers_data_ref = add_prefix_dataframe_col_names(
    fishers_data[["REF_CTS"]].droplevel(level=0, axis=1).reset_index(),
    sub_populations,
    "REF_CT_",
)

# Join the prefixed alternate and reference counts back together on the variant key
fishers_data_renamed = fishers_data_alt.merge(fishers_data_ref, on=variant_key_columns)
print(fishers_data_renamed)

SUB_POP  #CHROM            ID REF ALT  ALT_CT_ACB  ALT_CT_ASW  \
0             1  rs1000680496   G   A           0           0   
1             1  rs1001911383   C   T           0           0   
2             1  rs1002151986   A   C           0           0   
3             1  rs1002155376   G   T           0           0   
4             1  rs1002228547   G   C           1           0   
...         ...           ...  ..  ..         ...         ...   
20302        21   rs972037300   C   G           0           0   
20303        21   rs975220347   C   T           0           0   
20304        21   rs981058453   C   T           0           0   
20305        21   rs997790265   C  CG           0           0   
20306        21     rs9982080   G   T          29          16   

SUB_POP  ALT_CT_BantuKenya  ALT_CT_BantuSouthAfrica  ALT_CT_BiakaPygmy  \
0                        0                        0                  0   
1                        0                        0                  0 

## Perform Fisher's Exact test

In [5]:
# Every unordered pair of sub-populations; each pair is one comparison in the tests below
population_pairs = itertools.combinations(sub_populations, 2)
combinations = list(population_pairs)
combinations


[('ACB', 'GWD'),
 ('ACB', 'ESN'),
 ('ACB', 'MSL'),
 ('ACB', 'MbutiPygmy'),
 ('ACB', 'BiakaPygmy'),
 ('ACB', 'Mandenka'),
 ('ACB', 'Yoruba'),
 ('ACB', 'San'),
 ('ACB', 'BantuSouthAfrica'),
 ('ACB', 'BantuKenya'),
 ('ACB', 'YRI'),
 ('ACB', 'LWK'),
 ('ACB', 'ASW'),
 ('GWD', 'ESN'),
 ('GWD', 'MSL'),
 ('GWD', 'MbutiPygmy'),
 ('GWD', 'BiakaPygmy'),
 ('GWD', 'Mandenka'),
 ('GWD', 'Yoruba'),
 ('GWD', 'San'),
 ('GWD', 'BantuSouthAfrica'),
 ('GWD', 'BantuKenya'),
 ('GWD', 'YRI'),
 ('GWD', 'LWK'),
 ('GWD', 'ASW'),
 ('ESN', 'MSL'),
 ('ESN', 'MbutiPygmy'),
 ('ESN', 'BiakaPygmy'),
 ('ESN', 'Mandenka'),
 ('ESN', 'Yoruba'),
 ('ESN', 'San'),
 ('ESN', 'BantuSouthAfrica'),
 ('ESN', 'BantuKenya'),
 ('ESN', 'YRI'),
 ('ESN', 'LWK'),
 ('ESN', 'ASW'),
 ('MSL', 'MbutiPygmy'),
 ('MSL', 'BiakaPygmy'),
 ('MSL', 'Mandenka'),
 ('MSL', 'Yoruba'),
 ('MSL', 'San'),
 ('MSL', 'BantuSouthAfrica'),
 ('MSL', 'BantuKenya'),
 ('MSL', 'YRI'),
 ('MSL', 'LWK'),
 ('MSL', 'ASW'),
 ('MbutiPygmy', 'BiakaPygmy'),
 ('MbutiPygmy', 'Ma

In [6]:
# Sanity check on a degenerate table whose second row is all zeros:
# the odds ratio comes back as nan and the p-value as 1.0
degenerate_table = [[186, 228], [0, 0]]
oddsratio, pvalue = fisher_exact(degenerate_table)
print(oddsratio, pvalue)

nan 1.0


In [7]:
# Calculate Fisher's exact odds ratios and p-values for each variant for the different population combinations

simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Build one plain dict per variant and create the DataFrame once at the end.
# Growing a DataFrame column by column and concatenating it once per variant
# re-copies the accumulated results on every iteration.
fisher_records = []
for _row_label, row in fishers_data_renamed.iterrows():
    # Variant identifiers come first so they are the leading columns of the result
    record = {
        "CHROM": row["#CHROM"],
        "ID": row["ID"],
        "REF": row["REF"],
        "ALT": row["ALT"],
    }
    for first_pop, second_pop in combinations:
        # 2x2 contingency table: rows = allele (ALT, REF), columns = population
        # NOTE(review): a variant missing from one sub-population would presumably
        # be NaN here after the pivot; confirm the inputs are complete.
        contingency_table = [
            [row["ALT_CT_{}".format(first_pop)], row["ALT_CT_{}".format(second_pop)]],
            [row["REF_CT_{}".format(first_pop)], row["REF_CT_{}".format(second_pop)]],
        ]
        oddsratio, pvalue = fisher_exact(contingency_table)
        record["PVALUE_{}_{}".format(first_pop, second_pop)] = pvalue
        record["OR_{}_{}".format(first_pop, second_pop)] = oddsratio
    fisher_records.append(record)

# One row per variant; columns are CHROM, ID, REF, ALT, then a PVALUE_/OR_ pair per population combination
fishers_results = pd.DataFrame(fisher_records)


In [8]:
# Save Fisher's test results to CSV file

fishers_output_dir = os.path.join(home_path, "Data", "FishersExactTest")
# to_csv does not create missing directories, so make sure the target exists first
os.makedirs(fishers_output_dir, exist_ok=True)

fishers_results.reset_index(drop=True).to_csv(os.path.join(fishers_output_dir, "Fishers_results_subpopulations.csv"))

In [9]:
# Correct for multiple testing by controlling the false discovery rate (FDR) using Benjamini and Hochberg

# Names of the p-value columns in fishers_results, one per population combination
p_value_combinations_list = [
    "PVALUE_{}_{}".format(first_pop, second_pop) for first_pop, second_pop in combinations
]

# Method 1: correct each population combination separately, i.e. across all
# variants within one p-value column at a time

fdr_columns = {}
for p_value_column in p_value_combinations_list:
    # multipletests returns a tuple; element 1 holds the adjusted p-values
    adjusted_pvalues = multipletests(fishers_results[p_value_column], alpha=0.05, method="fdr_bh")[1]
    fdr_columns["{}_FDR".format(p_value_column)] = adjusted_pvalues

# Append the *_FDR columns to the right of the original results
corrected_pvalues = pd.concat(
    [fishers_results.reset_index(drop=True), pd.DataFrame(fdr_columns).reset_index(drop=True)],
    axis=1,
)
corrected_pvalues.to_csv(os.path.join(home_path, "Data", "FishersExactTest", "Fishers_multipletestcorrection_fdrbh_pd.csv"))

# Method 2: correct across the entire dataset at once, pooling the p-values of
# every variant and every population combination

# Long format: one row per (variant, p-value column), with the p-value in "value"
fishers_results_melt = pd.melt(fishers_results, id_vars=["CHROM", "ID", "REF", "ALT"], value_vars=p_value_combinations_list)
multipletests_input = fishers_results_melt["value"].tolist()
multipletests_results = multipletests(multipletests_input, alpha=0.05, method="fdr_bh")
# Adjusted p-values become a single unnamed column (labelled 0) next to the melted rows
multipletests_results_df = pd.DataFrame(multipletests_results[1])
multipletests_results_info = pd.concat(
    [fishers_results_melt.reset_index(drop=True), multipletests_results_df.reset_index(drop=True)],
    axis=1,
)

multipletests_results_info.to_csv(os.path.join(home_path, "Data", "FishersExactTest", "Fishers_multipletestcorrection_fdrbh_wd.csv"))