#  Regression analysis

* **Project:** ADRD Genetic Diversity in Biobanks
* **Version:** Python/3.10
* **Last Updated:** 28-October-2024

## Notebook Overview
Use logistic regression to analyze the protective and conditional models, and create covariate files for the r2 and interaction models.

## Variables used 
`${ancestry}` = EUR, AFR, AMR, EAS, SAS, AAC, MDE, AJ, FIN, CAS, CAH

`chr${}`:Position:A1:A2 = chromosome number, position, reference allele, and alternative allele

`${chr}` = 2, 19, 7, 21, 4, 11, 14, 20, 15, 16, 17

`${APOE}`= e4 carriers, e4e4 carriers, e3e3 carriers

`${APOE genotype}`= e3/e4, e4/e4, e1/e4, e3/e3

### Using logistic regression to analyze the protective model

#### Extract variants and generate VCF files

In [None]:
import numpy as np
import pandas as pd
import sys
from functools import reduce
import argparse

#### Create a covariate file including principal components (PCs)

##### Generate PCs

In [None]:
import pandas as pd

# Merge the per-ancestry PCA eigenvec files into one table.
# WORK_DIR is expected to be defined before this cell runs, as in the later
# cells of this notebook that reference it through f-strings.

# Tags embedded in the eigenvec file names (EUR, FIN and MDE share one file).
pca_groups = ["AAC", "AFR", "AJ", "AMR", "CAH", "CAS", "EAS", "EUR_FIN_MDE", "SAS"]
file_names = [
    f"{WORK_DIR}/QC_unrelated_FILTERED.merged_biallelic_{group}_PCA.eigenvec"
    for group in pca_groups
]

# Column names applied to every file: two ID columns followed by PC1..PC10
pc_columns = ["#FID", "IID"] + [f"PC{i}" for i in range(1, 11)]

# Read each file without a header row; comment='#' makes pandas skip anything
# from a '#' onward (presumably the '#FID ...' header line of each file).
# sep=r"\s+" replaces delim_whitespace=True, which is deprecated in pandas >= 2.2.
dataframes = [
    pd.read_csv(file_name, sep=r"\s+", comment='#', header=None, names=pc_columns)
    for file_name in file_names
]

# Stack all ancestries into a single DataFrame
merged_df = pd.concat(dataframes, ignore_index=True)

# Save into WORK_DIR, because the next cell loads
# {WORK_DIR}/All_PCS_ADSP_Correct.eigenvec (previously written to the cwd).
merged_df.to_csv(f"{WORK_DIR}/All_PCS_ADSP_Correct.eigenvec", sep='\t', index=False)

# Display the first few rows of the merged DataFrame
print(merged_df.head())


##### Add PCs to the covariate file

In [None]:
import pandas as pd

# Attach the principal components to the QC covariate table.
# WORK_DIR is expected to be defined before this cell runs, as in the later
# cells of this notebook that reference it through f-strings.

# Load the covars_for_QC.txt file
covars_df = pd.read_csv(f"{WORK_DIR}/covars_for_QC.txt", sep="\t")

# Load the merged eigenvec file. It is saved with a header row, so the column
# names are taken from that row. The previous version built a `names` list from
# pca_df.shape before pca_df existed (NameError), and passing `names` without
# header=0 would also have loaded the header row as a data row.
pca_df = pd.read_csv(f"{WORK_DIR}/All_PCS_ADSP_Correct.eigenvec", sep="\t", header=0)

# Rename '#FID' to 'FID' so the key matches the covariate table
pca_df = pca_df.rename(columns={"#FID": "FID"})

# Keep only samples present in both tables
merged_df = pd.merge(covars_df, pca_df, on=['FID', 'IID'], how='inner')

# Save the merged dataframe to a new file
merged_df.to_csv(f"{WORK_DIR}/covars_alldata_PCA.txt", sep="\t", index=False)

In [None]:
import pandas as pd

# Attach the principal components to the QC covariate table, coding missing
# AGE and RACE values as -999.
# WORK_DIR is expected to be defined before this cell runs, as in the later
# cells of this notebook that reference it through f-strings.

# Load the covars_for_QC.txt file
covars_df = pd.read_csv(f"{WORK_DIR}/covars_for_QC.txt", sep="\t")

# Missing values in 'AGE' and 'RACE' (NaN after loading) become -999
covars_df[['AGE', 'RACE']] = covars_df[['AGE', 'RACE']].fillna(-999)

# Load the merged eigenvec file once. It is saved with a header row, so the
# column names come from that row; the previous version read the file twice and
# passed `names` without header=0, which loads the header row as a data row.
pca_df = pd.read_csv(f"{WORK_DIR}/All_PCS_ADSP_Correct.eigenvec", sep="\t", header=0)

# Rename '#FID' to 'FID' so the key matches the covariate table
pca_df = pca_df.rename(columns={"#FID": "FID"})

# Keep only samples present in both tables
merged_df = pd.merge(covars_df, pca_df, on=['FID', 'IID'], how='inner')

# Save the merged dataframe to a new file
merged_df.to_csv(f"{WORK_DIR}/covars_alldata_with999forAGRandRACE_PCA.txt", sep="\t", index=False)

#### Create PHENO files for each ancestry

#### Logistic regression for protective model

#### Preparing a table for the results

In [None]:
import pandas as pd
import os

# Add an 'ancestry' column to every per-ancestry logistic regression result.

# Directory holding the result files. WORK_DIR is expected to be defined before
# this cell runs; the following cell reads these outputs via f"{WORK_DIR}".
dir_path = f"{WORK_DIR}"

# Ancestry codes, in processing order. Each file name is derived from its code
# so the (file, ancestry) pairs cannot drift out of sync.
ancestry_codes = ["AAC", "AFR", "AMR", "AJ", "EUR", "CAS", "SAS", "MDE", "EAS", "FIN", "CAH"]
files_and_ancestries = [
    (f"Logistic_FID_IID_PHENO_case_controls_{code}_All_variants.txt", code)
    for code in ancestry_codes
]

# Process each file
for file_name, ancestry in files_and_ancestries:
    file_path = os.path.join(dir_path, file_name)

    # Load the data
    df = pd.read_csv(file_path, sep='\t')

    # Label every row with the ancestry the file belongs to
    df['ancestry'] = ancestry

    # Save next to the input under a "_with_ancestry" suffix
    output_file_name = file_name.replace(".txt", "_with_ancestry.txt")
    output_file_path = os.path.join(dir_path, output_file_name)
    df.to_csv(output_file_path, sep='\t', index=False)

    print(f"Updated file saved to {output_file_path}")

In [None]:
import pandas as pd
import os

# Concatenate the ancestry-tagged logistic regression results into one file.

# Define the directory path
dir_path = f"{WORK_DIR}"

# Ancestry codes in the order their rows appear in the combined file. File
# names are derived from the codes instead of being spelled out one by one.
ancestry_codes = ["EAS", "SAS", "CAS", "EUR", "AMR", "AAC", "AFR", "AJ", "MDE", "CAH", "FIN"]
file_paths = [
    f"Logistic_FID_IID_PHENO_case_controls_{code}_All_variants_with_ancestry.txt"
    for code in ancestry_codes
]

# Read every file into its own DataFrame
df_list = [
    pd.read_csv(os.path.join(dir_path, file_name), sep='\t')
    for file_name in file_paths
]

# Concatenate all DataFrames in the list
combined_df = pd.concat(df_list, ignore_index=True)

# Save the combined DataFrame to a new file
output_file_path = os.path.join(dir_path, "Combined_Logistic_with_ancestry.hybrid")
combined_df.to_csv(output_file_path, sep='\t', index=False)

print(f"Combined data saved to {output_file_path}")


In [None]:
import pandas as pd

# Keep only the rows of the combined results whose TEST column equals "ADD".
combined_results_path = f"{WORK_DIR}/Combined_Logistic_with_ancestry.hybrid"
output_file_path = f"{WORK_DIR}/Filtered_Combined_Logistic_with_ancestry.hybrid"

df = pd.read_csv(combined_results_path, sep='\t')

# Boolean mask selecting the "ADD" rows
is_add_row = df['TEST'] == 'ADD'
filtered_df = df.loc[is_add_row]

# Write the surviving rows out as a tab-separated file
filtered_df.to_csv(output_file_path, sep='\t', index=False)

print(f"Filtered data saved to {output_file_path}")

In [None]:
import pandas as pd

# Reduce the filtered results to the columns needed for the summary table.
filtered_results_path = f"{WORK_DIR}/Filtered_Combined_Logistic_with_ancestry.hybrid"
output_file_path = f"{WORK_DIR}/Selected_Columns_Filtered_Combined_Logistic_with_ancestry.hybrid"

df = pd.read_csv(filtered_results_path, sep='\t')

# Columns to carry forward, in output order
columns_to_keep = [
    '#CHROM', 'POS', 'REF', 'ALT',
    'A1', 'P', 'OR', 'L95', 'U95',
    'ancestry',
]
filtered_df = df.loc[:, columns_to_keep]

# Write the reduced table out as a tab-separated file
filtered_df.to_csv(output_file_path, sep='\t', index=False)

print(f"Data with selected columns saved to {output_file_path}")

In [None]:
import pandas as pd

# Build a human-readable table: one section per variant, one line per ancestry.
df = pd.read_csv(
    f"{WORK_DIR}/Selected_Columns_Filtered_Combined_Logistic_with_ancestry.hybrid",
    sep='\t',
)

# Ancestries are listed in this order inside every variant section
ancestry_order = ['EUR', 'AFR', 'AMR', 'EAS', 'SAS', 'MDE', 'AJ', 'FIN', 'AAC', 'CAS', 'CAH']

# Rows of the output table, accumulated section by section
combined_data = []

# A variant is identified by (#CHROM, POS, REF, ALT)
for (chrom, pos, ref, alt), group in df.groupby(['#CHROM', 'POS', 'REF', 'ALT']):
    # ancestry -> [A1, P, "OR (L95_U95)"]; a later row for the same ancestry
    # replaces an earlier one
    per_ancestry = {}
    for _, row in group.iterrows():
        per_ancestry[row['ancestry']] = [
            row['A1'],
            row['P'],
            f"{row['OR']} ({row['L95']}_{row['U95']})",
        ]

    # Section title followed by the column header row
    combined_data.extend([
        [f"{chrom}:{pos} {ref}>{alt}", "", "", ""],
        ["Ancestry", "A1", "P", "OR (L95_U95)"],
    ])

    # One data row per ancestry that has a result for this variant
    combined_data.extend(
        [code] + per_ancestry[code] for code in ancestry_order if code in per_ancestry
    )

    # Blank row separating this variant from the next
    combined_data.append(["", "", "", ""])

# Turn the collected rows into a DataFrame
combined_df = pd.DataFrame(combined_data)

# Write the table without header or index
output_file_path = "Combined_Variations_Table.tsv"
combined_df.to_csv(output_file_path, sep='\t', header=False, index=False)

print(f"Combined variations table saved to {output_file_path}")

### Using logistic regression to analyze the Conditional models

#### Create three covariate files that include APOE status for e4 carriers, e4/e4 carriers, and e3/e3 carriers

In [None]:
import pandas as pd
import os

# Select the individuals carrying the chosen APOE genotype(s) in every ancestry,
# save one file per ancestry, and save all of them combined.

# File paths. WORK_DIR and APOE are expected to be defined before this cell
# runs; the following cells read these outputs via f"{WORK_DIR}/..._{APOE}.csv".
input_dir = f"{WORK_DIR}"
output_dir = f"{WORK_DIR}"

# Genotypes to keep. Replace the placeholder according to the model:
#   e4 carriers:   'e3/e4', 'e4/e4', 'e1/e4'
#   e4e4 carriers: 'e4/e4'
#   e3e3 carriers: 'e3/e3'
# NOTE(review): the original comment spelled the last one 'e3e3'; confirm the
# spelling used in the APOE_GENOTYPE column.
apoe_genotypes = ['{APOE genotype}']

# Ancestry tags used in the input file names
# (check the 00_ADSP notebook for generating these files)
ancestry_codes = ["aac", "afr", "aj", "amr", "cah", "cas", "eas", "eur", "fin", "mde", "sas"]

# Per-ancestry results, concatenated once after the loop
per_ancestry_frames = []

for ancestry in ancestry_codes:
    file = f"adsp_vars_{ancestry}_apoe_unrelated_recode_test.APOE_GENOTYPES.csv"
    df = pd.read_csv(os.path.join(input_dir, file))

    # Keep matching rows and only the FID, IID and APOE_GENOTYPE columns
    keep_rows = df['APOE_GENOTYPE'].isin(apoe_genotypes)
    filtered_df = df.loc[keep_rows, ['FID', 'IID', 'APOE_GENOTYPE']]

    # Save the per-ancestry result (without the Ancestry column)
    output_file = os.path.join(output_dir, f"APOE_filtered_genotypes_{APOE}_{ancestry}.csv")
    filtered_df.to_csv(output_file, index=False)

    # assign() returns a new frame, so no column is set on a filtered slice
    per_ancestry_frames.append(filtered_df.assign(Ancestry=ancestry))

# Combine all ancestries
combined_df = pd.concat(per_ancestry_frames, ignore_index=True)

# The f-prefix was missing here, so the file name contained a literal "{APOE}"
# and did not match the name the following cells read.
combined_output_file = os.path.join(output_dir, f"APOE_filtered_genotypes_combined_{APOE}.csv")
combined_df.to_csv(combined_output_file, index=False)

print(f"Filtered results for each ancestry are saved separately, and the combined file is saved as {combined_output_file}")


In [None]:
import pandas as pd

# Count how many individuals carry each APOE genotype within each ancestry.
input_file = f"{WORK_DIR}/APOE_filtered_genotypes_combined_{APOE}.csv"
output_file = f"{WORK_DIR}/APOE_genotype_counts_{APOE}.csv"

df = pd.read_csv(input_file)

# One row per (Ancestry, APOE_GENOTYPE) pair, with its number of occurrences
counts = (
    df.groupby(['Ancestry', 'APOE_GENOTYPE'])
    .size()
    .reset_index(name='Count')
)

# Show the table, then store it as CSV
print(counts)
counts.to_csv(output_file, index=False)
print(f"Counts saved to {output_file}")


In [None]:
import pandas as pd

# Add an APOE_STATUS column to the covariate table.
covar_file = f"{WORK_DIR}/covars_alldata_with999forAGRandRACE_PCA.txt"
apoe_file = f"{WORK_DIR}/APOE_filtered_genotypes_combined_{APOE}.csv"
output_file = f"{WORK_DIR}/covars_alldata_with999forAGRandRACE_PCA_{APOE}.txt"

# Covariates (tab-separated) and the selected APOE genotype carriers (CSV)
covar_df = pd.read_csv(covar_file, sep="\t")
apoe_df = pd.read_csv(apoe_file)

# Left merge on FID and IID: every covariate row is kept, and the '_merge'
# indicator records whether the individual also appears in the APOE file
carrier_ids = apoe_df[['FID', 'IID']]
merged_df = pd.merge(covar_df, carrier_ids, on=['FID', 'IID'], how='left', indicator=True)

# APOE_STATUS is the carrier code when the individual is in the APOE file, else 0.
# Carrier code per model, as noted by the authors:
#   e4 carriers   -> 1
#   e4e4 carriers -> 2
#   e3e3 carriers -> 2
carrier_code = 1
merged_df['APOE_STATUS'] = [
    carrier_code if flag == 'both' else 0 for flag in merged_df['_merge']
]

# The indicator column was only needed to derive APOE_STATUS
merged_df = merged_df.drop(columns=['_merge'])

# Write the extended covariate table
merged_df.to_csv(output_file, sep='\t', index=False)

print(f"File saved as {output_file}")


#### Logistic regression for conditional model

#### Preparing tables for the results

In [None]:
import pandas as pd
import os

# Add an 'ancestry' column to every per-ancestry conditional-model result.

# Define the directory path
dir_path = f"{WORK_DIR}"

# Ancestry codes, in processing order. Each file name is derived from its code
# so the (file, ancestry) pairs cannot drift out of sync.
ancestry_codes = ["AAC", "AFR", "AMR", "AJ", "EUR", "CAS", "SAS", "MDE", "EAS", "FIN", "CAH"]
files_and_ancestries = [
    (f"Logistic_FID_IID_PHENO_case_controls_{code}_All_variants_{APOE}.txt", code)
    for code in ancestry_codes
]

# Process each file
for file_name, ancestry in files_and_ancestries:
    file_path = os.path.join(dir_path, file_name)

    # Load the data
    df = pd.read_csv(file_path, sep='\t')

    # Label every row with the ancestry the file belongs to
    df['ancestry'] = ancestry

    # Save next to the input under a "_with_ancestry" suffix
    output_file_name = file_name.replace(".txt", "_with_ancestry.txt")
    output_file_path = os.path.join(dir_path, output_file_name)
    df.to_csv(output_file_path, sep='\t', index=False)

    print(f"Updated file saved to {output_file_path}")


In [None]:
import pandas as pd
import os

# Concatenate the ancestry-tagged conditional-model results into one file.

# Define the directory path
dir_path = f"{WORK_DIR}"

# Ancestry codes in the order their rows appear in the combined file. File
# names are derived from the codes instead of being spelled out one by one.
ancestry_codes = ["EAS", "SAS", "CAS", "EUR", "AMR", "AAC", "AFR", "AJ", "MDE", "CAH", "FIN"]
file_paths = [
    f"Logistic_FID_IID_PHENO_case_controls_{code}_All_variants_{APOE}_with_ancestry.txt"
    for code in ancestry_codes
]

# Read every file into its own DataFrame
df_list = [
    pd.read_csv(os.path.join(dir_path, file_name), sep='\t')
    for file_name in file_paths
]

# Concatenate all DataFrames in the list
combined_df = pd.concat(df_list, ignore_index=True)

# Save the combined DataFrame to a new file
output_file_path = os.path.join(dir_path, f"Combined_Logistic_{APOE}_with_ancestry.hybrid")
combined_df.to_csv(output_file_path, sep='\t', index=False)

print(f"Combined data saved to {output_file_path}")

In [None]:
import pandas as pd

# Keep only the rows of the combined conditional results whose TEST column
# equals "ADD".
combined_results_path = f"{WORK_DIR}/Combined_Logistic_{APOE}_with_ancestry.hybrid"
output_file_path = f"{WORK_DIR}/Filtered_Combined_Logistic_{APOE}_with_ancestry.hybrid"

df = pd.read_csv(combined_results_path, sep='\t')

# Boolean mask selecting the "ADD" rows
is_add_row = df['TEST'] == 'ADD'
filtered_df = df.loc[is_add_row]

# Write the surviving rows out as a tab-separated file
filtered_df.to_csv(output_file_path, sep='\t', index=False)

print(f"Filtered data saved to {output_file_path}")

In [None]:
import pandas as pd

# Reduce the filtered conditional results to the columns needed for the
# summary table.
filtered_results_path = f"{WORK_DIR}/Filtered_Combined_Logistic_{APOE}_with_ancestry.hybrid"
output_file_path = f"{WORK_DIR}/Selected_Columns_Filtered_Combined_Logistic_{APOE}_with_ancestry.hybrid"

df = pd.read_csv(filtered_results_path, sep='\t')

# Columns to carry forward, in output order
columns_to_keep = [
    '#CHROM', 'POS', 'REF', 'ALT',
    'A1', 'P', 'OR', 'L95', 'U95',
    'ancestry',
]
filtered_df = df.loc[:, columns_to_keep]

# Write the reduced table out as a tab-separated file
filtered_df.to_csv(output_file_path, sep='\t', index=False)

print(f"Data with selected columns saved to {output_file_path}")

In [None]:
import pandas as pd

# Build a human-readable table for the conditional model: one section per
# variant, one line per ancestry.
df = pd.read_csv(
    f"{WORK_DIR}/Selected_Columns_Filtered_Combined_Logistic_{APOE}_with_ancestry.hybrid",
    sep='\t',
)

# Ancestries are listed in this order inside every variant section
ancestry_order = ['EUR', 'AFR', 'AMR', 'EAS', 'SAS', 'MDE', 'AJ', 'FIN', 'AAC', 'CAS', 'CAH']

# Rows of the output table, accumulated section by section
combined_data = []

# A variant is identified by (#CHROM, POS, REF, ALT)
for (chrom, pos, ref, alt), group in df.groupby(['#CHROM', 'POS', 'REF', 'ALT']):
    # ancestry -> [A1, P, "OR (L95_U95)"]; a later row for the same ancestry
    # replaces an earlier one
    per_ancestry = {}
    for _, row in group.iterrows():
        per_ancestry[row['ancestry']] = [
            row['A1'],
            row['P'],
            f"{row['OR']} ({row['L95']}_{row['U95']})",
        ]

    # Section title followed by the column header row
    combined_data.extend([
        [f"{chrom}:{pos} {ref}>{alt}", "", "", ""],
        ["Ancestry", "A1", "P", "OR (L95_U95)"],
    ])

    # One data row per ancestry that has a result for this variant
    combined_data.extend(
        [code] + per_ancestry[code] for code in ancestry_order if code in per_ancestry
    )

    # Blank row separating this variant from the next
    combined_data.append(["", "", "", ""])

# Turn the collected rows into a DataFrame
combined_df = pd.DataFrame(combined_data)

# Write the table without header or index
output_file_path = f"Combined_Variations_Table_{APOE}.tsv"
combined_df.to_csv(output_file_path, sep='\t', header=False, index=False)

print(f"Combined variations table saved to {output_file_path}")


#### Create three covariate files that include APOE status (e4 carriers, e4/e4 carriers, and e3/e3 carriers) and protective/resilient variants

In [None]:
import pandas as pd
import vcf  

# Load the APOE-status covariate table and list the VCF files to read.

# The APOE-status cell earlier in this notebook writes this table as
# "..._PCA_{APOE}.txt"; the previous "PCAf_" spelling matched no file produced
# in this notebook.
covar_file = f'{WORK_DIR}/covars_alldata_with999forAGRandRACE_PCA_{APOE}.txt'
covar_df = pd.read_csv(covar_file, sep='\t')

# Index by FID so genotypes can later be looked up by sample name
covar_df = covar_df.set_index('FID')

# One VCF file per chromosome carrying variants of interest, in reading order
chromosomes = [2, 19, 7, 21, 4, 11, 14, 20, 15, 16, 17]
vcf_files = [f'{WORK_DIR}/vars_chr{chrom}_vcf.vcf' for chrom in chromosomes]

# variant ID -> {sample name -> genotype}, filled by extract_genotypes()
genotype_data = {}

# Function to extract genotypes from a VCF file
def extract_genotypes(vcf_file):
    """Record every sample's genotype for every variant of one VCF file.

    Results are stored in the module-level ``genotype_data`` dict as
    ``genotype_data[variant_id][sample_name] = GT``.

    Args:
        vcf_file: Path to a VCF file, opened in text mode.
    """
    # The original left the handle open; the context manager closes it once all
    # records have been consumed. The loop stays inside the block because the
    # reader is iterated while the file is open.
    with open(vcf_file, 'r') as handle:
        for record in vcf.Reader(handle):
            # NOTE(review): record.ID is the dictionary key; records lacking an
            # ID would presumably share a single key. Confirm that every
            # variant in these files has an ID.
            per_sample = genotype_data.setdefault(record.ID, {})

            for sample in record.samples:
                # The sample name is expected to match the FID of the covariate table
                per_sample[sample.sample] = sample['GT']  # e.g. 0/1, 1/1

# Fill genotype_data from every VCF file in the list
for path in vcf_files:
    extract_genotypes(path)

# One new covariate column per variant: each index value (FID) is looked up in
# that variant's sample -> genotype mapping
for variant_id in genotype_data:
    covar_df[variant_id] = covar_df.index.map(genotype_data[variant_id])

# Write the extended table; the index (FID) is written as well
output_file = f'{WORK_DIR}/covars_alldata_with999forAGRandRACE_PCA_{APOE}_allvariants.txt'
covar_df.to_csv(output_file, sep='\t')


In [None]:
import pandas as pd

# Read back the covariate table that carries the per-variant genotype columns
covar_df = pd.read_csv(
    f'{WORK_DIR}/covars_alldata_with999forAGRandRACE_PCA_{APOE}_allvariants.txt',
    sep='\t',
)

# Alternate-allele count for each recognised genotype call ('|' is treated as '/').
# NOTE(review): missing calls ('./.') are coded 0, the same as homozygous
# reference, as in the original mapping. Confirm this is intended.
_GENOTYPE_DOSAGE = {
    '0/0': 0,
    './.': 0,
    '0/1': 1,
    '1/0': 1,
    '1/1': 2,
}


# Function to map genotype codes to integers
def map_genotype(genotype):
    """Convert a genotype string into an integer code.

    Accepts unphased ('0/1') and phased ('0|1') calls, with the heterozygous
    alleles in either order.

    Args:
        genotype: Genotype string such as '0/0', '0/1', '1/1' or './.'.

    Returns:
        0, 1 or 2 for a recognised call, or None for anything else
        (including non-string values such as NaN).
    """
    if not isinstance(genotype, str):
        return None
    return _GENOTYPE_DOSAGE.get(genotype.replace('|', '/'))

# Genotype columns to transform, named by variant ID (chr{}:position:A1:A2).
# The original line was a bare placeholder and not valid Python; replace the
# placeholder string below with the real variant IDs before running.
genotype_columns = ["chr{}:position:A1:A2"]

# Report requested columns that are absent instead of skipping them silently
missing_columns = [column for column in genotype_columns if column not in covar_df.columns]
if missing_columns:
    print("Warning: genotype columns not found and left unmapped:", missing_columns)

# Apply the mapping function to the genotype columns that exist
for column in genotype_columns:
    if column in covar_df.columns:
        covar_df[column] = covar_df[column].apply(map_genotype)

# Save the updated DataFrame to a new file
output_file = f'{WORK_DIR}/covars_alldata_with999forAGRandRACE_PCA_{APOE}_allvariants_mapped.txt'
covar_df.to_csv(output_file, sep='\t', index=False)

print("Genotype mapping completed and saved to:", output_file)


#### Separate covariate files based on ancestry

In [None]:
import os
import pandas as pd

# Covariate table with mapped genotype columns; it is split per ancestry below
covar_df = pd.read_csv(
    f'{WORK_DIR}/covars_alldata_with999forAGRandRACE_PCA_{APOE}_allvariants_mapped.txt',
    sep='\t',
)

# Ancestry groups to split the table into
ancestries = [
    "EUR", "AFR", "AMR", "EAS", "SAS", "AAC",
    "MDE", "AJ", "FIN", "CAS", "CAH",
]

# Destination directory for the per-ancestry files (created if absent)
output_directory = f'{WORK_DIR}'
os.makedirs(output_directory, exist_ok=True)

# Function to filter based on ancestry
def filter_by_ancestry(ancestry):
    """Save the covariate rows whose FID is listed in one ancestry's .fam file.

    Reads ``FID_IID_PHENO_{ancestry}.fam`` from WORK_DIR as a headerless,
    space-separated table with columns FID, IID and PHENO1, then writes the
    matching rows of the module-level ``covar_df`` to ``output_directory``.
    """
    fam_columns = ['FID', 'IID', 'PHENO1']
    members = pd.read_csv(
        f'{WORK_DIR}/FID_IID_PHENO_{ancestry}.fam',
        sep=' ',
        header=None,
        names=fam_columns,
    )

    # Rows of the covariate table that belong to this ancestry
    in_ancestry = covar_df['FID'].isin(members['FID'])

    # Destination file for this ancestry
    output_file = f'{output_directory}/covars_alldata_with999forAGRandRACE_PCA_{APOE}_allvariants_mapped_{ancestry}.txt'

    covar_df[in_ancestry].to_csv(output_file, sep='\t', index=False)
    print(f"Filtered file saved for {ancestry}: {output_file}")

# Write one filtered covariate file per ancestry group
for ancestry_code in ancestries:
    filter_by_ancestry(ancestry_code)
