# Data Preparation

## Combine in-house Recent African and ALFA data for comparison purposes

Only variants with rsIDs are included, because ALFA data is only available for variants that have an rsID.

Change working directory

In [22]:
import os

# Run the rest of the notebook from the project's Analysis folder.
# NOTE(review): this is an absolute path on one specific Windows machine; the
# notebook will not run elsewhere until it is pointed at the local checkout.
analysis_dir = r"C:\Users\User\Desktop\Megan\MSC2\Results\5._Posthoc_analysis\Pipeline_GnomAD_SAHGP_14032023\Genomic_data_analysis\Analysis"
os.chdir(analysis_dir)

Import modules and packages

In [23]:
import sys

# Make the project root importable so that the local `Utils` package resolves.
# NOTE(review): this is an absolute path on one specific Windows machine.
genomic_analysis_root = r"C:\Users\User\Desktop\Megan\MSC2\Results\5._Posthoc_analysis\Pipeline_GnomAD_SAHGP_14032023\Genomic_data_analysis"

# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if genomic_analysis_root not in sys.path:
    sys.path.append(genomic_analysis_root)
import pandas as pd
import numpy as np
import Utils.constants as constants
import Utils.functions as functions

Import the pre-formatted in-house allele count data

In [24]:
# Load the processed in-house allele counts. The file carries a leading
# "Unnamed: 0" column, which is discarded here.
ih_grouped_data = pd.read_csv(
    os.path.join(
        constants.HOME_PATH, "Data", "Processed", "IH_allele_counts_fishers.csv"
    )
).drop("Unnamed: 0", axis="columns")

In [25]:
# Preview the first five in-house rows (head() defaults to n=5).
ih_grouped_data.head()

Unnamed: 0,UNIQUE_VARIANT_NAME,ID,REF,ALT,GENE,POS,ALT_CT_IH_ACB,ALT_CT_IH_ASW,ALT_CT_IH_African,ALT_CT_IH_CA,...,REF_CT_IH_SA,REF_CT_IH_WA,CORR_REF_CT_IH_ACB,CORR_REF_CT_IH_ASW,CORR_REF_CT_IH_African,CORR_REF_CT_IH_CA,CORR_REF_CT_IH_EA,CORR_REF_CT_IH_Recent African,CORR_REF_CT_IH_SA,CORR_REF_CT_IH_WA
0,110148882_CT_C,chr13:110148882C-CT,C,CT,COL4A1,110148882,0,0,0,0,...,28,1282,228,142,2026,80,218,1656,76,1282
1,110148891_G_C,rs552586867,C,G,COL4A1,110148891,0,0,2,0,...,28,1280,228,142,2024,80,218,1654,76,1280
2,110148917_G_C,rs59409892,C,G,COL4A1,110148917,26,14,193,5,...,31,1153,202,128,1833,75,202,1503,73,1153
3,110148920_C_G,rs535182970,G,C,COL4A1,110148920,1,0,1,0,...,28,1282,227,142,2025,80,218,1656,76,1282
4,110148959_G_A,rs56406633,A,G,COL4A1,110148959,1,0,1,0,...,28,1282,227,142,2025,80,218,1656,76,1282


Import ALFA data

In [26]:
# Load the processed ALFA allele counts. The file carries a leading
# "Unnamed: 0" column, which is discarded here.
alfa_grouped_data = pd.read_csv(
    os.path.join(
        constants.HOME_PATH, "Data", "Processed", "ALFA_allele_counts_b.csv"
    )
).drop("Unnamed: 0", axis="columns")

In [27]:
# Preview the first five ALFA rows (head() defaults to n=5).
alfa_grouped_data.head()

Unnamed: 0,variant_id,reference_allele,alternate_allele,ALT_CT_ALFA_African,ALT_CT_ALFA_African American,ALT_CT_ALFA_African Others,ALT_CT_ALFA_Asian,ALT_CT_ALFA_East Asian,ALT_CT_ALFA_European,ALT_CT_ALFA_Latin American 1,...,REF_CT_ALFA_African Others,REF_CT_ALFA_Asian,REF_CT_ALFA_East Asian,REF_CT_ALFA_European,REF_CT_ALFA_Latin American 1,REF_CT_ALFA_Latin American 2,REF_CT_ALFA_Other,REF_CT_ALFA_Other Asian,REF_CT_ALFA_South Asian,REF_CT_ALFA_Total
0,rs1000343,C,T,695.0,671.0,24.0,0.0,0.0,49.0,5.0,...,296.0,622.0,490.0,109377.0,673.0,2200.0,5457.0,132.0,184.0,127076.0
1,rs1000989,T,C,1435.0,1388.0,47.0,80.0,55.0,21489.0,123.0,...,135.0,158.0,109.0,37269.0,273.0,2052.0,1371.0,49.0,3283.0,48171.0
2,rs1000990,T,C,798.0,767.0,31.0,42.0,32.0,5355.0,40.0,...,83.0,70.0,54.0,8931.0,106.0,349.0,459.0,16.0,62.0,12125.0
3,rs1005573,C,T,3120.0,3009.0,111.0,63.0,35.0,10693.0,209.0,...,15.0,101.0,69.0,4955.0,87.0,956.0,474.0,32.0,31.0,7130.0
4,rs1007311,A,G,1970.0,1903.0,67.0,88.0,56.0,9154.0,61.0,...,55.0,80.0,56.0,11242.0,85.0,456.0,1538.0,24.0,55.0,15040.0


Merge in-house and ALFA data.

In [28]:
# Extract in-house variants with rsIDs. ALFA data is only available for variants with rsIDs.
# ALFA data is only available for variants with rsIDs, so keep only the
# in-house variants whose ID is an rsID. The match is anchored at the start of
# the ID (a substring test would also accept any ID that merely contains "rs"),
# and rows with a missing ID count as non-matches instead of putting NaN into
# the boolean mask.
ih_grouped_data_rsids = ih_grouped_data[
    ih_grouped_data["ID"].str.startswith("rs", na=False)
]

# Left-join the ALFA counts onto the in-house variants. A row only matches when
# the rsID and both alleles agree; the ALFA key columns duplicate ID/REF/ALT
# and are dropped after the join.
ih_alfa_data = ih_grouped_data_rsids.merge(
    alfa_grouped_data,
    how="left",
    left_on=["ID", "REF", "ALT"],
    right_on=["variant_id", "reference_allele", "alternate_allele"],
).drop(columns=["variant_id", "reference_allele", "alternate_allele"])

# In-house variants with no ALFA match come out of the left join as NaN; those
# are recorded as 0. fillna avoids relying on the legacy `np.NAN` alias.
# NOTE(review): this makes "absent from ALFA" indistinguishable from a true
# zero count - confirm that downstream analysis expects this.
ih_alfa_data = ih_alfa_data.fillna(0)

ih_alfa_data.head(5)

Unnamed: 0,UNIQUE_VARIANT_NAME,ID,REF,ALT,GENE,POS,ALT_CT_IH_ACB,ALT_CT_IH_ASW,ALT_CT_IH_African,ALT_CT_IH_CA,...,REF_CT_ALFA_African Others,REF_CT_ALFA_Asian,REF_CT_ALFA_East Asian,REF_CT_ALFA_European,REF_CT_ALFA_Latin American 1,REF_CT_ALFA_Latin American 2,REF_CT_ALFA_Other,REF_CT_ALFA_Other Asian,REF_CT_ALFA_South Asian,REF_CT_ALFA_Total
0,110148891_G_C,rs552586867,C,G,COL4A1,110148891,0,0,2,0,...,114.0,112.0,86.0,9690.0,146.0,610.0,496.0,26.0,98.0,14050.0
1,110148917_G_C,rs59409892,C,G,COL4A1,110148917,26,14,193,5,...,105.0,112.0,86.0,9824.0,145.0,605.0,674.0,26.0,98.0,14153.0
2,110148920_C_G,rs535182970,G,C,COL4A1,110148920,1,0,1,0,...,114.0,112.0,86.0,9690.0,146.0,610.0,496.0,26.0,98.0,14050.0
3,110148959_G_A,rs56406633,A,G,COL4A1,110148959,1,0,1,0,...,114.0,108.0,83.0,13842.0,141.0,588.0,671.0,25.0,95.0,18377.0
4,110148971_C_G,rs568536001,G,C,COL4A1,110148971,1,0,1,0,...,114.0,112.0,86.0,9690.0,146.0,610.0,496.0,26.0,98.0,14050.0


Save data to CSV file

In [29]:
# Save the merged in-house/ALFA table with a fresh 0..n-1 index. The index is
# still written as the first, unnamed column (the to_csv default), so the
# on-disk layout is unchanged.
ih_alfa_data.reset_index(drop=True).to_csv(
    path_or_buf=os.path.join(
        constants.HOME_PATH, "Data", "Processed", "IH_ALFA_allele_counts.csv"
    )
)