# This code maps the inferred ancestries to the CLSA labels.

In [1]:
import pandas as pd
import numpy as np
import csv

In [2]:
# Read the baseline CSV export into `df`. low_memory=False makes pandas parse
# the file in one pass rather than in chunks, so each column gets a single
# inferred dtype.
df = pd.read_csv(
    "/lustre06/project/6060121/CLSA_PheWeb_shared/Original/23ME002_UdeM_SGTaliun_Baseline/23ME002_UdeM_SGTaliun_Baseline_CoPv7_Qx_PA_BS.csv",
    low_memory=False,
)

# mapping to Table 3 (page 20 of the CLSA pdf)

In [3]:
import pandas as pd


# Indicator columns grouped by the category label they feed into. Several
# columns can share one label (e.g. the ZH / JA / KO columns all count as
# "East Asian").
_columns_by_category = {
    "Arab": ("SDC_CULT_AR_COM", "SDC_CULT_WA_COM"),
    "Black": ("SDC_CULT_BL_COM",),
    "East Asian": ("SDC_CULT_ZH_COM", "SDC_CULT_JA_COM", "SDC_CULT_KO_COM"),
    "Latino": ("SDC_CULT_LA_COM",),
    "Other": ("SDC_CULT_DK_NA_COM", "SDC_CULT_OT_COM", "SDC_CULT_REFUSED_COM"),
    "South Asian": ("SDC_CULT_SA_COM",),
    "South-East Asian": ("SDC_CULT_FP_COM", "SDC_CULT_SE_COM"),
    "White": ("SDC_CULT_WH_COM",),
}

# Flat lookup used by the labelling code: column name -> category label.
column_to_category = {
    column: category
    for category, columns in _columns_by_category.items()
    for column in columns
}

# Discard rows with a missing ADM_GWAS3_COM: that value is used as the
# dictionary key further down, so it must not be NaN.
df = df[df['ADM_GWAS3_COM'].notna()]

# Build the combined category label for a single row.
def get_category_labels(row):
    """Return the category label for one row.

    Every column listed in `column_to_category` is looked up in `row`; a
    column contributes its category when its value equals 1. Distinct
    categories are sorted alphabetically and joined with " and ". When no
    column equals 1 the string "None" is returned.
    """
    selected = {
        label
        for column, label in column_to_category.items()
        if row[column] == 1
    }
    if not selected:
        return "None"
    return " and ".join(sorted(selected))

# ADM_GWAS3_COM value -> combined category label, one entry per row of df.
# If the same ADM_GWAS3_COM value occurs on several rows, the last row wins.
adm_gwas3_to_category = {
    record["ADM_GWAS3_COM"]: get_category_labels(record)
    for _row_index, record in df.iterrows()
}


In [5]:
from collections import Counter

# Each ID already carries one combined label (multiple categories are joined
# with " and "), so the labels can be tallied as they are, without splitting.
category_counts = Counter(adm_gwas3_to_category.values())

# One row per distinct label, in the order the labels were first seen.
category_counts_df = pd.DataFrame(
    list(category_counts.items()),
    columns=["Category", "Count"],
)

# Persist the table, then echo it in the notebook output.
category_counts_df.to_csv('/home/mikekaz/scratch/category_counts.csv', index=False)
print(category_counts_df)


                                     Category  Count
0                                       White  25180
1                                        Arab    105
2                                  East Asian    220
3                             Other and White    252
4                                       Other    167
5             East Asian and South-East Asian      6
6                                       Black    170
7                                      Latino     85
8                             Black and White     33
9                            South-East Asian     83
10                                South Asian    223
11           South Asian and South-East Asian      6
12                             Arab and White     14
13                      South Asian and White     11
14                       East Asian and White     14
15                           Latino and White     15
16                            Black and Other      3
17                  Black and Other and White 

In [22]:
import json

# Output file for the ADM_GWAS3_COM -> category label dictionary
# (path is relative to the notebook's working directory).
file_path = 'adm_gwas3_to_category.json'

# Serialise the dictionary as JSON.
with open(file_path, 'w') as out_handle:
    json.dump(adm_gwas3_to_category, out_handle)

print(f"File saved as {file_path}")


File saved as adm_gwas3_to_category.json


# mapping to Table 4 (page 22 of the CLSA pdf)

In [23]:
# Translation from the combined category labels to the final labels
# (Table 4 mapping referenced in this notebook).
# NOTE(review): the later copy of this mapping in this notebook additionally
# maps "Arab and White" to "White and Asian" — confirm which version is intended.
label_mapping = {
    # Single-category labels (some are renamed).
    "White": "White",
    "South Asian": "South Asian",
    "East Asian": "East Asian",
    "Black": "Black",
    "Arab": "West Asian",
    "Other": "Other",
    "Latino": "Latin American",
    "South-East Asian": "Southeast Asian",
    # Two-category combinations that have a dedicated final label.
    **dict.fromkeys(
        (
            "South Asian and White",
            "East Asian and White",
            "South-East Asian and White",
        ),
        "White and Asian",
    ),
    "Black and White": "White and Black",
}


def map_to_final_label(old_label):
    """Return the final label for `old_label`.

    Labels with no entry in `label_mapping` are reported as "Mixed".
    NOTE(review): that also covers the "None" placeholder produced when no
    category column is set — confirm that "Mixed" is intended for it.
    """
    if old_label in label_mapping:
        return label_mapping[old_label]
    return "Mixed"

# Same keys as adm_gwas3_to_category, with every combined category label
# translated to its final label by map_to_final_label.
final_labels_dict = {
    gwas_id: map_to_final_label(category_label)
    for gwas_id, category_label in adm_gwas3_to_category.items()
}


In [25]:
from collections import Counter
import pandas as pd

# Tally how many IDs ended up with each final label.
final_label_counts = Counter(final_labels_dict.values())

# Build the table, order it from most to least frequent, and renumber the
# rows 0..n-1 so the printed index matches the rank.
final_label_counts_df = (
    pd.DataFrame(list(final_label_counts.items()), columns=['Final Label', 'Count'])
    .sort_values('Count', ascending=False)
    .reset_index(drop=True)
)

# Echo the table in the notebook output.
print(final_label_counts_df)


        Final Label  Count
0             White  25180
1             Mixed    329
2       South Asian    223
3        East Asian    220
4             Black    170
5             Other    167
6        West Asian    105
7    Latin American     85
8   Southeast Asian     83
9   White and Black     33
10  White and Asian     27


In [26]:
import json

# Output file for the per-ID final labels.
# NOTE(review): despite "counts" in the file name, the object written below is
# final_labels_dict (ID -> final label), not the count table.
file_path = 'final_label_counts.json'

# Serialise the dictionary as JSON.
with open(file_path, 'w') as out_handle:
    json.dump(final_labels_dict, out_handle)

print(f"File saved as {file_path}")


File saved as final_label_counts.json


# Exclude Z-score outliers (range: [-5,5]) and repeat the Table 3 and Table 4 mappings

In [1]:
import pandas as pd

# Load the baseline CSV export. low_memory=False makes pandas parse the file
# in one pass rather than in chunks, so each column gets a single inferred dtype.
df = pd.read_csv("/lustre06/project/6060121/CLSA_PheWeb_shared/Original/23ME002_UdeM_SGTaliun_Baseline/23ME002_UdeM_SGTaliun_Baseline_CoPv7_Qx_PA_BS.csv", low_memory=False)

# Read the outlier IDs, one per line, converting them to float so they compare
# against the ADM_GWAS3_COM column (assumed to be float — TODO confirm).
# Blank lines (e.g. an extra empty line at the end of the file) are skipped,
# because float("") would raise ValueError.
with open("/home/mikekaz/projects/rrg-vmooser/mikekaz/CLSA/Ancestry/FINAL/HGDP_1KG_ancestry_inference/output/Z_score_outliers_IDs.txt", "r") as file:
    outlier_ids = [float(entry) for entry in map(str.strip, file) if entry]

# Drop the outliers: keep only rows whose ADM_GWAS3_COM is NOT in the list.
df_filtered = df[~df["ADM_GWAS3_COM"].isin(outlier_ids)]



In [2]:
import pandas as pd

# Indicator columns grouped by the category label they feed into. Several
# columns can share one label (e.g. the ZH / JA / KO columns all count as
# "East Asian").
_columns_by_category = {
    "Arab": ("SDC_CULT_AR_COM", "SDC_CULT_WA_COM"),
    "Black": ("SDC_CULT_BL_COM",),
    "East Asian": ("SDC_CULT_ZH_COM", "SDC_CULT_JA_COM", "SDC_CULT_KO_COM"),
    "Latino": ("SDC_CULT_LA_COM",),
    "Other": ("SDC_CULT_DK_NA_COM", "SDC_CULT_OT_COM", "SDC_CULT_REFUSED_COM"),
    "South Asian": ("SDC_CULT_SA_COM",),
    "South-East Asian": ("SDC_CULT_FP_COM", "SDC_CULT_SE_COM"),
    "White": ("SDC_CULT_WH_COM",),
}

# Flat lookup used by the labelling code: column name -> category label.
column_to_category = {
    column: category
    for category, columns in _columns_by_category.items()
    for column in columns
}

# Discard rows with a missing ADM_GWAS3_COM: that value is used as the
# dictionary key further down, so it must not be NaN.
df_filtered = df_filtered[df_filtered['ADM_GWAS3_COM'].notna()]

# Build the combined category label for a single row.
def get_category_labels(row):
    """Return the category label for one row.

    Every column listed in `column_to_category` is looked up in `row`; a
    column contributes its category when its value equals 1. Distinct
    categories are sorted alphabetically and joined with " and ". When no
    column equals 1 the string "None" is returned.
    """
    selected = {
        label
        for column, label in column_to_category.items()
        if row[column] == 1
    }
    if not selected:
        return "None"
    return " and ".join(sorted(selected))

# ADM_GWAS3_COM value -> combined category label, one entry per row of
# df_filtered. If the same ADM_GWAS3_COM value occurs on several rows, the
# last row wins.
adm_gwas3_to_category = {
    record["ADM_GWAS3_COM"]: get_category_labels(record)
    for _row_index, record in df_filtered.iterrows()
}


In [3]:
from collections import Counter

# Each ID already carries one combined label (multiple categories are joined
# with " and "), so the labels can be tallied as they are, without splitting.
category_counts = Counter(adm_gwas3_to_category.values())

# One row per distinct label, in the order the labels were first seen.
category_counts_df = pd.DataFrame(
    list(category_counts.items()),
    columns=["Category", "Count"],
)

# Echo the table in the notebook output.
print(category_counts_df)


                                     Category  Count
0                                       White  25020
1                                        Arab    102
2                                  East Asian    217
3                             Other and White    250
4                                       Other    162
5             East Asian and South-East Asian      6
6                                       Black    170
7                                      Latino     84
8                             Black and White     33
9                            South-East Asian     83
10                                South Asian    223
11           South Asian and South-East Asian      6
12                             Arab and White     14
13                      South Asian and White     11
14                           Latino and White     15
15                            Black and Other      3
16                  Black and Other and White      6
17             East Asian and Other and White 

In [6]:
# Translation from the combined category labels to the final labels
# (Table 4 mapping referenced in this notebook).
# NOTE(review): the earlier copy of this mapping in this notebook has no
# "Arab and White" entry — confirm which version is intended.
label_mapping = {
    # Single-category labels (some are renamed).
    "White": "White",
    "South Asian": "South Asian",
    "East Asian": "East Asian",
    "Black": "Black",
    "Arab": "West Asian",
    "Other": "Other",
    "Latino": "Latin American",
    "South-East Asian": "Southeast Asian",
    # Two-category combinations that have a dedicated final label.
    **dict.fromkeys(
        (
            "South Asian and White",
            "East Asian and White",
            "South-East Asian and White",
            "Arab and White",
        ),
        "White and Asian",
    ),
    "Black and White": "White and Black",
}


def map_to_final_label(old_label):
    """Return the final label for `old_label`.

    Labels with no entry in `label_mapping` are reported as "Mixed".
    NOTE(review): that also covers the "None" placeholder produced when no
    category column is set — confirm that "Mixed" is intended for it.
    """
    if old_label in label_mapping:
        return label_mapping[old_label]
    return "Mixed"

# Same keys as adm_gwas3_to_category, with every combined category label
# translated to its final label by map_to_final_label.
final_labels_dict = {
    gwas_id: map_to_final_label(category_label)
    for gwas_id, category_label in adm_gwas3_to_category.items()
}


In [7]:
from collections import Counter
import pandas as pd

# Tally how many IDs ended up with each final label.
final_label_counts = Counter(final_labels_dict.values())

# Build the table, order it from most to least frequent, and renumber the
# rows 0..n-1 so the printed index matches the rank.
final_label_counts_df = (
    pd.DataFrame(list(final_label_counts.items()), columns=['Final Label', 'Count'])
    .sort_values('Count', ascending=False)
    .reset_index(drop=True)
)

# Echo the table in the notebook output.
print(final_label_counts_df)


        Final Label  Count
0             White  25020
1             Mixed    311
2       South Asian    223
3        East Asian    217
4             Black    170
5             Other    162
6        West Asian    102
7    Latin American     84
8   Southeast Asian     83
9   White and Black     33
10  White and Asian     31
