In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.manifold import TSNE

In [None]:
# CSV file read into Sample_ID on the next line.
file_path2 = 'hla_la_hibag_tag_compare.csv'
# Later cells read hibag.* and hla.la.* columns from this table through the HLA alias.
Sample_ID = pd.read_csv(file_path2)

In [None]:
# HLA is a second name bound to the same DataFrame object as Sample_ID; no copy is made here.
HLA = Sample_ID

# Calculate the concordance of HLA typing between the different methods

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import matplotlib

# Set the global font to Arial for all text in the figures
# NOTE(review): assumes an Arial font is available where this notebook runs — TODO confirm.
matplotlib.rcParams['font.family'] = 'Arial'

def calculate_concordance(df, gene):
    """Compute allele-level concordance between HIBAG and HLA-LA calls for one gene.

    Rows with a missing value in any of the four columns
    ``hibag.<gene>.1``, ``hibag.<gene>.2``, ``hla.la.<gene>.1`` and
    ``hla.la.<gene>.2`` are dropped. For every remaining row the two HIBAG
    alleles are paired against the two HLA-LA alleles, and each HLA-LA allele
    can be paired at most once. A homozygous HIBAG call (A/A) compared with a
    heterozygous HLA-LA call (A/B) therefore counts as one match, not two,
    and the result does not depend on which method is listed first.

    Args:
        df: DataFrame holding the four columns named above.
        gene: Gene name used to build the column names (e.g. ``'DQA1'``).

    Returns:
        Tuple ``(concordance_rate, n_samples, matched_alleles)`` where
        ``concordance_rate`` is ``matched_alleles / (2 * n_samples)`` and is
        ``0`` when no row survives the filtering.
    """
    hibag_cols = [f'hibag.{gene}.1', f'hibag.{gene}.2']
    hla_la_cols = [f'hla.la.{gene}.1', f'hla.la.{gene}.2']

    df_filtered = df.dropna(subset=hibag_cols + hla_la_cols)
    total_alleles = len(df_filtered) * 2  # Two alleles per sample
    matched_alleles = 0

    hibag_pairs = df_filtered[hibag_cols].values.tolist()
    hla_la_pairs = df_filtered[hla_la_cols].values.tolist()
    for hibag_alleles, hla_la_alleles in zip(hibag_pairs, hla_la_pairs):
        # Consume each HLA-LA allele once it has been paired, so a repeated
        # HIBAG allele cannot match the same HLA-LA allele twice.
        unpaired_hla_la = list(hla_la_alleles)
        for ha in hibag_alleles:
            if ha in unpaired_hla_la:
                unpaired_hla_la.remove(ha)
                matched_alleles += 1

    # Calculate concordance rate based on matched alleles considering each allele separately
    concordance_rate = matched_alleles / total_alleles if total_alleles > 0 else 0
    return concordance_rate, len(df_filtered), matched_alleles

# 'HLA' must already be loaded; concordance is computed once per gene
genes = ['A', 'B', 'C', 'DPB1', 'DRB1', 'DQA1', 'DQB1']
results = {}
for gene in genes:
    results[gene] = calculate_concordance(HLA, gene)

# 3x3 grid of panels, flattened so panels can be walked in order
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(18, 16))
axes = axes.flatten()

# One Venn diagram per gene, drawn on consecutive panels
for ax, gene in zip(axes, genes):
    rate, samples, matches = results[gene]
    # Alleles of each method that were not matched by the other method
    unmatched = samples*2 - matches
    venn2(
        subsets=(unmatched, unmatched, matches),
        set_labels=(f'HiBAG {gene}', f'HLA-LA {gene}'),
        ax=ax,
    )
    ax.text(
        0.5, 1.05, f'Total Matches: {matches}\nRate: {rate:.2%}',
        ha='center', va='bottom', fontsize=16, transform=ax.transAxes,
    )
    ax.set_title(f'{gene} Concordance')

# Panels left over after the last gene are hidden
for ax in axes[len(genes):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('hla_concordance_venn_diagrams.pdf')
plt.show()


# check HLA-DQA1 mismatch

In [None]:
import pandas as pd

# Assuming 'HLA' is your DataFrame and it has been properly loaded
# Keep only the rows where all four DQA1 calls (both methods, both alleles) are present.
# .copy() yields an independent frame, so the column added below does not raise a SettingWithCopyWarning.
alleles_filtered = HLA.dropna(subset=['hibag.DQA1.1', 'hibag.DQA1.2', 'hla.la.DQA1.1', 'hla.la.DQA1.2']).copy()

def count_matched_alleles(row):
    """Count how many DQA1 alleles agree between HIBAG and HLA-LA for one sample.

    The two HIBAG alleles are paired against the two HLA-LA alleles, and each
    HLA-LA allele can be paired only once. Unlike a set intersection this
    keeps homozygous calls intact: A/A vs A/A gives 2, A/A vs A/B gives 1.

    Args:
        row: Mapping (e.g. a DataFrame row) with the keys ``hibag.DQA1.1``,
            ``hibag.DQA1.2``, ``hla.la.DQA1.1`` and ``hla.la.DQA1.2``.

    Returns:
        Number of matched alleles: 0, 1 or 2.
    """
    # HLA-LA alleles that have not been paired with a HIBAG allele yet
    unpaired_hla_la = [row['hla.la.DQA1.1'], row['hla.la.DQA1.2']]

    match_count = 0
    for allele in (row['hibag.DQA1.1'], row['hibag.DQA1.2']):
        if allele in unpaired_hla_la:
            unpaired_hla_la.remove(allele)
            match_count += 1

    # Return the number of matches (0, 1, 2)
    return match_count

# Apply the function to each row (axis=1) and store the per-sample result in a new 'Match Count' column
alleles_filtered['Match Count'] = alleles_filtered.apply(count_matched_alleles, axis=1)

# Export the updated DataFrame to a CSV file; index=False leaves the row index out of the file
alleles_filtered.to_csv('alleles_matched_count.csv', index=False)
print("Data with matched counts has been exported to 'alleles_matched_count.csv'.")



# Check which mistakes HIBAG makes most often in samples where exactly one allele matches

In [None]:
import pandas as pd

# Assuming 'HLA' is your DataFrame and it has been properly loaded
# Rebuild the filtered frame from HLA: keep only rows where all four DQA1 calls are present.
# .copy() yields an independent frame, so the columns added below do not raise a SettingWithCopyWarning.
alleles_filtered = HLA.dropna(subset=['hibag.DQA1.1', 'hibag.DQA1.2', 'hla.la.DQA1.1', 'hla.la.DQA1.2']).copy()

def count_matched_alleles(row):
    """Count matching DQA1 alleles for one sample and describe a single mismatch.

    The two HIBAG alleles are paired against the two HLA-LA alleles, and each
    HLA-LA allele can be paired only once, so homozygous calls are not
    collapsed (A/A vs A/A gives 2 matches, A/A vs A/B gives 1).

    NOTE(review): this name is also defined in an earlier cell of the
    notebook with a different return type; this definition replaces it.

    Args:
        row: Mapping (e.g. a DataFrame row) with the keys ``hibag.DQA1.1``,
            ``hibag.DQA1.2``, ``hla.la.DQA1.1`` and ``hla.la.DQA1.2``.

    Returns:
        ``pd.Series([match_count, mismatch_info])``. ``mismatch_info`` is
        ``'<hibag allele> (Unmatched HLA-LA: <hla-la allele>)'`` when exactly
        one allele matches, and an empty string otherwise.
    """
    # HLA-LA alleles not yet paired, and HIBAG alleles that found no partner
    unpaired_hla_la = [row['hla.la.DQA1.1'], row['hla.la.DQA1.2']]
    unpaired_hibag = []

    match_count = 0
    for allele in (row['hibag.DQA1.1'], row['hibag.DQA1.2']):
        if allele in unpaired_hla_la:
            unpaired_hla_la.remove(allele)
            match_count += 1
        else:
            unpaired_hibag.append(allele)

    # Prepare data for mismatches when match_count is 1: exactly one allele
    # is left over on each side, and that pair is the disagreement to report.
    mismatches_info = []
    if match_count == 1:
        hla_la_text = ', '.join(unpaired_hla_la)
        for allele in unpaired_hibag:
            mismatches_info.append(f"{allele} (Unmatched HLA-LA: {hla_la_text})")

    # Return the number of matches and the mismatched alleles info
    return pd.Series([match_count, '; '.join(mismatches_info)])

# Run the per-row comparison and store its two outputs as new columns
alleles_filtered[['Match Count', 'Mismatch Info']] = alleles_filtered.apply(count_matched_alleles, axis=1)

# Mismatch descriptions of the samples with exactly one matching allele
mismatches_info = alleles_filtered.loc[alleles_filtered['Match Count'] == 1, 'Mismatch Info']

if mismatches_info.empty:
    # Nothing to tabulate when no sample has exactly one match
    print("No mismatch data found for the case where Match Count is 1.")
else:
    # Tabulate how often each specific mismatch occurs and write it out
    mismatch_counts = mismatches_info.value_counts()
    mismatch_counts.to_csv('mismatch_frequencies_detailed.csv')
    print("Mismatch frequency data with detailed unmatched HLA-LA alleles has been exported to 'mismatch_frequencies_detailed.csv'.")


# Test whether any frequent allele combination is missed in the HIBAG X/X samples

In [None]:
import pandas as pd

# HLA is expected to be loaded already, for example:
# HLA = pd.read_csv('your_data.csv')

# A sample qualifies when either HIBAG DQA1 call is 5:01:00 ...
carries_dqa1 = HLA[['hibag.DQA1.1', 'hibag.DQA1.2']].eq('5:01:00').any(axis=1)
# ... and either HIBAG DQB1 call is 3:01:00
carries_dqb1 = HLA[['hibag.DQB1.1', 'hibag.DQB1.2']].eq('3:01:00').any(axis=1)
specific_combination = HLA[carries_dqa1 & carries_dqb1]

# Report how many samples carry the combination
combination_count = len(specific_combination)
print(f"The number of samples with HLA-DQA1*05:01 and HLA-DQB1*03:01 is: {combination_count}")

# Write the matching samples to disk
specific_combination.to_csv('specific_allele_combination_samples.csv', index=False)
print("Filtered data has been exported to 'specific_allele_combination_samples.csv'.")


In [None]:
import pandas as pd

# HLA is expected to be loaded already, for example:
# HLA = pd.read_csv('your_data.csv')

# Alleles of interest for each gene
dqa1_alleles = ['5:01:00', '5:05:00']
dqb1_alleles = ['2:01:00', '2:02:00']

# A sample qualifies when at least one HIBAG call per gene is in the corresponding list
carries_dqa1 = HLA[['hibag.DQA1.1', 'hibag.DQA1.2']].isin(dqa1_alleles).any(axis=1)
carries_dqb1 = HLA[['hibag.DQB1.1', 'hibag.DQB1.2']].isin(dqb1_alleles).any(axis=1)
specific_combination = HLA[carries_dqa1 & carries_dqb1]

# Report how many samples satisfy both conditions
combination_count = len(specific_combination)
print(f"The number of samples with specified HLA-DQA1 and HLA-DQB1 alleles is: {combination_count}")


In [None]:
import pandas as pd

# HLA is expected to be loaded already, for example:
# HLA = pd.read_csv('your_data.csv')

# Alleles of interest for each gene
dqa1_alleles = ['5:01:00', '5:05:00']
dqb1_alleles = ['3:01:00', '2:02:00','2:01:00']

# Samples whose HIBAG genotype is recorded as 'X/X'
is_xx = (HLA['hibag.genotype'] == 'X/X') & HLA['hibag.genotype'].notna()
# At least one HIBAG call per gene must be in the corresponding list
carries_dqa1 = HLA[['hibag.DQA1.1', 'hibag.DQA1.2']].isin(dqa1_alleles).any(axis=1)
carries_dqb1 = HLA[['hibag.DQB1.1', 'hibag.DQB1.2']].isin(dqb1_alleles).any(axis=1)
specific_combination = HLA[is_xx & carries_dqa1 & carries_dqb1]

# Report how many samples satisfy all three conditions
combination_count = len(specific_combination)
print(f"The number of XX genotype samples with specified HLA-DQA1 and HLA-DQB1 alleles is: {combination_count}")


In [None]:
# Rows whose HIBAG genotype equals 'X/X'. The notna() test adds nothing beyond the equality test
# (a missing value never equals 'X/X') but is harmless.
hibag_xx = HLA[(HLA['hibag.genotype'] == 'X/X') & HLA['hibag.genotype'].notna()]


# This is the one to make all the HLA calls

In [None]:
import pandas as pd

# Assuming your DataFrame HLA is already loaded
# For example: HLA = pd.read_csv('your_data.csv')

def _carries_any(row, method, gene, alleles):
    """Return True if either call of `gene` made by `method` is one of `alleles`."""
    calls = [row[f'{method}.{gene}.1'], row[f'{method}.{gene}.2']]
    return any(allele in calls for allele in alleles)


def check_combinations(row):
    """Add DQ2.5 / DQ2.2 / DQ7.5 / DQ8.1 calls and a combined genotype to one sample.

    Rules (allele position 1 or 2 never matters):
      * DQ2.5: DQA1 5:01:00 with DQB1 2:01:00, in HIBAG or in HLA-LA.
      * DQ2.2: HIBAG DQB1 2:02:00 with HIBAG DQA1 3:01:00 or 2:01:00.
      * DQ7.5: HIBAG DQA1 5:01:00 or 5:05:00 with HIBAG DQB1 3:01:00.
      * DQ8.1: HIBAG DQA1 3:01:00 or 3:03:00 with HIBAG DQB1 3:02:00, or
        HLA-LA DQA1 3:01:00 with HLA-LA DQB1 3:02:00.

    The HLA-LA part of the DQ8.1 rule previously looked only at
    ``hla.la.DQA1.2`` and ``hla.la.DQB1.1``; it now checks both allele
    positions like every other rule.

    Args:
        row: One DataFrame row (a Series) holding the ``hibag.*`` and
            ``hla.la.*`` DQA1/DQB1 allele columns.

    Returns:
        A copy of ``row`` with the extra entries ``'DQ2.5'``, ``'DQ2.2'``,
        ``'DQ7.5'``, ``'DQ8.1'`` (each ``'positive'`` or ``'negative'``) and
        ``'genotype'`` (positive labels joined by ``'/'``, or ``'X'`` when
        none is positive). The input row is left untouched.
    """
    # Work on a copy: pandas advises against mutating the object that
    # DataFrame.apply passes to a user-defined function.
    row = row.copy()

    dq25 = (
        (_carries_any(row, 'hibag', 'DQA1', ['5:01:00']) and
         _carries_any(row, 'hibag', 'DQB1', ['2:01:00'])) or
        (_carries_any(row, 'hla.la', 'DQA1', ['5:01:00']) and
         _carries_any(row, 'hla.la', 'DQB1', ['2:01:00']))
    )
    dq22 = (
        _carries_any(row, 'hibag', 'DQB1', ['2:02:00']) and
        _carries_any(row, 'hibag', 'DQA1', ['3:01:00', '2:01:00'])
    )
    dq75 = (
        _carries_any(row, 'hibag', 'DQA1', ['5:01:00', '5:05:00']) and
        _carries_any(row, 'hibag', 'DQB1', ['3:01:00'])
    )
    dq81 = (
        (_carries_any(row, 'hibag', 'DQA1', ['3:01:00', '3:03:00']) and
         _carries_any(row, 'hibag', 'DQB1', ['3:02:00'])) or
        (_carries_any(row, 'hla.la', 'DQA1', ['3:01:00']) and
         _carries_any(row, 'hla.la', 'DQB1', ['3:02:00']))
    )

    # (output column, label used in the combined genotype, call result)
    calls = [
        ('DQ2.5', '2.5', dq25),
        ('DQ2.2', '2.2', dq22),
        ('DQ7.5', '7.5', dq75),
        ('DQ8.1', '8.1', dq81),
    ]
    for column, _, is_positive in calls:
        row[column] = 'positive' if is_positive else 'negative'

    # Combining genotype risks into a single column
    genotypes = [label for _, label, is_positive in calls if is_positive]
    row['genotype'] = '/'.join(genotypes) if genotypes else 'X'

    return row

# Run the DQ calling on every row; the result replaces HLA
HLA = HLA.apply(check_combinations, axis=1)

# Columns written to the export, grouped by origin (order is preserved in the output)
sample_columns = ['person_id', 'CeD', 'race', 'ethnicity', 'sex_at_birth', 'age', 'tag.genotype']
hibag_columns = [
    'hibag.DQA1.1', 'hibag.DQA1.2', 'hibag.DQB1.1', 'hibag.DQB1.2',
    'hibag.DRB1.1', 'hibag.DRB1.2', 'hibag.genotype',
]
hla_la_columns = [
    'hla.la.DQA1.1', 'hla.la.DQA1.2', 'hla.la.DQB1.1', 'hla.la.DQB1.2',
    'hla.la.DRB1.1', 'hla.la.DRB1.2', 'hla.la.genotype',
]
call_columns = ['DQ2.5', 'DQ2.2', 'DQ7.5', 'DQ8.1', 'genotype']
columns_to_keep = sample_columns + hibag_columns + hla_la_columns + call_columns

# Restrict the frame to those columns
final_data = HLA.loc[:, columns_to_keep]

# Write the selection to disk without the row index
final_data.to_csv('final_filtered_data.csv', index=False)
print("Final filtered data has been exported to 'final_filtered_data.csv'.")

# Analyze HLA data for the manuscript

In [None]:
# Reuses the name file_path2, now pointing at the file exported by the DQ-calling cell.
file_path2 = 'final_filtered_data.csv'
# HLA is rebound to the re-read export, so it only holds the columns that were written to that file.
HLA = pd.read_csv(file_path2)

In [None]:
# Samples whose HIBAG genotype is 'DQ2.5/DQ2.5' get XF_genotype '2.5/2.5'.
# XF_genotype is not among the columns exported to final_filtered_data.csv, so presumably this
# assignment creates the column and leaves every other row missing — verify.
HLA.loc[HLA['hibag.genotype'] == 'DQ2.5/DQ2.5', 'XF_genotype'] = '2.5/2.5'

In [None]:
# Samples whose HIBAG genotype is 'DQ7.5/DQ7.5' get XF_genotype '7.5/7.5'.
HLA.loc[HLA['hibag.genotype'] == 'DQ7.5/DQ7.5', 'XF_genotype'] = '7.5/7.5'

In [None]:
# Spot-check: show both genotype columns for the 'DQ7.5/DQ7.5' rows.
print(HLA[HLA['hibag.genotype'] == 'DQ7.5/DQ7.5'][['hibag.genotype', 'XF_genotype']])

In [None]:
# Samples whose HIBAG genotype is 'DQ2.2/DQ2.2' get XF_genotype '2.2/2.2'.
HLA.loc[HLA['hibag.genotype'] == 'DQ2.2/DQ2.2', 'XF_genotype'] = '2.2/2.2'

In [None]:
# Spot-check: show both genotype columns for the 'DQ2.2/DQ2.2' rows.
print(HLA[HLA['hibag.genotype'] == 'DQ2.2/DQ2.2'][['hibag.genotype', 'XF_genotype']])

In [None]:
# Samples whose HIBAG genotype is 'DQ8/DQ8' get XF_genotype '8.1/8.1' (note the label changes from 'DQ8' to '8.1').
HLA.loc[HLA['hibag.genotype'] == 'DQ8/DQ8', 'XF_genotype'] = '8.1/8.1'
# Spot-check: show both genotype columns for those rows.
print(HLA[HLA['hibag.genotype'] == 'DQ8/DQ8'][['hibag.genotype', 'XF_genotype']])

In [None]:
# Translation table applied to XF_genotype in the next cell.
# The keys have the same form as the 'genotype' labels built by check_combinations
# ('X', or positive labels joined by '/'); the values use a two-part 'a/b' form.
# NOTE(review): '2.5/2.2/7.5' maps to '2.2/7.5', dropping the 2.5 label — confirm this is intended.
genotype_mapping = {
    'X': 'X/X',
    '8.1': '8.1/X',
    '2.5': '2.5/X',
    '7.5': '7.5/X',
    '2.2': '2.2/X',
    '2.5/2.2/7.5': '2.2/7.5'
}

In [None]:
# Apply the mapping to the 'XF_genotype' column.
# Up to here XF_genotype only holds the homozygous labels assigned in the cells above
# (every other row is missing), while the mapping keys are values of the 'genotype'
# column. Rows without a homozygous label are therefore filled from 'genotype' first;
# otherwise replace() would find nothing to translate.
HLA['XF_genotype'] = HLA['XF_genotype'].fillna(HLA['genotype']).replace(genotype_mapping)

# Optionally, verify the updates or check some of the transformed entries
print(HLA[['hibag.genotype', 'XF_genotype']].head())

In [None]:
#ancestry
# NOTE(review): `bucket` is not defined in any cell shown in this notebook; it has to exist
# in the kernel before this cell runs — confirm where it is set.
ancestry = pd.read_csv(f'{bucket}/data/ancestry_preds.tsv', sep='\t')
ancestry = ancestry.rename(columns={'research_id': 'person_id'})
# Inner join: samples without an ancestry prediction are dropped from HLA
HLA = pd.merge(HLA, ancestry[['person_id', 'ancestry_pred']], on='person_id', how='inner')

# Principal components: keep only the PC* columns plus the join key
demo = pd.read_csv('ced_matched_data_v2.csv')
cols_pc = [col for col in demo.columns if col.startswith('PC')]
cols_pc.append('person_id')  # append the key itself, not a nested list
# Merge just the selected columns so the rest of demo is not carried into HLA
HLA = pd.merge(HLA, demo[cols_pc], on='person_id')

In [None]:
# Save the changes back to a CSV file if needed; index=False leaves the row index out of the file
HLA.to_csv('updated_HLA.csv', index=False)
print("Updated data has been exported to 'updated_HLA.csv'.")