In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.manifold import TSNE

In [None]:
# Load the HLA table from a CSV in the working directory. `HLA` is the shared
# DataFrame that the following cells read (they use its 'CeD', 'DQ_geno_4'
# and 'diagnosis domain' columns).
file_path2 = '20240612_HLA2.csv'
HLA = pd.read_csv(file_path2)

## Chi-square test for updated HLA types

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Reload the table so this cell is idempotent: the 'CeD' recoding below is
# applied to a fresh copy on every run.
file_path2 = '20240612_HLA2.csv'
HLA = pd.read_csv(file_path2)

# Recode 'CeD' to 0/1: the coded value 2 becomes 0 and 1 stays 1. Any other
# value becomes NaN and is therefore left out of the crosstab below.
# NOTE(review): a later cell labels CeD 1 as 'Control' and 2 as 'Case'; under
# that coding column 0 holds the cases and column 1 the controls -- confirm.
HLA['CeD'] = HLA['CeD'].map({2: 0, 1: 1})

# Genotype x CeD frequency table.
contingency_table = pd.crosstab(HLA['DQ_geno_4'], HLA['CeD'])

# Share of each genotype within each CeD group (column-wise percentages).
percentage_table = contingency_table.div(contingency_table.sum(axis=0), axis=1) * 100

# Human-readable table with cells formatted as "count (percentage%)".
formatted_table = contingency_table.astype(str) + " (" + percentage_table.round(2).astype(str) + "%)"

# Working copy for the statistics that is guaranteed to have both CeD columns
# (0 and 1), zero-filled when a group is absent from the data. The displayed
# and exported tables above are left exactly as computed.
counts = contingency_table.reindex(columns=[0, 1], fill_value=0)
column_totals = counts.sum(axis=0)

# One record per genotype: that genotype versus all other genotypes pooled.
records = []
for category in counts.index:
    in_category = counts.loc[category]
    complement = column_totals - in_category
    # 2x2 table: row 0 = this genotype, row 1 = every other genotype.
    full_table = pd.DataFrame(
        [in_category.to_numpy(), complement.to_numpy()],
        columns=counts.columns,
    )

    # Chi-square test of independence. scipy raises ValueError when an
    # expected frequency is zero (an empty row or column), in which case the
    # p-value is undefined and reported as nan instead of aborting the cell.
    try:
        chi2, p, dof, expected = chi2_contingency(full_table)
    except ValueError:
        p = float("nan")

    # Odds ratio (a / c) / (b / d). It is only reported when all four cells
    # are non-zero: a zero in b, c or d would divide by zero, and a zero in a
    # or d gives a degenerate ratio of 0.
    a, b = full_table.iloc[0, 0], full_table.iloc[0, 1]
    c, d = full_table.iloc[1, 0], full_table.iloc[1, 1]
    if min(a, b, c, d) > 0:
        OR = f"{(a / c) / (b / d):.2f}"
    else:
        OR = "Not calculable"

    records.append({"Genotype": category, "Odds Ratio": OR, "P-value": f"{p:.2e}"})

# Build the results frame once rather than concatenating inside the loop.
results = pd.DataFrame(records, columns=["Genotype", "Odds Ratio", "P-value"])

# Print formatted tables
print("Formatted Contingency Table:")
print(formatted_table)
print("\nAnalysis Results:")
print(results)

# Export to CSV
formatted_table.to_csv("Formatted_Contingency_Table.csv")
results.to_csv("Analysis_Results.csv")


## DQ genotypes by inclusion criteria

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

# Use font type 42 (TrueType) for PDF output.
mpl.rcParams['pdf.fonttype'] = 42

# Count rows per (diagnosis domain, genotype) pair and pivot the genotypes
# into columns; combinations that never occur are filled with 0.
geno_counts = (
    HLA.groupby(['diagnosis domain', 'DQ_geno_4'])
    .size()
    .unstack(fill_value=0)
)

# Convert each diagnosis-domain row to percentages of that row's total.
domain_totals = geno_counts.sum(axis=1)
geno_percentages = geno_counts.div(domain_totals, axis=0) * 100

# Legend entries are applied positionally to the plotted columns.
# NOTE(review): this assumes the columns of `geno_percentages` appear in
# exactly this order (e.g. DQ_geno_4 stored as ordered codes) -- confirm,
# otherwise the legend will mislabel the colours.
legend_labels = [
    '2.5/2.5', '2.5/2.2', '2.5/7.5', '2.5/8.1', '2.5/X',
    '2.2/7.5', '2.2/2.2', '2.2/8.1', '2.2/X',
    '7.5/7.5', '7.5/8.1', '7.5/X',
    '8.1/8.1', '8.1/X', 'X/X',
]

# One stacked bar per diagnosis domain.
fig, ax = plt.subplots(figsize=(10, 6))
geno_percentages.plot(kind='bar', stacked=True, ax=ax, colormap='tab20')
ax.set(
    ylabel='Percentage',
    xlabel='Diagnosis Domain',
    title='Distribution of DQ_geno Across Diagnosis Domains',
)
# Anchor the legend outside the axes, to the right of the plot area.
ax.legend(legend_labels, bbox_to_anchor=(1.05, 1), loc='upper left')

# Tighten the layout so the external legend fits, then write the PDF.
fig.tight_layout()
fig.savefig('S.Figure 8.pdf', format='pdf')

## DQ genotype by ancestry

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import rcParams

# Global font settings for the figure produced by this cell.
rcParams.update({'font.family': 'Arial', 'font.size': 16})

# Load the dataset.
# NOTE(review): this cell reads '20240612_HLA.csv', whereas the earlier cells
# read '20240612_HLA2.csv' -- confirm this is the intended input.
file_path2 = '20240612_HLA.csv'
HLA = pd.read_csv(file_path2)

# Genotypes flagged as significant (defined here; not used further in this cell).
significant_genotypes = ['2.2/7.5', '2.5/2.2', '2.5/2.5', '2.5/7.5', '2.5/8.1', '2.5/X']

# Per-genotype odds ratios, used only to order the stacked segments.
# NOTE(review): presumably copied from the chi-square analysis output --
# confirm they are still in sync with it.
odds_ratios = {
    '2.2/2.2': 0.50, '2.2/7.5': 2.14, '2.2/8.1': 1.21, '2.2/X': 0.66,
    '2.5/2.2': 3.62, '2.5/2.5': 4.34, '2.5/7.5': 1.47, '2.5/8.1': 2.06,
    '2.5/X': 2.16, '7.5/7.5': 0.20, '7.5/8.1': 0.54, '7.5/X': 0.61,
    '8.1/8.1': 1.48, '8.1/X': 0.88, 'X/X': 0.46
}

# Human-readable status label: CeD 1 -> 'Control', CeD 2 -> 'Case'.
HLA['CeD_Status'] = HLA['CeD'].map({1: 'Control', 2: 'Case'})

# Pool the 'sas', 'mid' and 'eas' ancestry labels into a single 'Other' group.
HLA['ancestry_pred'] = HLA['ancestry_pred'].replace(['sas', 'mid', 'eas'], 'Other')

# One indicator column per genotype, appended to the main table.
dummies = pd.get_dummies(HLA['DQ_geno_4'])
HLA = pd.concat([HLA, dummies], axis=1)

# Genotype counts per (ancestry, status) group, then row-wise percentages.
geno_counts = HLA.groupby(['ancestry_pred', 'CeD_Status'])[dummies.columns].sum()
group_totals = geno_counts.sum(axis=1)
geno_percentages = geno_counts.div(group_totals, axis=0) * 100

# One tab20 colour per genotype, sampled evenly across the colormap;
# the X/X genotype is overridden with a fixed yellow.
all_colors = plt.get_cmap('tab20')(np.linspace(0, 1, len(dummies.columns)))
color_map = dict(zip(dummies.columns, all_colors))
color_map['X/X'] = 'yellow'

# Keep only the 'Case' rows and order the genotype columns by descending
# odds ratio (genotypes missing from `odds_ratios` sort as 0).
case_data = geno_percentages.xs('Case', level='CeD_Status')
sorted_columns = sorted(
    case_data.columns,
    key=lambda genotype: odds_ratios.get(genotype, 0),
    reverse=True,
)
case_data = case_data[sorted_columns]

# Horizontal stacked bars: one bar per ancestry, one segment per genotype.
segment_colors = [color_map[genotype] for genotype in case_data.columns]
fig, ax = plt.subplots(figsize=(14, 8))
case_data.plot(kind='barh', stacked=True, ax=ax, color=segment_colors)
ax.set(
    title='HLA-DQ Distribution Across Genetic Ancestries',
    xlabel='Percentage',
    ylabel='Ancestry and Genotype',
)
# Anchor the legend outside the axes, to the right of the plot area.
ax.legend(title='HLA-DQ', bbox_to_anchor=(1.05, 1), loc='upper left')

fig.tight_layout()
fig.savefig('geno_distribution_cases.pdf')
plt.show()
