# Get HLA imputation table
## For use in regression, we need to get observations X (N, H) matrices, and Y (N, T) matrices.
####  N is the number of participants in our study. H can be one of three things: 4-digit MHC haplotypes, SNPs in the MHC region, and AAs in the classical MHC genes (e.g. DQB1). T is the number of V/J genes whose expression we are interested in Thus X holds genotypes for every individual, and Y holds the expression values of V/J genes for every individual.

In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import statsmodels.api as sm

## Step 1: setup Y (N,T) response matrix from expression data

In [80]:
vdj_df = pd.read_csv("DGN_vdj_usages.csv")
vdj_table = pd.read_csv("DGN_vdj_usages_table.csv")

In [57]:
# """ONLY GET TRAJ, TRAV, TRBV"""
# query = "TRA"
# vdj_df = vdj_df[vdj_df['family'].str.startswith(query)]
# TODO: also filter vdj_table

In [58]:
# ## plotting usage across families
# f, ax = plt.subplots(figsize=(5,30))
# sns.boxplot(y="family", x="usage_ratio", data=vdj_df)
# plt.xlim((0,0.5))
# plt.title("Non Protein Coding OR Non Functional Genes")

## Step 2: Setup X (N, H) observation matrix from imputed genotype data

### In the bim file output of the SNP2HLA program, there are the following types of names of entities:

rs1611715 - intergenic SNP

1kg_6_xxxxxx - intergenic SNP from 1000 Genomes

SNP_A_30018316 - biallelic SNP within MHC gene

SNP_A_30018461_G - multiallelic SNP within MHC gene

AA_A_-22_30018317_V 0 multiallelis AA within MHC gene (can have multiple AAs conjugated, or an 'x' for deletion)

HLA_A_29 / HLA_A_2901 - binary present/absent of MHC haplotype

in the minor/major allele columns, for binary items we have P/A, and for multiallelic we have either the DNA letters or the AA letters

In [83]:
bim_path = "/labs/mignot/DGN/Genotypes_DGN/DGNGeno.bim"

In [84]:
fam_path = "/labs/mignot/DGN/Genotypes_DGN/DGNGeno.fam"

In [85]:
dosage_path = "/labs/mignot/DGN/Genotypes_DGN/DGNGeno.dosage" # each person number between 0 and 2

In [86]:
fam_df = pd.read_csv(fam_path, header=None, delim_whitespace=True) # the column 5 probably is depression/not

In [87]:
filtered_fam_df = fam_df[fam_df[0].apply(lambda name: name.split("_")[-1].startswith("LD"))] # the first 937 patients start with "LD"

In [88]:
INCLUDE_INDICES = np.arange(0,937) # indices of people to include in our study

In [89]:
df = pd.read_csv(dosage_path, header=None, delimiter="\t")

In [90]:
subject_names = list(fam_df[0].apply(lambda name: name.split("_")[-1]).values)

In [91]:
df.columns = ["genotype", "minor", "major"] + subject_names

In [92]:
df = df.iloc[:,:3+937] # filter out the non LD people

In [93]:
df.columns = ["genotype", "minor", "major"] + [name for name in list(df.columns[3:])] # convert LDxxxx to xxxx

In [94]:
df.head()

Unnamed: 0,genotype,minor,major,LD0014,LD0041,LD0038,LD0084,LD0022,LD0033,LD0008,...,LD1144,LD1148,LD1115,LD1357,LD1291,LD1282,LD1271,LD1252,LD0165,LD0102
0,rs969931,C,A,1.993,1.993,1.994,1.993,1.658,1.994,1.993,...,1.993,1.993,1.962,1.993,1.961,1.993,1.996,1.961,1.993,1.993
1,rs2745406,C,T,1.993,1.994,1.994,1.994,1.658,1.994,1.994,...,1.993,1.994,1.962,1.994,1.96,1.994,1.997,1.96,1.994,1.994
2,rs6939431,A,G,0.015,0.081,0.287,0.035,0.021,0.287,0.081,...,0.015,0.081,0.009,0.081,0.073,0.081,1.008,0.073,0.081,0.081
3,rs1233427,A,G,1.993,1.993,1.994,1.993,1.643,1.994,1.993,...,1.993,1.993,1.952,1.993,1.951,1.993,1.996,1.951,1.993,1.993
4,rs1233426,A,G,1.993,1.993,1.994,1.993,1.643,1.994,1.993,...,1.993,1.993,1.95,1.993,1.951,1.993,1.996,1.951,1.993,1.993


In [98]:
# pick out the columns that are actually present in our Y matrix
wanted_columns = ["genotype", "minor", "major"] + list(vdj_table['patid'])

df = df.loc[:, wanted_columns]
df.head()

Unnamed: 0,genotype,minor,major,LD0001,LD0002,LD0003,LD0006,LD0007,LD0008,LD0009,...,LD1349,LD1350,LD1353,LD1354,LD1356,LD1357,LD1361,LD1362,LD1364,LD1366
0,rs969931,C,A,1.993,1.994,1.996,1.993,1.994,1.993,1.993,...,1.993,1.993,1.96,1.996,1.962,1.993,1.993,1.993,1.993,1.993
1,rs2745406,C,T,1.994,1.994,1.997,1.993,1.994,1.994,1.994,...,1.994,1.993,1.961,1.997,1.962,1.994,1.993,1.993,1.993,1.994
2,rs6939431,A,G,0.081,0.219,1.008,0.015,0.287,0.081,0.081,...,0.081,0.015,0.073,1.008,0.009,0.081,0.015,0.015,0.015,0.081
3,rs1233427,A,G,1.993,1.993,1.996,1.993,1.994,1.993,1.993,...,1.993,1.993,1.95,1.996,1.95,1.993,1.993,1.993,1.993,1.993
4,rs1233426,A,G,1.993,1.993,1.996,1.993,1.994,1.993,1.993,...,1.993,1.993,1.95,1.996,1.95,1.993,1.993,1.993,1.993,1.993


In [99]:
def is_HLA(name):
    name_tokens = name.split("_")
    if name_tokens[0] == "HLA" and len(name_tokens[-1]) == 4: # only want 4-digit HLA codes
        return True
    return False

In [100]:
HLA_df = df[df['genotype'].apply(lambda name: is_HLA(name))]

In [101]:
HLA_df.head()

Unnamed: 0,genotype,minor,major,LD0001,LD0002,LD0003,LD0006,LD0007,LD0008,LD0009,...,LD1349,LD1350,LD1353,LD1354,LD1356,LD1357,LD1361,LD1362,LD1364,LD1366
842,HLA_A_0101,P,A,0.001,0.0,0.002,0.0,0.001,0.001,0.001,...,0.0,0.0,0.008,0.0,0.004,0.004,0.004,0.008,0.0,0.0
843,HLA_A_0102,P,A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
844,HLA_A_0103,P,A,0.02,0.019,0.048,0.008,0.022,0.022,0.045,...,0.014,0.012,0.323,0.002,0.17,0.18,0.169,0.33,0.021,0.019
846,HLA_A_0201,P,A,1.886,1.892,1.728,0.97,1.874,1.874,1.756,...,0.937,0.934,0.286,0.986,1.093,1.052,1.103,0.441,0.83,1.892
847,HLA_A_0202,P,A,0.009,0.008,0.021,0.003,0.01,0.01,0.019,...,0.006,0.005,0.053,0.001,0.031,0.033,0.03,0.054,0.005,0.008


In [103]:
# drop rows where all HLA haplotype call summed up to 0 for everyone. Nobody had these genotypes.

HLA_df = HLA_df.drop(HLA_df[HLA_df.iloc[:, 3:].sum(axis=1) == 0].index)

In [104]:
dqb301_arr = np.round(HLA_df[HLA_df['genotype'] == "HLA_DQB1_0301"].iloc[:,3:].values[0], 0)

In [107]:
dqb602_arr = np.round(HLA_df[HLA_df['genotype'] == "HLA_DQB1_0602"].iloc[:,3:].values[0], 0)

In [108]:
summed_arr = dqb301_arr + dqb602_arr

In [109]:
np.sum(summed_arr == 2)/len(summed_arr) # proportion of people with both 0301 and 0602

0.22681564245810057

In [110]:
HLA_names = HLA_df['genotype'].values

In [111]:
HLA_df = HLA_df.drop(['minor', 'major'], axis=1).set_index("genotype").transpose()

In [115]:
HLA_df = HLA_df.reset_index().rename(columns={'index':'patid'})

In [116]:
assert list(HLA_df['patid']) == list(vdj_table['patid'])

In [117]:
HLA_df.to_csv("DGN_HLA_df.csv", index=None)