# Get HLA imputation table
####  N is the number of participants in our study. H can be one of three things: 4-digit MHC haplotypes, SNPs in the MHC region, and AAs in the classical MHC genes (e.g. DQB1). T is the number of V/J genes whose expression we are interested in Thus X holds genotypes for every individual, and Y holds the expression values of V/J genes for every individual.

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import statsmodels.api as sm

## USING HIBAG IMPUTATION RESULTS

In [26]:
dqa1_path = "DQA1_JUN15_2020_DGN.csv"
dqb1_path = "DQB1_JUN15_2020_DGN.csv"

In [27]:
dqa1_df = pd.read_csv(dqa1_path)
dqb1_df = pd.read_csv(dqb1_path)

In [28]:
dqa1_df.columns = ["patid"] + ["DQA1*"+col for col in dqa1_df.columns[1:]]
dqb1_df.columns = ["patid"] + ["DQB1*"+col for col in dqb1_df.columns[1:]]

In [32]:
df = pd.merge(dqa1_df, dqb1_df, on='patid', how="inner")

In [35]:
df = df[df['patid'].apply(lambda name: name.split("_")[-1].startswith("LD"))]

In [38]:
df['patid'] = df['patid'].apply(lambda name: name.split("_")[-1]).values

In [40]:
# only want patids in Sharon's paper, N=895, due to quality control
sharon_counts_df = pd.read_csv("gene_counts.tsv", delimiter="\t")
sharon_counts_df = sharon_counts_df.reset_index()
sharon_ids = set(sharon_counts_df.columns[1:])
df = df[df['patid'].isin(sharon_ids)] 

In [42]:
df.to_csv("DGN_HLA_df.csv", index=None)

In [44]:
df

Unnamed: 0,patid,DQA1*01:01,DQA1*01:02,DQA1*01:03,DQA1*01:04,DQA1*01:05,DQA1*02:01,DQA1*03:01,DQA1*03:02,DQA1*03:03,...,DQB1*04:02,DQB1*05:01,DQB1*05:02,DQB1*05:03,DQB1*05:04,DQB1*06:01,DQB1*06:02,DQB1*06:03,DQB1*06:04,DQB1*06:09
0,LD0014,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,LD0041,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,LD0038,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,LD0084,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,LD0022,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
932,LD1282,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
933,LD1271,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
934,LD1252,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
935,LD0165,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


## SNP2HLA IMPUTATION RESULTS, DEPRECATED

### In the bim file output of the SNP2HLA program, there are the following types of names of entities:

rs1611715 - intergenic SNP

1kg_6_xxxxxx - intergenic SNP from 1000 Genomes

SNP_A_30018316 - biallelic SNP within MHC gene

SNP_A_30018461_G - multiallelic SNP within MHC gene

AA_A_-22_30018317_V 0 multiallelis AA within MHC gene (can have multiple AAs conjugated, or an 'x' for deletion)

HLA_A_29 / HLA_A_2901 - binary present/absent of MHC haplotype

in the minor/major allele columns, for binary items we have P/A, and for multiallelic we have either the DNA letters or the AA letters

In [5]:
bim_path = "/labs/mignot/DGN/Genotypes_DGN/DGNGeno.bim"

In [6]:
fam_path = "/labs/mignot/DGN/Genotypes_DGN/DGNGeno.fam"

In [7]:
dosage_path = "/labs/mignot/DGN/Genotypes_DGN/DGNGeno.dosage" # each person number between 0 and 2

In [8]:
fam_df = pd.read_csv(fam_path, header=None, delim_whitespace=True) # the column 5 probably is depression/not

In [9]:
filtered_fam_df = fam_df[fam_df[0].apply(lambda name: name.split("_")[-1].startswith("LD"))] # the first 937 patients start with "LD"

In [11]:
df = pd.read_csv(dosage_path, header=None, delimiter="\t")

In [12]:
subject_names = list(fam_df[0].apply(lambda name: name.split("_")[-1]).values)

In [13]:
df.columns = ["genotype", "minor", "major"] + subject_names

In [15]:
df = df.iloc[:,:3+937] # filter out the non LD people

In [16]:
df.columns = ["genotype", "minor", "major"] + [name for name in list(df.columns[3:])] # convert LDxxxx to xxxx

In [17]:
df.head()

Unnamed: 0,genotype,minor,major,LD0014,LD0041,LD0038,LD0084,LD0022,LD0033,LD0008,...,LD1144,LD1148,LD1115,LD1357,LD1291,LD1282,LD1271,LD1252,LD0165,LD0102
0,rs969931,C,A,1.993,1.993,1.994,1.993,1.658,1.994,1.993,...,1.993,1.993,1.962,1.993,1.961,1.993,1.996,1.961,1.993,1.993
1,rs2745406,C,T,1.993,1.994,1.994,1.994,1.658,1.994,1.994,...,1.993,1.994,1.962,1.994,1.96,1.994,1.997,1.96,1.994,1.994
2,rs6939431,A,G,0.015,0.081,0.287,0.035,0.021,0.287,0.081,...,0.015,0.081,0.009,0.081,0.073,0.081,1.008,0.073,0.081,0.081
3,rs1233427,A,G,1.993,1.993,1.994,1.993,1.643,1.994,1.993,...,1.993,1.993,1.952,1.993,1.951,1.993,1.996,1.951,1.993,1.993
4,rs1233426,A,G,1.993,1.993,1.994,1.993,1.643,1.994,1.993,...,1.993,1.993,1.95,1.993,1.951,1.993,1.996,1.951,1.993,1.993


In [18]:
# pick out the columns that are actually present in our Y matrix
wanted_columns = ["genotype", "minor", "major"] + list(vdj_table['patid'])

df = df.loc[:, wanted_columns]
df.head()

Unnamed: 0,genotype,minor,major,LD0001,LD0002,LD0003,LD0006,LD0007,LD0008,LD0009,...,LD1349,LD1350,LD1353,LD1354,LD1356,LD1357,LD1361,LD1362,LD1364,LD1366
0,rs969931,C,A,1.993,1.994,1.996,1.993,1.994,1.993,1.993,...,1.993,1.993,1.96,1.996,1.962,1.993,1.993,1.993,1.993,1.993
1,rs2745406,C,T,1.994,1.994,1.997,1.993,1.994,1.994,1.994,...,1.994,1.993,1.961,1.997,1.962,1.994,1.993,1.993,1.993,1.994
2,rs6939431,A,G,0.081,0.219,1.008,0.015,0.287,0.081,0.081,...,0.081,0.015,0.073,1.008,0.009,0.081,0.015,0.015,0.015,0.081
3,rs1233427,A,G,1.993,1.993,1.996,1.993,1.994,1.993,1.993,...,1.993,1.993,1.95,1.996,1.95,1.993,1.993,1.993,1.993,1.993
4,rs1233426,A,G,1.993,1.993,1.996,1.993,1.994,1.993,1.993,...,1.993,1.993,1.95,1.996,1.95,1.993,1.993,1.993,1.993,1.993


In [20]:
def is_HLA(name):
    name_tokens = name.split("_")
    if "HLA"in name_tokens:
        return True
    #if name_tokens[0] == "HLA" and len(name_tokens[-1]) == 4: # only want 4-digit HLA codes
    #    return True
    return False

In [21]:
HLA_df = df[df['genotype'].apply(lambda name: is_HLA(name))]

In [22]:
HLA_df.head()

Unnamed: 0,genotype,minor,major,LD0001,LD0002,LD0003,LD0006,LD0007,LD0008,LD0009,...,LD1349,LD1350,LD1353,LD1354,LD1356,LD1357,LD1361,LD1362,LD1364,LD1366
841,HLA_A_01,P,A,0.021,0.02,0.049,0.008,0.022,0.022,0.046,...,0.014,0.012,0.329,0.002,0.174,0.183,0.172,0.336,0.022,0.02
842,HLA_A_0101,P,A,0.001,0.0,0.002,0.0,0.001,0.001,0.001,...,0.0,0.0,0.008,0.0,0.004,0.004,0.004,0.008,0.0,0.0
843,HLA_A_0102,P,A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
844,HLA_A_0103,P,A,0.02,0.019,0.048,0.008,0.022,0.022,0.045,...,0.014,0.012,0.323,0.002,0.17,0.18,0.169,0.33,0.021,0.019
845,HLA_A_02,P,A,1.916,1.922,1.809,0.98,1.909,1.909,1.827,...,0.957,0.952,0.489,0.99,1.213,1.18,1.219,0.651,0.85,1.922


In [103]:
# drop rows where all HLA haplotype call summed up to 0 for everyone. Nobody had these genotypes.

HLA_df = HLA_df.drop(HLA_df[HLA_df.iloc[:, 3:].sum(axis=1) == 0].index)

In [104]:
dqb301_arr = np.round(HLA_df[HLA_df['genotype'] == "HLA_DQB1_0301"].iloc[:,3:].values[0], 0)

In [107]:
dqb602_arr = np.round(HLA_df[HLA_df['genotype'] == "HLA_DQB1_0602"].iloc[:,3:].values[0], 0)

In [108]:
summed_arr = dqb301_arr + dqb602_arr

In [109]:
np.sum(summed_arr == 2)/len(summed_arr) # proportion of people with both 0301 and 0602

0.22681564245810057

In [110]:
HLA_names = HLA_df['genotype'].values

In [111]:
HLA_df = HLA_df.drop(['minor', 'major'], axis=1).set_index("genotype").transpose()

In [115]:
HLA_df = HLA_df.reset_index().rename(columns={'index':'patid'})

In [116]:
assert list(HLA_df['patid']) == list(vdj_table['patid'])

In [117]:
HLA_df.to_csv("DGN_HLA_df.csv", index=None)