In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.multitest import fdrcorrection

In [2]:
# Load the per-patient HLA table (indexed by patid) and binarise every allele
# column: 1.0 where the stored value is > 0, otherwise 0.0.
HLA_table = (
    pd.read_csv("DGN_HLA_df.csv")
    .set_index("patid")
    .gt(0)
    .astype("float")
)

In [3]:
# Per-patient counts table, one row per patient id (patid).
family_counts_table = pd.read_csv("DGN_family_counts_table.csv").set_index("patid")

In [4]:
# Keep only the columns whose name starts with "TRAV".
family_counts_table = family_counts_table.filter(regex=r"^TRAV")

In [5]:
# Names of the retained TRAV columns, as a plain Python list.
tcr_columns = family_counts_table.columns.tolist()

In [6]:
# Number of per-position amino-acid count tables to load: positions 0-11 are
# CDR1 and positions 12-21 are CDR2.
CDR_NUM_AA = 22 # 22 positions, first 12 are CDR1, then 10 are CDR2
# NOTE(review): presumably the number of patients in the DGN cohort; it is not
# referenced by any of the visible cells — confirm it is still needed.
DGN_NUM_PATIENTS = 895

In [7]:
# V-gene prefix. It is substituted into the per-position count file names and
# selects the active-position lists used later ("TRAV" picks the TRAV lists,
# any other value falls through to the TRBV lists).
TRV = "TRAV" # TRAV

In [8]:
# One amino-acid count table per CDR1/CDR2 position; the list index is the
# position index and every table is indexed by patid.
positions_AA_count_dfs = [
    pd.read_csv(
        "DGN_{}_CDR1_CDR2_position_{}_AA_counts_table.csv".format(TRV, position)
    ).set_index("patid")
    for position in range(CDR_NUM_AA)
]

## Using family counts and TRAV sequences, for every patient, get the count of each amino acid (AA) at each position of CDR1 and CDR2

In [9]:
def counts_to_usage_table(counts_table, drop_columns=None):
    """Convert a per-patient counts table into per-patient usage fractions.

    Parameters
    ----------
    counts_table : pandas.DataFrame
        One row per patient and one column per category; values are counts.
        The input frame is not modified.
    drop_columns : list-like of column labels, optional
        Columns removed *before* normalising, so the remaining columns are
        re-normalised among themselves. ``None`` (the default) drops nothing.

    Returns
    -------
    pandas.DataFrame
        Same index as ``counts_table`` and the remaining columns, with every
        value divided by its row total. A row whose remaining counts sum to
        zero comes out as NaN (0 / 0).
    """
    # `None` replaces the former mutable `{}` default, which was a dict standing
    # in for "no labels". Explicitly passed values are forwarded untouched.
    labels_to_drop = [] if drop_columns is None else drop_columns
    kept_counts = counts_table.drop(axis=1, labels=labels_to_drop)
    row_totals = kept_counts.sum(axis=1)
    return kept_counts.div(row_totals, axis=0)

In [10]:
# Per-patient usage fraction of each TRAV column (each count divided by the
# patient's total over the TRAV columns).
usage_table = counts_to_usage_table(family_counts_table)

In [11]:
usage_table

Unnamed: 0_level_0,TRAV1-1,TRAV1-2,TRAV10,TRAV12-1,TRAV12-2,TRAV12-3,TRAV13-1,TRAV13-2,TRAV14/DV4,TRAV16,...,TRAV5,TRAV6,TRAV7,TRAV8-1,TRAV8-2,TRAV8-3,TRAV8-4,TRAV8-6,TRAV9-1,TRAV9-2
patid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LD0001,0.010435,0.021739,0.016348,0.038261,0.064522,0.035130,0.057913,0.022957,0.032522,0.015652,...,0.013739,0.013565,0.000000,0.018609,0.010435,0.029391,0.041913,0.027304,0.000000,0.071130
LD0002,0.007591,0.024957,0.012996,0.047039,0.043243,0.027947,0.057504,0.023117,0.021507,0.013686,...,0.008741,0.008741,0.000000,0.011041,0.010351,0.024497,0.035998,0.027602,0.000115,0.060610
LD0003,0.016874,0.029601,0.011726,0.034034,0.081653,0.038467,0.062491,0.019305,0.037609,0.014729,...,0.014729,0.008723,0.000000,0.010439,0.014443,0.021879,0.040040,0.026455,0.000000,0.047190
LD0006,0.013472,0.032537,0.013981,0.038638,0.039654,0.031266,0.055796,0.026563,0.021479,0.018683,...,0.012201,0.016904,0.000000,0.015252,0.012583,0.024657,0.037367,0.027199,0.000127,0.052364
LD0007,0.008087,0.021781,0.012292,0.043994,0.037956,0.029761,0.074725,0.030192,0.039250,0.014880,...,0.012939,0.013910,0.000000,0.012292,0.011861,0.026310,0.038818,0.019625,0.000000,0.074294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LD1357,0.017994,0.052568,0.016175,0.038617,0.058027,0.033158,0.056814,0.024464,0.019814,0.013951,...,0.014153,0.007885,0.000000,0.014355,0.011524,0.020421,0.039224,0.033765,0.000000,0.059644
LD1361,0.007784,0.015450,0.020050,0.038802,0.034320,0.030310,0.076660,0.040807,0.030310,0.019342,...,0.014506,0.013445,0.000000,0.009317,0.012737,0.025593,0.036207,0.031608,0.000000,0.068522
LD1362,0.010617,0.052726,0.009358,0.047688,0.046068,0.036890,0.064963,0.024474,0.027893,0.018355,...,0.019975,0.014216,0.000000,0.011337,0.010257,0.025553,0.043549,0.028613,0.000000,0.042829
LD1364,0.011400,0.024066,0.019474,0.031032,0.054782,0.031666,0.052407,0.021849,0.025966,0.015358,...,0.015833,0.015358,0.000158,0.011400,0.007600,0.023908,0.035624,0.025016,0.000000,0.044332


In [12]:
# Sanity check: both tables cover exactly the same patients. The regressions
# below combine them with an inner join on patid, so a mismatch would silently
# drop rows.
set(usage_table.index) == set(HLA_table.index)

True

In [13]:
HLA_table

Unnamed: 0_level_0,DQA1*01:01,DQA1*01:02,DQA1*01:03,DQA1*01:04,DQA1*01:05,DQA1*02:01,DQA1*03:01,DQA1*03:02,DQA1*03:03,DQA1*04:01,...,DQB1*04:02,DQB1*05:01,DQB1*05:02,DQB1*05:03,DQB1*05:04,DQB1*06:01,DQB1*06:02,DQB1*06:03,DQB1*06:04,DQB1*06:09
patid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LD0014,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LD0041,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
LD0038,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
LD0084,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
LD0022,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LD1282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LD1271,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
LD1252,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
LD0165,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
def do_ols(HLA_table, usage_table, ind_var, covariates):
    """Regress every usage column on one HLA indicator, one OLS fit per column.

    NOTE: the next cell defines another ``do_ols`` with a different signature
    ``(pat_info_df, usage_table, ind_var, y_cols, cov_cols)``. Once that cell
    has run, it replaces this function.

    Parameters
    ----------
    HLA_table : pandas.DataFrame
        Indexed by patid; must contain the column ``ind_var``.
    usage_table : pandas.DataFrame
        Indexed by patid; every column is used as an outcome.
    ind_var : str
        Name of the HLA column used as the regressor of interest.
    covariates : bool
        If truthy, "DGN_covariates_df.csv" is read and one-hot flow-cell
        (``fcid``) indicators are added to the design matrix.

    Returns
    -------
    pandas.DataFrame
        Columns ``family, coef, se, pvalue, tvalue, pvalue_fdr`` (statistics of
        ``ind_var`` only), sorted by ascending ``pvalue``.
    """
    result_columns = ["family", "coef", "se", "pvalue", "tvalue"]

    if covariates:
        cov_df = pd.read_csv("DGN_covariates_df.csv").set_index("patid")
        # dtype=float: pandas >= 2 creates bool dummies by default, which makes
        # the mixed bool/float design matrix object-typed and statsmodels
        # refuses it. Only fcid is encoded; lane (nested in fcid) was never
        # part of the model.
        # NOTE(review): a full dummy set plus the intercept is collinear; the
        # ind_var estimate should still be identified — confirm this is intended.
        cov_df = pd.get_dummies(cov_df, columns=['fcid'], dtype=float)
        pat_info_df = pd.merge(cov_df, HLA_table, left_index=True, right_index=True, how="inner")
        fcid_columns = [x for x in pat_info_df.columns if x.startswith("fcid")]
        x_columns = [ind_var] + fcid_columns
    else:
        pat_info_df = HLA_table
        x_columns = [ind_var]

    # Merge on the index (patid); patients missing from either side are dropped.
    all_df = pd.merge(pat_info_df, usage_table, left_index=True, right_index=True, how="inner")
    all_df[ind_var] = all_df[ind_var].astype("float")

    print("doing {} tests. out of {} rows, {} are {} > 0".format(
        len(usage_table.columns), len(all_df), int((all_df[ind_var] > 0).sum()), ind_var))

    # The design matrix is identical for every outcome, so build it once.
    # add_constant prepends an intercept unless X already holds a constant column.
    X = sm.add_constant(all_df[x_columns])

    records = []
    for family in usage_table.columns:
        res = sm.OLS(all_df[family], X).fit()
        records.append({
            "family": family,
            "coef": res.params[ind_var],
            "se": res.bse[ind_var],
            "pvalue": res.pvalues[ind_var],
            "tvalue": res.tvalues[ind_var],
        })

    res_df = pd.DataFrame(records, columns=result_columns).sort_values("pvalue", ascending=True)

    # FDR correction over the testable outcomes only: a single NaN p-value
    # (e.g. from an all-zero outcome column) would otherwise propagate through
    # fdrcorrection and turn every adjusted p-value into NaN.
    res_df['pvalue_fdr'] = np.nan
    testable = res_df['pvalue'].notna()
    if testable.any():
        res_df.loc[testable, 'pvalue_fdr'] = fdrcorrection(res_df.loc[testable, 'pvalue'])[1]
    return res_df

In [15]:
def do_ols(pat_info_df, usage_table, ind_var, y_cols, cov_cols):
    """Fit one OLS model per outcome column and report the ``ind_var`` term.

    Parameters
    ----------
    pat_info_df : pandas.DataFrame
        Indexed by patid; must contain ``ind_var`` and every name in ``cov_cols``.
    usage_table : pandas.DataFrame
        Indexed by patid; must contain every name in ``y_cols``.
    ind_var : str
        Independent variable of interest, e.g. 'DQB1*03:01'.
    y_cols : iterable of str
        Outcome columns; one model is fitted for each.
    cov_cols : list-like of str
        Extra regressors added next to ``ind_var`` (may be empty).

    Returns
    -------
    pandas.DataFrame
        Columns ``family, coef, se, pvalue, tvalue, pvalue_fdr`` holding the
        statistics of ``ind_var`` for each outcome, sorted by ascending
        ``pvalue``. The index holds each outcome's position in ``y_cols``.
        Empty ``y_cols`` yields an empty frame with the same columns.
    """
    result_columns = ["family", "coef", "se", "pvalue", "tvalue"]

    # Inner join on the index (patid): patients missing from either table are dropped.
    all_df = pd.merge(pat_info_df, usage_table, right_index=True, left_index=True, how="inner")

    # The design matrix is the same for every outcome, so build it once.
    # add_constant prepends an intercept unless X already holds a constant column.
    x_columns = [ind_var] + list(cov_cols)
    X = sm.add_constant(all_df[x_columns])

    records = []
    for y_column in y_cols:
        res = sm.OLS(all_df[y_column], X).fit()
        records.append({
            "family": y_column,
            "coef": res.params[ind_var],
            "se": res.bse[ind_var],
            "pvalue": res.pvalues[ind_var],
            "tvalue": res.tvalues[ind_var],
        })

    res_df = pd.DataFrame(records, columns=result_columns).sort_values("pvalue", ascending=True)

    # FDR correction over the testable outcomes only: a single NaN p-value
    # (e.g. from an all-zero outcome column) would otherwise propagate through
    # fdrcorrection and turn every adjusted p-value into NaN.
    res_df['pvalue_fdr'] = np.nan
    testable = res_df['pvalue'].notna()
    if testable.any():
        res_df.loc[testable, 'pvalue_fdr'] = fdrcorrection(res_df.loc[testable, 'pvalue'])[1]
    return res_df

In [16]:
# def do_stepwise_ols(HLA_df, family_counts_df, cov_df, ind_var, cov_cols, want_families={}, fdr=True):
    
#     def is_continue(df):
#         if len(want_families) > 0: # need results for all families in want_families
#             print(set(df['family']).intersection(want_families), want_families)
#             if set(df['family']).intersection(want_families) == want_families:
#                 return False
#         return True
    
#     pat_info_df = pd.merge(cov_df, HLA_df, right_index=True, left_index=True, how="inner")
#     df = pd.DataFrame(columns=["family", "coef", "se", "pvalue", "tvalue"])
#     drop_families = []

#     while is_continue(df):
#         _, usage_table = family_counts_df_to_usage_df(family_counts_df, drop_families=drop_families)
#         tcr_columns = [x for x in usage_table.columns if x.startswith("TR")]
#         res_df = do_ols(pat_info_df, usage_table, ind_var, tcr_columns, cov_cols)
#         top_row = res_df.iloc[0]
#         if fdr:
#             pvalue = top_row['pvalue_fdr']
#         else:
#             pvalue = top_row['pvalue']
#         print(top_row['family'], pvalue)
#         if len(want_families) == 0:
#             if pvalue > 0.05: # get all rows until the top row is no longer significant
#                 break
#         drop_families.append(top_row['family'])
#         df = df.append(top_row)
        
#     df['coef'] = df['coef']*100 # want the coef to be in percentages
#     return df

In [17]:
def do_stepwise_ols(HLA_table, counts_table, ind_var, want_families=None, fdr=True, alpha=0.1):
    """Stepwise association scan: repeatedly pick the most associated column.

    Each round converts the remaining count columns to usage fractions, runs
    ``do_ols`` of every column on ``ind_var``, records the top (smallest
    p-value) column, and removes it so the rest is re-normalised next round.

    Parameters
    ----------
    HLA_table : pandas.DataFrame
        Indexed by patid; must contain ``ind_var``.
    counts_table : pandas.DataFrame
        Indexed by patid; raw counts, one column per category.
    ind_var : str
        HLA column used as the regressor.
    want_families : collection of str, optional
        If non-empty, keep stepping (ignoring ``alpha``) until every listed
        column has been selected or no columns remain.
    fdr : bool
        Use the FDR-adjusted p-value (``pvalue_fdr``) for the stop rule and
        the progress print; otherwise use the raw ``pvalue``.
    alpha : float
        Stop once the top p-value is no longer <= ``alpha``. Only applied
        when ``want_families`` is empty.

    Returns
    -------
    pandas.DataFrame
        One row per selected column, in selection order, with ``coef``
        multiplied by 100 (percentage points). Row labels are the labels the
        rows had in each round's ``do_ols`` result. With no selection an empty
        frame with columns ``family, coef, se, pvalue, tvalue`` is returned.
    """
    # Normalise to a set so lists and tuples work too; the old set-vs-argument
    # equality could never succeed for a non-set argument.
    wanted = set(want_families) if want_families else set()
    pvalue_column = 'pvalue_fdr' if fdr else 'pvalue'

    drop_columns = []   # columns already selected, in order
    selected_rows = []  # one dict per selected column
    row_labels = []

    while True:
        if wanted:  # need results for all families in want_families
            found = wanted.intersection(drop_columns)
            print(found, wanted)
            if found == wanted:
                break

        usage_table = counts_to_usage_table(counts_table, drop_columns=drop_columns)
        if len(usage_table.columns) == 0:  # dropped all the columns already, nothing left
            break

        res_df = do_ols(HLA_table, usage_table, ind_var, usage_table.columns, [])
        top_row = res_df.iloc[0]
        pvalue = top_row[pvalue_column]

        # `not (p <= alpha)` rather than `p > alpha`, so a NaN p-value also
        # stops the scan and is not recorded as a hit.
        if not wanted and not (pvalue <= alpha):
            break

        print("{}, coef: {}, pvalue: {}".format(top_row['family'], top_row['coef'], pvalue))
        drop_columns.append(top_row['family'])
        selected_rows.append(top_row.to_dict())
        row_labels.append(top_row.name)

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    if selected_rows:
        df = pd.DataFrame(selected_rows, index=row_labels)
    else:
        df = pd.DataFrame(columns=["family", "coef", "se", "pvalue", "tvalue"])
    df['coef'] = df['coef'] * 100  # want the coef to be in percentages
    return df

In [18]:
# #usage_df['usage_ratio'] = usage_df['usage_ratio']*100 # only do once!
# cov_df = pd.read_csv("DGN_covariates_df.csv").set_index("patid")
# cov_df["lane"] = cov_df["fcid"] + "_" + cov_df["lane"].astype("str") # covariates
# cov_df = pd.get_dummies(cov_df, columns=['fcid', 'lane'])
# pat_info_df = pd.merge(cov_df, HLA_table, right_index=True, left_index=True, how="inner")

In [19]:
# step_res_df_301_all = do_stepwise_ols(HLA_table, family_counts_table, ind_var="DQB1*03:01", fdr=True)

# step_res_df_301_all

In [20]:
# tcr_columns = list(usage_table.columns)
# do_ols(HLA_table, usage_table, ind_var="DQB1*03:01", y_cols=tcr_columns, cov_cols=[])

In [21]:
HLA_table[(HLA_table['DQA1*03:03'] == 1) | (HLA_table['DQA1*05:05'] == 1)]

Unnamed: 0_level_0,DQA1*01:01,DQA1*01:02,DQA1*01:03,DQA1*01:04,DQA1*01:05,DQA1*02:01,DQA1*03:01,DQA1*03:02,DQA1*03:03,DQA1*04:01,...,DQB1*04:02,DQB1*05:01,DQB1*05:02,DQB1*05:03,DQB1*05:04,DQB1*06:01,DQB1*06:02,DQB1*06:03,DQB1*06:04,DQB1*06:09
patid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LD0033,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
LD0008,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LD0023,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
LD0011,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
LD0058,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LD1148,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
LD1357,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
LD1291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LD1282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Restrict the cohort to patients carrying DQA1*03:03 or DQA1*05:05, then run
# the stepwise scan of DQA1*05:05 against the amino-acid usage at each active
# CDR1 / CDR2 position. Results are reported through do_stepwise_ols' prints.
ind_var = "DQA1*05:05"  # alternative: "DQB1*03:01"
print("everyone is either DQA1*03:03 or DQA1*05:05")
print("indicator variable is {}".format(ind_var))
print("\n")
HLA_table_filt = HLA_table[HLA_table['DQA1*03:03'].eq(1) | HLA_table['DQA1*05:05'].eq(1)]

## CDR1
# Indices into positions_AA_count_dfs (0-11 are the CDR1 tables).
TRAV_active_positions = [0, 1, 2, 3, 9, 10, 11]
TRBV_active_positions = [0, 1, 2, 9, 10, 11]
TRV_active_positions = TRAV_active_positions if TRV == "TRAV" else TRBV_active_positions

print(TRV, "CDR1")
for i in TRV_active_positions:
    # Reported label = index + 27 (presumably IMGT numbering of CDR1 — confirm).
    print("--- position {} ---".format(27 + i))
    do_stepwise_ols(HLA_table_filt, positions_AA_count_dfs[i], ind_var=ind_var, fdr=True)

print("\n")

## CDR2
# Indices relative to the start of CDR2; the CDR2 tables follow the 12 CDR1 tables.
TRAV_active_positions = [0, 1, 2, 3, 6, 7, 8, 9]
TRBV_active_positions = [0, 1, 2, 3, 7, 8, 9]
TRV_active_positions = TRAV_active_positions if TRV == "TRAV" else TRBV_active_positions

print(TRV, "CDR2")
for i in TRV_active_positions:
    # Reported label = index + 56 (presumably IMGT numbering of CDR2 — confirm).
    print("--- position {} ---".format(56 + i))
    cdr2_counts = positions_AA_count_dfs[12 + i]  # skip the 12 CDR1 tables in front
    do_stepwise_ols(HLA_table_filt, cdr2_counts, ind_var=ind_var, fdr=True)

everyone is either DQA1*03:03 or DQA1*05:05
indicator variable is DQA1*05:05


TRAV CDR1
--- position 27 ---
--- position 28 ---
--- position 29 ---
--- position 30 ---
N, coef: 0.0014376021889963074, pvalue: 0.06233991382227777
--- position 36 ---
--- position 37 ---
--- position 38 ---


TRAV CDR2
--- position 56 ---
--- position 57 ---
S, coef: -0.004376904345392034, pvalue: 0.024144195939812523
P, coef: -0.0012395274792675866, pvalue: 0.09380029598287838
--- position 58 ---
--- position 59 ---
N, coef: 0.006041488767079005, pvalue: 0.05656476386802411
--- position 62 ---
--- position 63 ---
--- position 64 ---
R, coef: 0.006679619332897114, pvalue: 0.0010257058729448954
--- position 65 ---


In [52]:
# Stepwise scan of DQB1*03:01 against TRAV family usage over the full cohort
# (stop rule uses FDR-adjusted p-values, the function's default).
step_family_res_df = do_stepwise_ols(
    HLA_table=HLA_table,
    counts_table=family_counts_table,
    ind_var="DQB1*03:01",
)
step_family_res_df

Unnamed: 0,family,coef,se,pvalue,tvalue


In [None]:
family_counts_table.columns

In [None]:
# Per-patient usage fractions of the TRAV columns (same computation as
# `usage_table` earlier in the notebook).
family_usage_table = counts_to_usage_table(family_counts_table)

In [None]:
# The live `do_ols` (the second definition earlier in the notebook) takes
# (pat_info_df, usage_table, ind_var, y_cols, cov_cols); the old
# `covariates=` / `binary=` keywords raise TypeError against it.
# - binary: HLA_table is already coded 0/1 when it is loaded, so nothing is needed.
# - covariates: one-hot flow-cell (fcid) indicators, as the earlier
#   covariate-aware do_ols built them, joined onto the HLA table by patid.
cov_df = pd.read_csv("DGN_covariates_df.csv").set_index("patid")
fcid_dummies = pd.get_dummies(cov_df["fcid"], prefix="fcid", dtype=float)
pat_info_df = HLA_table.join(fcid_dummies, how="inner")
family_res_df = do_ols(
    pat_info_df,
    family_usage_table,
    "DQB1*03:01",
    y_cols=list(family_usage_table.columns),
    cov_cols=list(fcid_dummies.columns),
)

In [None]:
# Keep patients carrying exactly one of DQB1*03:01 / DQB1*03:02, so the
# indicator contrasts 03:01 carriers against 03:02 carriers.
cond_A = HLA_table['DQB1*03:01'] > 0
cond_B = HLA_table['DQB1*03:02'] > 0
filt_HLA_table = HLA_table[cond_A ^ cond_B]
# do_stepwise_ols has no `covariates` / `binary` parameters (passing them raised
# TypeError). It always runs without covariates, and HLA_table is already
# coded 0/1, so dropping the keywords keeps the intended analysis.
step_res_df_301_302 = do_stepwise_ols(filt_HLA_table, family_counts_table, "DQB1*03:01", fdr=True)

In [None]:
step_res_df_301_302

In [None]:
# Keep patients carrying exactly one of DQB1*03:01 / DQB1*06:01, so the
# indicator contrasts 03:01 carriers against 06:01 carriers.
cond_A = HLA_table['DQB1*03:01'] > 0
cond_B = HLA_table['DQB1*06:01'] > 0
filt_HLA_table = HLA_table[cond_A ^ cond_B]
# do_stepwise_ols has no `covariates` / `binary` parameters (passing them raised
# TypeError). It always runs without covariates, and HLA_table is already
# coded 0/1, so dropping the keywords keeps the intended analysis.
# NOTE(review): this is the 03:01-vs-06:01 comparison, but it overwrites
# `step_res_df_301_302` from the 03:01-vs-03:02 cell — consider a distinct name.
step_res_df_301_302 = do_stepwise_ols(filt_HLA_table, family_counts_table, "DQB1*03:01", fdr=True)