In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import statsmodels.api as sm
from statsmodels.stats.multitest import fdrcorrection

In [2]:
# family counts to usage
def family_counts_df_to_usage_df(family_counts_df, drop_families=None):
    """Turn long-format family read counts into per-patient usage ratios.

    The usage ratio of a family is its count divided by the total count of
    its segment for the same patient, where the segment is characters 2-3
    of the family name (e.g. "AJ" for "TRAJ1", "BV" for "TRBV9").

    Parameters
    ----------
    family_counts_df : pandas.DataFrame
        Must contain the columns ``patid``, ``family`` and ``count``. Several
        rows per (patid, family) are allowed and are summed. Any other
        columns are ignored.
    drop_families : iterable of str, optional
        Families removed *before* totals are computed, so they do not
        contribute to the segment denominators. ``None`` or empty keeps all.

    Returns
    -------
    usage_df : pandas.DataFrame
        Long format with columns ``patid``, ``family``, ``usage_ratio``
        (an independent copy, safe to modify).
    usage_table : pandas.DataFrame
        Wide format indexed by ``patid`` with one column per family (sorted
        by name); combinations absent from the input are filled with 0.
    """
    # drop any families (so that they are not included in total count)
    if drop_families is not None and len(drop_families) > 0:
        family_counts_df = family_counts_df[~family_counts_df['family'].isin(drop_families)]

    # Disabled filter kept for reference: restrict to functional genes
    # (drops pseudogenes and non-functional genes).
    #     valid_TCR_genes = set()
    #     with open("valid_TCR_genes.txt", "r") as f:
    #         for line in f:
    #             valid_TCR_genes.add(line.strip())
    #     valid_TCR_genes.add("TRAV8-5")
    #     df = df[df['family'].isin(valid_TCR_genes)]

    # get total counts for every patient, every family (adding subtype counts together).
    # The summed column is selected explicitly: a bare .sum() would also "sum"
    # (concatenate) string columns on pandas >= 2 and break the steps below.
    family_totals = (
        family_counts_df
        .groupby(["patid", "family"], as_index=False)["count"]
        .sum()
        .rename(columns={"count": "family_count"})
    )

    # take the segment (AJ) out of the family (TRAJ1); the per-patient segment
    # total (AJ, BJ, AV, BV, ...) is broadcast back onto every family row
    family_totals["segment"] = family_totals["family"].str.slice(start=2, stop=4)
    family_totals["segment_count"] = (
        family_totals.groupby(["patid", "segment"])["family_count"].transform("sum")
    )
    family_totals["usage_ratio"] = family_totals["family_count"] / family_totals["segment_count"]

    # .copy() so that callers can assign into usage_df without SettingWithCopyWarning
    usage_df = family_totals[["patid", "family", "usage_ratio"]].copy()

    # make both long and wide formats for the usage_ratio table
    usage_table = usage_df.pivot(index="patid", columns="family", values="usage_ratio")
    usage_table = usage_table.reindex(sorted(usage_table.columns), axis=1).fillna(0)
    # usage_table = usage_table.reset_index()

    # Disabled filter kept for reference: only keep patids in Sharon's paper,
    # N=895, due to quality control.
    #     sharon_counts_df = pd.read_csv("gene_counts.tsv", delimiter="\t")
    #     sharon_counts_df = sharon_counts_df.reset_index()
    #     sharon_ids = set(sharon_counts_df.columns[1:])
    #     usage_table = usage_table[usage_table['patid'].isin(sharon_ids)].set_index("patid")
    return usage_df, usage_table

In [3]:
def do_ols(pat_info_df, usage_table, ind_var, y_cols, cov_cols):
    """Fit one OLS model per outcome column and collect the effect of `ind_var`.

    For every column in `y_cols` the model ``y ~ const + ind_var + cov_cols``
    is fitted on the rows shared (by index) between `pat_info_df` and
    `usage_table`.

    Parameters
    ----------
    pat_info_df : pandas.DataFrame
        Predictor/covariate table; must contain `ind_var` and every column
        named in `cov_cols`.
    usage_table : pandas.DataFrame
        Outcome table; must contain every column named in `y_cols`.
    ind_var : str
        Independent variable for the model, e.g. 'DQB1*03:01'.
    y_cols : list of str
        Outcome columns, one regression each.
    cov_cols : list of str
        Additional covariate columns (may be empty).

    Returns
    -------
    pandas.DataFrame
        One row per outcome with columns ``TCR``, ``coef``, ``se``,
        ``pvalue``, ``tvalue`` (all for `ind_var`) and ``pvalue_fdr``
        (corrected across the outcomes with statsmodels' ``fdrcorrection``),
        sorted by ascending ``pvalue``. Empty if `y_cols` is empty.
    """
    all_df = pd.merge(pat_info_df, usage_table, right_index=True, left_index=True, how="inner")
    ind_var_col_name = ind_var

    print("doing {} tests. out of {} rows, {} are {} > 0".format(len(y_cols), len(all_df), len(all_df[all_df[ind_var]>0]), ind_var))
    if len(y_cols) == 0:
        # nothing to fit; return an empty frame with the usual columns
        return pd.DataFrame(columns=["TCR", "coef", "se", "pvalue", "tvalue", "pvalue_fdr"])

    x_columns = [ind_var_col_name] + cov_cols
    # The design matrix is the same for every outcome, so build it once.
    # Cast to float so that boolean dummy covariates do not turn the matrix
    # into an object array, which statsmodels rejects.
    X = sm.add_constant(all_df[x_columns].astype("float")) # constant is always added

    # Collect plain records and build the frame once instead of growing a
    # DataFrame column by column. Keyed by outcome so a repeated outcome name
    # yields a single row.
    rows = {}
    for y_column in y_cols:
        res = sm.OLS(all_df[y_column], X).fit()
        rows[y_column] = {
            "TCR": y_column,
            "coef": res.params[ind_var_col_name],
            "se": res.bse[ind_var_col_name],
            "pvalue": res.pvalues[ind_var_col_name],
            "tvalue": res.tvalues[ind_var_col_name],
        }

    res_df = pd.DataFrame(
        list(rows.values()), columns=["TCR", "coef", "se", "pvalue", "tvalue"]
    ).sort_values("pvalue", ascending=True)
    # FDR correction
    res_df['pvalue_fdr'] = fdrcorrection(res_df['pvalue'])[1]
    return res_df

In [4]:
def do_stepwise_ols(HLA_df, family_counts_df, cov_df, ind_var, cov_cols, want_families=None, fdr=True):
    """Repeatedly pick the family most associated with `ind_var`, then drop it.

    Each round recomputes usage ratios without the families selected so far,
    runs `do_ols` on every remaining family and takes the row with the
    smallest raw p-value.

    Stopping rule:
    * `want_families` empty/None: stop as soon as the top row's p-value
      (FDR-corrected if `fdr`, raw otherwise) exceeds 0.05; that row is not
      included.
    * `want_families` given: ignore significance and continue until every
      wanted family has been selected, or no families remain.

    Parameters
    ----------
    HLA_df, cov_df : pandas.DataFrame
        Joined on their index to form the predictor table passed to `do_ols`.
    family_counts_df : pandas.DataFrame
        Long-format counts accepted by `family_counts_df_to_usage_df`.
    ind_var : str
        Predictor column of interest.
    cov_cols : list of str
        Covariate columns included in every model.
    want_families : collection of str, optional
        Families whose results are required (see stopping rule).
    fdr : bool
        Whether the stopping rule uses ``pvalue_fdr`` instead of ``pvalue``.

    Returns
    -------
    pandas.DataFrame
        The selected rows in selection order, keeping the row labels they had
        in the `do_ols` result. ``coef`` is multiplied by 100.
    """
    wanted = set(want_families) if want_families is not None else set()

    pat_info_df = pd.merge(cov_df, HLA_df, right_index=True, left_index=True, how="inner")
    selected_rows = []
    drop_families = []  # families selected so far == families to exclude next round

    # need results for all families in `wanted` (when given)
    while not (wanted and wanted.issubset(drop_families)):
        _, usage_table = family_counts_df_to_usage_df(family_counts_df, drop_families=drop_families)
        tcr_columns = [x for x in usage_table.columns if x.startswith("TR")]
        if not tcr_columns:
            # every family has been consumed; nothing left to test
            break
        res_df = do_ols(pat_info_df, usage_table, ind_var, tcr_columns, cov_cols)
        top_row = res_df.iloc[0]
        pvalue = top_row['pvalue_fdr'] if fdr else top_row['pvalue']
        print(top_row['TCR'], pvalue)
        if not wanted and pvalue > 0.05: # get all rows until the top row is no longer significant
            break
        drop_families.append(top_row['TCR'])
        selected_rows.append(top_row)

    # Build the result once (DataFrame.append no longer exists in pandas 2).
    if selected_rows:
        df = pd.DataFrame(selected_rows).infer_objects()
    else:
        df = pd.DataFrame(columns=["TCR", "coef", "se", "pvalue", "tvalue"])

    # NOTE(review): only `coef` is rescaled; `se` stays on the original scale,
    # so the two columns are not directly comparable. Confirm this is intended.
    df['coef'] = df['coef']*100 # want the coef to be in percentages
    return df

In [5]:
# HLA table indexed by patient, binarised to 1.0 / 0.0 (value > 0 or not).
HLA_df = (
    pd.read_csv("DGN_HLA_df.csv")
    .set_index("patid")
    .gt(0)
    .astype("float")
)

# Long-format family counts, converted to usage ratios with no family excluded.
family_counts_df = pd.read_csv("DGN_family_counts_df.csv")
usage_df, usage_table = family_counts_df_to_usage_df(
    family_counts_df,
    drop_families=[],
)

In [7]:
# Save the wide usage table (one row per patid, one column per family) as CSV.
usage_table.to_csv("DGN_vdj_usages_table.csv")

In [10]:
# Display the wide usage table (pandas truncates the rendered rows/columns).
usage_table

family,TRAJ1,TRAJ10,TRAJ11,TRAJ12,TRAJ13,TRAJ14,TRAJ15,TRAJ16,TRAJ17,TRAJ18,...,TRBV6-9,TRBV7-1,TRBV7-2,TRBV7-3,TRBV7-4,TRBV7-6,TRBV7-7,TRBV7-8,TRBV7-9,TRBV9
patid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LD0001,0.001674,0.030962,0.010879,0.011715,0.022594,0.000000,0.016736,0.015063,0.018410,0.026778,...,0.000000,0.000107,0.061995,0.020650,0.001467,0.007848,0.003834,0.017484,0.053554,0.023116
LD0002,0.003399,0.028895,0.007932,0.015297,0.028329,0.000567,0.015297,0.011898,0.022096,0.017564,...,0.000241,0.000723,0.051981,0.025215,0.000841,0.009699,0.003156,0.018867,0.058738,0.030229
LD0003,0.004222,0.041520,0.009852,0.010556,0.021816,0.000000,0.022519,0.011260,0.018297,0.017593,...,0.000480,0.000397,0.053907,0.007447,0.002127,0.009227,0.004666,0.015779,0.063885,0.030974
LD0006,0.000546,0.021834,0.010917,0.014192,0.024563,0.000546,0.015830,0.015284,0.014738,0.025655,...,0.000379,0.000229,0.068104,0.014386,0.003442,0.008145,0.001910,0.017500,0.052322,0.018557
LD0007,0.002125,0.032412,0.009564,0.013815,0.023379,0.001063,0.024973,0.010627,0.016472,0.020723,...,0.000098,0.000680,0.079739,0.014004,0.001316,0.007673,0.001427,0.015487,0.055073,0.016142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LD1357,0.001005,0.030151,0.013065,0.018090,0.034171,0.001005,0.019095,0.008040,0.014070,0.018090,...,0.000170,0.000203,0.066791,0.022862,0.001054,0.010861,0.003113,0.017422,0.045393,0.032327
LD1361,0.002919,0.028021,0.015762,0.008757,0.030940,0.000584,0.023351,0.009924,0.017513,0.020432,...,0.000581,0.001454,0.049248,0.014647,0.000872,0.008465,0.001781,0.018135,0.060000,0.018176
LD1362,0.004488,0.024237,0.008977,0.020646,0.029623,0.000000,0.024237,0.013465,0.026032,0.017056,...,0.000228,0.000283,0.037762,0.012010,0.001198,0.008214,0.001566,0.017206,0.068910,0.018973
LD1364,0.001487,0.031227,0.009665,0.012639,0.023048,0.000000,0.017100,0.015613,0.024535,0.016357,...,0.000420,0.000715,0.079908,0.010327,0.001330,0.010239,0.010079,0.024352,0.059862,0.028590


In [35]:
# Test DQB1*03:01 against every family in the usage table, without covariates.
tcr_columns = usage_table.columns.tolist()
do_ols(
    HLA_df,
    usage_table,
    ind_var="DQB1*03:01",
    y_cols=tcr_columns,
    cov_cols=[],
)

doing 186 tests. out of 895 rows, 306 are DQB1*03:01 > 0


Unnamed: 0,TCR,coef,se,pvalue,tvalue,pvalue_fdr
125,TRBV10-3,1.742681e-03,0.000390,0.000009,4.470304,0.001640
68,TRAV13-1,3.225952e-03,0.000848,0.000151,3.805336,0.009423
75,TRAV2,-1.494047e-03,0.000393,0.000152,-3.804154,0.009423
58,TRAJ7,1.065851e-03,0.000336,0.001541,3.176725,0.071639
131,TRBV12-3,2.467902e-03,0.000823,0.002799,2.997278,0.093100
...,...,...,...,...,...,...
173,TRBV6-6,-5.243835e-06,0.000291,0.985635,-0.018009,0.998629
90,TRAV36/DV7,3.396931e-06,0.000243,0.988870,0.013954,0.998629
120,TRBJ2-6,7.784565e-07,0.000362,0.998285,0.002150,0.998629
41,TRAJ47,-8.436493e-07,0.000410,0.998358,-0.002059,0.998629


In [28]:
# Display the binarised HLA table (1.0 where the loaded value was > 0).
HLA_df

Unnamed: 0_level_0,DQA1*01:01,DQA1*01:02,DQA1*01:03,DQA1*01:04,DQA1*01:05,DQA1*02:01,DQA1*03:01,DQA1*03:02,DQA1*03:03,DQA1*04:01,...,DQB1*04:02,DQB1*05:01,DQB1*05:02,DQB1*05:03,DQB1*05:04,DQB1*06:01,DQB1*06:02,DQB1*06:03,DQB1*06:04,DQB1*06:09
patid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LD0014,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LD0041,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
LD0038,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
LD0084,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
LD0022,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LD1282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LD1271,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
LD1252,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
LD0165,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [6]:
# Save the wide usage table as CSV.
# NOTE(review): the same table is also written to "DGN_vdj_usages_table.csv"
# in an earlier cell — confirm both files are needed.
usage_table.to_csv("DGN_family_usage_table.csv")

In [27]:
# Display only the usage columns whose family name starts with "TRAV".
usage_table.loc[
    :,
    [family for family in usage_table.columns if family.startswith("TRAV")],
]

family,TRAV1-1,TRAV1-2,TRAV10,TRAV11,TRAV12-1,TRAV12-2,TRAV12-3,TRAV13-1,TRAV13-2,TRAV14/DV4,...,TRAV6,TRAV7,TRAV8-1,TRAV8-2,TRAV8-3,TRAV8-4,TRAV8-6,TRAV8-7,TRAV9-1,TRAV9-2
patid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LD0001,0.010295,0.021812,0.016314,0.000000,0.038181,0.063967,0.035478,0.057792,0.022909,0.032454,...,0.013537,0.000000,0.018570,0.014219,0.032107,0.037854,0.027241,0.000000,0.000000,0.070982
LD0002,0.007578,0.024917,0.012975,0.000000,0.046963,0.042815,0.028146,0.057412,0.023080,0.021472,...,0.008841,0.000000,0.011023,0.016775,0.026524,0.027740,0.029317,0.000000,0.000115,0.060512
LD0003,0.016812,0.029776,0.011825,0.000000,0.033908,0.081108,0.038567,0.062260,0.019234,0.037470,...,0.009118,0.000000,0.010400,0.025801,0.024932,0.026920,0.028060,0.000000,0.000000,0.047015
LD0006,0.013423,0.032797,0.013929,0.000000,0.038496,0.039528,0.031132,0.055591,0.026466,0.021401,...,0.016842,0.000000,0.015196,0.019957,0.026961,0.030132,0.027282,0.000138,0.000127,0.052172
LD0007,0.008063,0.021716,0.012255,0.000108,0.043754,0.038092,0.029420,0.074500,0.030101,0.039131,...,0.014190,0.000000,0.012255,0.018065,0.028704,0.031424,0.020604,0.000000,0.000000,0.074070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LD1357,0.017811,0.052623,0.016145,0.000000,0.038547,0.057069,0.033950,0.056710,0.024420,0.019778,...,0.007871,0.000000,0.014329,0.018187,0.022200,0.033542,0.032631,0.000000,0.000000,0.059536
LD1361,0.007767,0.015415,0.020005,0.000118,0.038715,0.034262,0.030224,0.076489,0.040715,0.030242,...,0.013415,0.000000,0.009296,0.016958,0.027506,0.032068,0.031345,0.000500,0.000000,0.068251
LD1362,0.010596,0.053520,0.009339,0.000180,0.047414,0.046343,0.036452,0.064835,0.024425,0.027838,...,0.014188,0.000000,0.011315,0.018723,0.026760,0.033850,0.029683,0.000000,0.000000,0.042744
LD1364,0.011364,0.023990,0.019413,0.000158,0.030934,0.054637,0.031221,0.052241,0.021780,0.025884,...,0.015309,0.000158,0.011364,0.010885,0.026179,0.031134,0.026005,0.000178,0.000000,0.044192


In [7]:
# Per-family mean and standard deviation of usage across patients.
# The numeric column is selected explicitly because `usage_df` also carries
# the string column `patid`, which pandas >= 2 refuses to average.
# NOTE(review): the result columns are named "usage_percent_*", but they hold
# whatever unit `usage_ratio` has when this cell runs. Another cell multiplies
# `usage_ratio` by 100 in place, so the unit depends on execution order.
mean_usages = (
    usage_df.groupby("family")[["usage_ratio"]]
    .mean()
    .rename(columns={"usage_ratio":"usage_percent_mean"})
    .reset_index()
)
std_usages = (
    usage_df.groupby("family")[["usage_ratio"]]
    .std()
    .rename(columns={"usage_ratio":"usage_percent_std"})
    .reset_index()
)
mean_std_usages = pd.merge(mean_usages, std_usages, on="family", how="inner")

In [8]:
# vdj_long.to_csv("DGN_usage_df_nopseudo.csv", index=None)
# vdj_table.to_csv("DGN_usage_table_nopseudo.csv")

In [9]:
# Convert usage from ratio to percent. This is NOT idempotent: re-running the
# cell multiplies by 100 again. `.assign` builds a new frame instead of
# writing into what may be a slice of another frame.
usage_df = usage_df.assign(usage_ratio=usage_df['usage_ratio']*100) # only do once!

# Covariates: make the lane identifier unique per flow cell ("<fcid>_<lane>"),
# then one-hot encode both. dtype=float keeps the dummies numeric; the
# boolean default of newer pandas cannot be fed to OLS alongside float columns.
cov_df = pd.read_csv("DGN_covariates_df.csv").set_index("patid")
cov_df["lane"] = cov_df["fcid"] + "_" + cov_df["lane"].astype("str") # covariates
cov_df = pd.get_dummies(cov_df, columns=['fcid', 'lane'], dtype=float)

# Predictor table: covariates joined with HLA status on the patid index.
pat_info_df = pd.merge(cov_df, HLA_df, right_index=True, left_index=True, how="inner")

In [10]:
# Outcome columns: every usage-table column whose name starts with "TR".
tcr_columns = [col for col in usage_table.columns if col.startswith("TR")]

# Covariate dummy columns, grouped by their name prefix.
fcid_columns = [col for col in cov_df.columns if col.startswith("fcid")]
lane_columns = [col for col in cov_df.columns if col.startswith("lane")]

In [11]:
# usage_path = "/home/ashteng/TCR_usages/DGN_vdj_usages_table.csv"
# vdj_table.to_csv(usage_path, index=None)

In [12]:
# Stepwise selection for DQB1*03:01 in the full cohort, restricted to the
# TRAV families, no covariates, stopping on the FDR-corrected p-value.
step_res_df_301_all = do_stepwise_ols(
    HLA_df,
    family_counts_df[family_counts_df['family'].str.startswith("TRAV")],
    cov_df,
    ind_var="DQB1*03:01",
    cov_cols=[],
    fdr=True,
)

doing 47 tests. out of 895 rows, 306 are DQB1*03:01 > 0


  return ptp(axis=axis, out=out, **kwargs)


TRAV13-1 0.0035714529432026272
doing 46 tests. out of 895 rows, 306 are DQB1*03:01 > 0
TRAV2 0.018241695588721296
doing 45 tests. out of 895 rows, 306 are DQB1*03:01 > 0
TRAV8-4 0.05629938972031897


In [13]:
# Display the families selected by the stepwise run (coef is scaled by 100).
step_res_df_301_all

Unnamed: 0,TCR,coef,se,pvalue,tvalue,pvalue_fdr
7,TRAV13-1,0.322595,0.000848,0.000151,3.805336,0.003571
13,TRAV2,-0.150786,0.000424,0.000397,-3.555845,0.018242


In [14]:

# Attach the per-family usage mean/std to each selected family.
step_res_df_301_all_expanded = step_res_df_301_all.merge(
    mean_std_usages,
    left_on="TCR",
    right_on="family",
    how="inner",
)

In [15]:
# Display the stepwise results together with the per-family usage mean/std.
step_res_df_301_all_expanded

Unnamed: 0,TCR,coef,se,pvalue,tvalue,pvalue_fdr,family,usage_percent_mean,usage_percent_std
0,TRAV13-1,0.322595,0.000848,0.000151,3.805336,0.003571,TRAV13-1,0.065558,0.012121
1,TRAV2,-0.150786,0.000424,0.000397,-3.555845,0.018242,TRAV2,0.024636,0.005615


In [17]:
# Single-pass OLS of DQB1*03:01 against every "TR*" family, no covariates.
res_df_301_all = do_ols(
    pat_info_df,
    usage_table,
    ind_var="DQB1*03:01",
    y_cols=tcr_columns,
    cov_cols=[],
)

doing 186 tests. out of 895 rows, 306 are DQB1*03:01 > 0


In [18]:
# Attach the per-family usage mean/std to the single-pass OLS results.
res_df_301_all_expanded = res_df_301_all.merge(
    mean_std_usages,
    left_on="TCR",
    right_on="family",
    how="inner",
)

In [19]:
# Show the first ten rows of the merged results (the OLS results were sorted by p-value).
res_df_301_all_expanded.head(10)

Unnamed: 0,TCR,coef,se,pvalue,tvalue,pvalue_fdr,family,usage_percent_mean,usage_percent_std
0,TRBV10-3,0.001743,0.00039,9e-06,4.470304,0.00164,TRBV10-3,0.021001,0.00559
1,TRAV13-1,0.003226,0.000848,0.000151,3.805336,0.009423,TRAV13-1,0.065558,0.012121
2,TRAV2,-0.001494,0.000393,0.000152,-3.804154,0.009423,TRAV2,0.024636,0.005615
3,TRAJ7,0.001066,0.000336,0.001541,3.176725,0.071639,TRAJ7,0.008898,0.004785
4,TRBV12-3,0.002468,0.000823,0.002799,2.997278,0.0931,TRBV12-3,0.031109,0.011736
5,TRAV16,-0.001092,0.000376,0.00377,-2.904462,0.0931,TRAV16,0.016834,0.005357
6,TRAV12-1,-0.002986,0.001029,0.003791,-2.902633,0.0931,TRAV12-1,0.041559,0.014658
7,TRAV8-4,0.001495,0.000525,0.004475,2.84979,0.0931,TRAV8-4,0.029759,0.007473
8,TRAJ48,-0.00137,0.000487,0.004976,-2.815621,0.0931,TRAJ48,0.021129,0.006933
9,TRAV8-3,-0.000976,0.000348,0.005137,-2.805248,0.0931,TRAV8-3,0.024638,0.004957


In [None]:
# Number of patients with a positive value for each column starting with "DQA".
HLA_df.loc[
    :,
    [allele for allele in HLA_df.columns if allele.startswith("DQA")],
].gt(0).astype("int").sum()

In [None]:
#top_TCR_families = set(step_res_df_301_all['TCR'].values)

In [None]:
# filt_HLA_df = HLA_df[HLA_df["HLA_DQA1_0101"] > 0.5]
# step_res_df_301_all_DQA10101 = do_stepwise_ols(filt_HLA_df, counts_df, "HLA_DQB1_0301", want_families=top_TCR_families, fdr=False)

In [None]:
# filt_HLA_df = HLA_df[HLA_df["HLA_DQA1_0102"] > 0.5]
# step_res_df_301_all_DQA10102 = do_stepwise_ols(filt_HLA_df, counts_df, "HLA_DQB1_0301", want_families=top_TCR_families, fdr=False)

In [None]:
# filt_HLA_df = HLA_df[HLA_df["HLA_DQA1_0501"] > 0.5]
# step_res_df_301_all_DQA10501 = do_stepwise_ols(filt_HLA_df, counts_df, "HLA_DQB1_0301", want_families=top_TCR_families, fdr=False)

In [None]:
# DQA_columns = [col for col in HLA_df.columns if col.startswith("DQA1")]
# DQB_columns = [col for col in HLA_df.columns if col.startswith("DQB1")]

# HLA_df[HLA_df["DQB1*03:01"] > 0][DQB_columns].sum()

# HLA_df[HLA_df["DQB1*03:01"] == 0][DQB_columns].sum()

In [21]:
# Keep patients carrying exactly one of DQB1*03:01 / DQB1*03:02 (exclusive or).
cond_A = HLA_df['DQB1*03:01'].gt(0)
cond_B = HLA_df['DQB1*03:02'].gt(0)
filt_HLA_df = HLA_df.loc[cond_A ^ cond_B]

# Stepwise selection within that subset, TRAV families only, no covariates.
step_res_df_301_302 = do_stepwise_ols(
    filt_HLA_df,
    family_counts_df[family_counts_df['family'].str.startswith("TRAV")],
    cov_df,
    ind_var="DQB1*03:01",
    cov_cols=[],
    fdr=True,
)
step_res_df_301_302_expanded = step_res_df_301_302.merge(
    mean_std_usages,
    left_on="TCR",
    right_on="family",
    how="inner",
)

doing 47 tests. out of 418 rows, 266 are DQB1*03:01 > 0
TRAV29/DV5 0.002814456345069334
doing 46 tests. out of 418 rows, 266 are DQB1*03:01 > 0
TRAV23/DV6 0.008048747383468039
doing 45 tests. out of 418 rows, 266 are DQB1*03:01 > 0
TRAV26-1 0.1338729315246694


In [None]:
# Display the 03:01-vs-03:02 stepwise results with the per-family usage mean/std.
step_res_df_301_302_expanded

In [None]:
# Stepwise selection for DQB1*03:01 with every "DQA*" column as a covariate.
# The counts frame loaded in this notebook is `family_counts_df`; the name
# `counts_df` used here before is never defined and raised NameError.
dqa_cols = [x for x in HLA_df.columns if x.startswith("DQA")]
step_res_df_301_all_cov_dqa = do_stepwise_ols(HLA_df, family_counts_df, cov_df, ind_var="DQB1*03:01", cov_cols=dqa_cols, fdr=True)

In [None]:
# Single-pass OLS of DQB1*03:01 adjusted for three DQA1 alleles.
# The usage table built in this notebook is `usage_table`; the name
# `vdj_table` used here before is never defined and raised NameError.
# NOTE(review): assumes all three DQA1 columns exist in pat_info_df — confirm.
res_df_301_all_cov_dqa = do_ols(
    pat_info_df,
    usage_table,
    ind_var="DQB1*03:01",
    y_cols=tcr_columns,
    cov_cols=['DQA1*05:05', 'DQA1*03:03', 'DQA1*06:01'],
)

In [None]:
# Display the DQA1-adjusted OLS results.
res_df_301_all_cov_dqa

In [None]:
# Combine the three cohort results into one table, keeping only the families
# present in all of them (inner joins on TCR).
# NOTE(review): `step_res_df_301_601` is not defined anywhere in the visible
# notebook; this cell fails on a fresh kernel unless it is computed first.
df1 = pd.merge(step_res_df_301_302, step_res_df_301_601, on="TCR", how="inner", suffixes=('_302', '_601'))
# Columns from step_res_df_301_all keep their plain names; those from df1
# already carry the _302 / _601 suffixes.
df2 = pd.merge(step_res_df_301_all, df1, on="TCR", how="inner")
res_df = df2[['TCR', 'coef', 'pvalue_fdr', 'coef_302', 'pvalue_302', 'coef_601', 'pvalue_601']]
# Two-level header: comparison name on top, statistic underneath.
# NOTE(review): the labels say "non-FDR" for the 03:02 / 06:01 p-values, and the
# selected columns are the raw `pvalue_*` ones — confirm that is the intent.
header = [np.array(["", "03:01 vs non-03:01", "03:01 vs non-03:01", "03:01 vs 03:02", "03:01 vs 03:02", "03:01 vs 06:01", "03:01 vs 06:01"]), 
np.array(["TCR", "coef", "p-value (FDR)", "coef", "p-value (non-FDR)", "coef", "p-value (non-FDR)"])] 
# After this assignment the flat names ('TCR', 'coef', ...) no longer exist on
# res_df; later cells that index res_df by those names will not find them.
res_df.columns = header

In [None]:
# Save the combined three-cohort table (with its two-level header) as CSV.
res_df.to_csv("3_group_coef_pvalues.csv")

In [None]:
# Heatmap of the DQB1*03:01 coefficient per TCR family (rows) and cohort
# (columns), annotated with the raw p-values.
# The data is read from `df2`, the merged frame with flat column names.
# `res_df` cannot be used here: its columns were replaced by a two-level
# header and it never contained a plain 'pvalue' column.
num_rows, num_cols = len(df2), 3
pvalues_mat = np.zeros((num_rows, num_cols))
coefs_mat = np.zeros((num_rows, num_cols))
tcrs = df2['TCR'].values

# Column suffix per cohort; the order must match the x tick labels below.
suffixes = ['', '_601', '_302']

for i, suffix in enumerate(suffixes):
    pvalues = df2['pvalue'+suffix].values
    coefs = df2['coef'+suffix].values
    pvalues_mat[:,i] = pvalues
    coefs_mat[:,i] = coefs

# signed_log_pvalues_mat = -np.log(pvalues_mat)*np.where(coefs_mat > 0, 1, -1)

# Height grows with the number of families; the floor of 2 avoids a
# zero-height figure when there is only one row.
fig_width = (pvalues_mat.shape[1]//2+3)*2
fig_height = max(2, (pvalues_mat.shape[0]//2)*2)
fig, (ax1) = plt.subplots(sharex=False, sharey=True, figsize=(fig_width, fig_height))
fig.tight_layout(pad=3.0)
ax1 = sns.heatmap(coefs_mat, cmap='seismic', center=0, cbar_kws={'label': "coef"}, ax=ax1, annot=pvalues_mat)
ax1.set_title("DQB1*03:01 status predicting TCR family expression")
ax1.set_yticklabels(tcrs, rotation=0)
ax1.set_xticklabels(['ALL', '03:01 XOR 06:01', '03:01 XOR 03:02'])
ax1.set_xlabel("Cohort")
ax1.set_ylabel("TCR family")
# Widen the y-limits by half a cell at both ends.
bottom, top = ax1.get_ylim()
ax1.set_ylim(bottom + 0.5, top - 0.5)

In [None]:
# Heatmap of the DQB1*03:01 coefficient per TCR family (rows) and cohort
# (columns), annotated with the raw p-values.
# NOTE(review): this cell repeats the preceding plotting cell — consider
# keeping only one of them.
# The data is read from `df2`, the merged frame with flat column names.
# `res_df` cannot be used here: its columns were replaced by a two-level
# header and it never contained a plain 'pvalue' column.
num_rows, num_cols = len(df2), 3
pvalues_mat = np.zeros((num_rows, num_cols))
coefs_mat = np.zeros((num_rows, num_cols))
tcrs = df2['TCR'].values

# Column suffix per cohort; the order must match the x tick labels below.
suffixes = ['', '_601', '_302']

for i, suffix in enumerate(suffixes):
    pvalues = df2['pvalue'+suffix].values
    coefs = df2['coef'+suffix].values
    pvalues_mat[:,i] = pvalues
    coefs_mat[:,i] = coefs

# signed_log_pvalues_mat = -np.log(pvalues_mat)*np.where(coefs_mat > 0, 1, -1)

# Height grows with the number of families; the floor of 2 avoids a
# zero-height figure when there is only one row.
fig_width = (pvalues_mat.shape[1]//2+3)*2
fig_height = max(2, (pvalues_mat.shape[0]//2)*2)
fig, (ax1) = plt.subplots(sharex=False, sharey=True, figsize=(fig_width, fig_height))
fig.tight_layout(pad=3.0)
ax1 = sns.heatmap(coefs_mat, cmap='seismic', center=0, cbar_kws={'label': "coef"}, ax=ax1, annot=pvalues_mat)
ax1.set_title("DQB1*03:01 status predicting TCR family expression")
ax1.set_yticklabels(tcrs, rotation=0)
ax1.set_xticklabels(['ALL', '03:01 XOR 06:01', '03:01 XOR 03:02'])
ax1.set_xlabel("Cohort")
ax1.set_ylabel("TCR family")
# Widen the y-limits by half a cell at both ends.
bottom, top = ax1.get_ylim()
ax1.set_ylim(bottom + 0.5, top - 0.5)