In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import statsmodels.api as sm
from statsmodels.stats.multitest import fdrcorrection

In [1]:
# family counts to usage
def family_counts_df_to_usage_df(family_counts_df, drop_families=None):
    """Convert per-patient TCR family counts into per-segment usage ratios.

    Parameters
    ----------
    family_counts_df : pandas.DataFrame
        Long-format table with at least the columns ``patid``, ``family``
        (e.g. ``"TRAJ1"``) and ``count``. Several rows may share one
        (patid, family) pair; their counts are added together. Any other
        columns are ignored.
    drop_families : list-like of str, optional
        Families removed *before* totals are computed, so they do not
        contribute to the per-segment denominator.

    Returns
    -------
    usage_df : pandas.DataFrame
        Long format with columns ``patid``, ``family``, ``usage_ratio``, where
        ``usage_ratio = family_count / segment_count`` and the segment is
        characters 2-3 of the family name (``"AJ"`` for ``"TRAJ1"``).
    usage_table : pandas.DataFrame
        Wide format: one row per patient, ``patid`` as an ordinary column,
        one column per family (sorted by name), missing combinations set to 0.

    Notes
    -----
    Earlier versions of this cell carried two filters that are disabled here:
    restricting families to those listed in ``valid_TCR_genes.txt`` (plus
    ``TRAV8-5``) and restricting patients to the columns of
    ``gene_counts.tsv``. The second one also indexed the wide table by
    ``patid``; as written, ``patid`` is a column, so code that joins on the
    index (``do_ols`` does) needs ``usage_table.set_index("patid")`` first.
    """
    # Drop unwanted families first so they are excluded from the totals.
    if drop_families is not None and len(drop_families) > 0:
        family_counts_df = family_counts_df[~family_counts_df["family"].isin(drop_families)]

    # Total count per patient and family (sub-type rows are added together).
    # The value column is selected explicitly: a bare ``groupby(...).sum()``
    # aggregates every remaining column, and recent pandas versions no longer
    # silently discard string columns when doing so.
    family_totals = (
        family_counts_df
        .groupby(["patid", "family"], as_index=False)["count"]
        .sum()
        .rename(columns={"count": "family_count"})
    )

    # Segment = "AJ", "AV", "BJ", "BV", ... taken out of the family name.
    family_totals["segment"] = family_totals["family"].str.slice(start=2, stop=4)

    # Per-patient, per-segment denominator, broadcast back onto each family row.
    family_totals["segment_count"] = (
        family_totals.groupby(["patid", "segment"])["family_count"].transform("sum")
    )
    family_totals["usage_ratio"] = family_totals["family_count"] / family_totals["segment_count"]

    # Copy so that callers can modify the long table without pandas warning
    # about writing into a slice of an intermediate frame.
    usage_df = family_totals[["patid", "family", "usage_ratio"]].copy()

    # Wide format: patients x families, absent families count as zero usage.
    usage_table = usage_df.pivot(index="patid", columns="family", values="usage_ratio").fillna(0)
    usage_table = usage_table.reindex(sorted(usage_table.columns), axis=1)
    usage_table = usage_table.reset_index()

    return usage_df, usage_table

In [2]:
def do_ols(pat_info_df, usage_table, ind_var, y_cols, cov_cols):
    """Regress each outcome column on ``ind_var`` (plus covariates) with OLS.

    Parameters
    ----------
    pat_info_df : pandas.DataFrame
        Per-patient predictors; must contain ``ind_var`` and every column in
        ``cov_cols``. Joined to ``usage_table`` on the index.
    usage_table : pandas.DataFrame
        Per-patient outcomes; must contain every column in ``y_cols`` and
        share its index with ``pat_info_df``.
    ind_var : str
        Independent variable of interest, e.g. ``'DQB1*03:01'``.
    y_cols : sequence of str
        Outcome columns; one model is fitted per entry.
    cov_cols : sequence of str
        Additional covariate columns included in every model.

    Returns
    -------
    pandas.DataFrame
        One row per outcome with columns ``TCR``, ``coef``, ``se``,
        ``pvalue``, ``tvalue`` (all for the ``ind_var`` term) and
        ``pvalue_fdr``, sorted by ascending ``pvalue``. The index keeps each
        outcome's position in ``y_cols``. Empty (columns only) when
        ``y_cols`` is empty.
    """
    all_df = pd.merge(pat_info_df, usage_table, right_index=True, left_index=True, how="inner")

    print("doing {} tests. out of {} rows, {} are {} > 0".format(len(y_cols), len(all_df), len(all_df[all_df[ind_var]>0]), ind_var))

    result_columns = ["TCR", "coef", "se", "pvalue", "tvalue"]
    if len(y_cols) == 0:
        # Nothing to fit: return a well-formed empty result instead of failing
        # later when sorting on a 'pvalue' column that does not exist.
        return pd.DataFrame(columns=result_columns + ["pvalue_fdr"])

    # The design matrix is identical for every outcome, so build it once.
    # NOTE(review): with statsmodels' default settings add_constant does not
    # add an intercept when X already holds a constant column (possible for a
    # 0/1 covariate in a filtered cohort) - confirm that is acceptable.
    X = sm.add_constant(all_df[[ind_var] + list(cov_cols)])

    rows = []
    for y_column in y_cols:
        res = sm.OLS(all_df[y_column], X).fit()
        # Keep only the statistics of the term of interest.
        rows.append({
            "TCR": y_column,
            "coef": res.params[ind_var],
            "se": res.bse[ind_var],
            "pvalue": res.pvalues[ind_var],
            "tvalue": res.tvalues[ind_var],
        })

    res_df = pd.DataFrame(rows, columns=result_columns).sort_values("pvalue", ascending=True)
    # FDR correction across all outcomes tested in this call; index [1] of the
    # returned tuple holds the adjusted p-values.
    res_df['pvalue_fdr'] = fdrcorrection(res_df['pvalue'])[1]
    return res_df

In [5]:
def do_stepwise_ols(HLA_df, counts_df, cov_df, ind_var, cov_cols, want_families=None, fdr=True, alpha=0.05):
    """Repeatedly pick the most significant TCR family, drop it, and re-test.

    Each round recomputes usage without the families selected so far, runs
    ``do_ols`` on the remaining ``TR*`` columns and takes the row with the
    smallest p-value.

    Parameters
    ----------
    HLA_df, cov_df : pandas.DataFrame
        Per-patient tables joined on their index to form the predictors.
    counts_df : pandas.DataFrame
        Raw counts handed to ``counts_to_usage_df`` on every round.
    ind_var : str
        Independent variable passed to ``do_ols``.
    cov_cols : list of str
        Covariate columns passed to ``do_ols``.
    want_families : iterable of str, optional
        If given and non-empty, iteration continues (ignoring significance)
        until every listed family has been selected.
    fdr : bool
        Use the FDR-adjusted p-value (True) or the raw one (False) for the
        reported value and the stopping rule.
    alpha : float
        Significance threshold for the stopping rule when ``want_families``
        is empty.

    Returns
    -------
    pandas.DataFrame
        The selected top rows in selection order, with ``coef`` multiplied by
        100. Has only the columns ``TCR``, ``coef``, ``se``, ``pvalue``,
        ``tvalue`` (and no rows) when nothing was selected.
    """
    # Normalise to a set so that lists/tuples compare correctly below.
    wanted = set(want_families) if want_families is not None else set()

    pat_info_df = pd.merge(cov_df, HLA_df, right_index=True, left_index=True, how="inner")
    drop_families = []  # families selected so far, in order
    top_rows = []       # one single-row frame per selected family

    def is_continue():
        if len(wanted) > 0: # need results for all families in want_families
            found = wanted.intersection(drop_families)
            print(found, wanted)
            if found == wanted:
                return False
        return True

    while is_continue():
        # NOTE(review): counts_to_usage_df is not defined in the visible part
        # of this notebook; it must already exist in the kernel and return a
        # usage table indexed by patient id.
        _, usage_table = counts_to_usage_df(counts_df, drop_families=drop_families)
        # Use the table computed for THIS round, so already dropped families
        # are not requested as outcomes again.
        tcr_columns = [x for x in usage_table.columns if x.startswith("TR")]
        if len(tcr_columns) == 0:
            # Every family has been consumed; nothing left to test.
            break
        res_df = do_ols(pat_info_df, usage_table, ind_var, tcr_columns, cov_cols)
        top_row = res_df.iloc[0]
        pvalue = top_row['pvalue_fdr'] if fdr else top_row['pvalue']
        print(top_row['TCR'], pvalue)
        if len(wanted) == 0 and pvalue > alpha:
            # get all rows until the top row is no longer significant
            break
        drop_families.append(top_row['TCR'])
        top_rows.append(res_df.iloc[[0]])

    if top_rows:
        # Concatenate once at the end instead of appending row by row.
        df = pd.concat(top_rows)
    else:
        df = pd.DataFrame(columns=["TCR", "coef", "se", "pvalue", "tvalue"])

    df['coef'] = df['coef']*100 # want the coef to be in percentages
    return df

In [16]:
# HLA table indexed by patid, recoded so that every value > 0 becomes 1.0 and
# everything else 0.0.
HLA_df = (
    pd.read_csv("DGN_HLA_df.csv")
    .set_index("patid")
    .gt(0)
    .astype("float")
)

# NOTE(review): counts_df is never assigned in the visible notebook - the only
# loader is the commented-out code below - and counts_to_usage_df is not
# defined here either, so this cell depends on leftover kernel state.
# counts_df = pd.read_csv("DGN_transcript_counts_df.csv", index_col=0, low_memory=False)
# counts_df = counts_df[(counts_df['transcript'].str.startswith("TR"))]
#counts_df = counts_df[(counts_df['transcript'].str.startswith("TRAV")) | (counts_df['transcript'].str.startswith("TRBV"))]
vdj_long, vdj_table = counts_to_usage_df(counts_df, drop_families=[])

KeyError: 'transcript'

In [124]:
# Per-family mean and standard deviation of usage across all rows of vdj_long.
# usage_ratio is selected explicitly so that other columns (e.g. patid) are
# never aggregated; recent pandas versions raise on non-numeric columns
# instead of silently dropping them.
# NOTE: the *_percent_* names are only accurate if usage_ratio has already
# been multiplied by 100 (done in a later cell) when this cell runs.
mean_usages = vdj_long.groupby("family")["usage_ratio"].mean().rename("usage_percent_mean").reset_index()
std_usages = vdj_long.groupby("family")["usage_ratio"].std().rename("usage_percent_std").reset_index()
mean_std_usages = pd.merge(mean_usages, std_usages, on="family", how="inner")

In [125]:
# vdj_long.to_csv("DGN_usage_df_nopseudo.csv", index=None)
# vdj_table.to_csv("DGN_usage_table_nopseudo.csv")

In [126]:
# WARNING: this rescales vdj_long in place, so re-running the cell multiplies
# by 100 again. Run it exactly once per freshly computed vdj_long.
vdj_long['usage_ratio'] = vdj_long['usage_ratio']*100 # only do once!

cov_df = pd.read_csv("DGN_covariates_df.csv").set_index("patid")
# Make lane identifiers unique across flow cells: "<fcid>_<lane>".
cov_df["lane"] = cov_df["fcid"] + "_" + cov_df["lane"].astype("str") # covariates
# One-hot encode as floats: newer pandas versions emit bool dummies by default,
# which turn the regression design matrix into an object array when mixed
# with the float HLA columns.
cov_df = pd.get_dummies(cov_df, columns=['fcid', 'lane'], dtype=float)
pat_info_df = pd.merge(cov_df, HLA_df, right_index=True, left_index=True, how="inner")

In [127]:
# Column-name groups, selected by prefix: regression outcomes (TR*) from the
# usage table, flow-cell (fcid*) and lane (lane*) columns from the covariates.
tcr_columns = list(filter(lambda name: name.startswith("TR"), vdj_table.columns))
fcid_columns = list(filter(lambda name: name.startswith("fcid"), cov_df.columns))
lane_columns = list(filter(lambda name: name.startswith("lane"), cov_df.columns))

In [114]:
# usage_path = "/home/ashteng/TCR_usages/DGN_vdj_usages_table.csv"
# vdj_table.to_csv(usage_path, index=None)

In [93]:
# Stepwise regression of TCR usage on DQB1*03:01 with no covariates, using
# FDR-adjusted p-values for the stopping rule.
step_res_df_301_all = do_stepwise_ols(HLA_df, counts_df, cov_df, ind_var="DQB1*03:01", cov_cols=[], fdr=True)

doing 159 tests. out of 895 rows, 306 are DQB1*03:01 > 0
TRAV13-1 1.8456030487329914e-06
doing 158 tests. out of 895 rows, 306 are DQB1*03:01 > 0
TRAV8-5 1.894205159356222e-05
doing 157 tests. out of 895 rows, 306 are DQB1*03:01 > 0
TRBV10-3 0.0011034436174010526
doing 156 tests. out of 895 rows, 306 are DQB1*03:01 > 0
TRAV2 0.06289516224761228


In [94]:
step_res_df_301_all

Unnamed: 0,TCR,coef,se,pvalue,tvalue,pvalue_fdr
60,TRAV13-1,0.414876,0.00072,1.160757e-08,5.759419,2e-06
95,TRAV8-5,-2.194896,0.004113,1.198864e-07,-5.336985,1.9e-05
113,TRBV10-3,0.185111,0.00041,7.028303e-06,4.519616,0.001103


In [95]:

# Attach each selected family's mean/std usage to the stepwise results.
step_res_df_301_all_expanded = step_res_df_301_all.merge(
    mean_std_usages,
    how="inner",
    left_on="TCR",
    right_on="family",
)

In [96]:
step_res_df_301_all_expanded

Unnamed: 0,TCR,coef,se,pvalue,tvalue,pvalue_fdr,family,usage_percent_mean,usage_percent_std
0,TRAV13-1,0.414876,0.00072,1.160757e-08,5.759419,2e-06,TRAV13-1,5.393459,1.054466
1,TRAV8-5,-2.194896,0.004113,1.198864e-07,-5.336985,1.9e-05,TRAV8-5,17.985501,6.594707
2,TRBV10-3,0.185111,0.00041,7.028303e-06,4.519616,0.001103,TRBV10-3,2.183421,0.58953


In [128]:
# Single-pass (non-stepwise) OLS of every TCR usage column on DQB1*03:01 with
# no covariates.
res_df_301_all = do_ols(pat_info_df, vdj_table, ind_var="DQB1*03:01", y_cols=tcr_columns, cov_cols=[])

doing 205 tests. out of 895 rows, 306 are DQB1*03:01 > 0


In [129]:
# Attach each family's mean/std usage to the single-pass results.
res_df_301_all_expanded = res_df_301_all.merge(
    mean_std_usages,
    how="inner",
    left_on="TCR",
    right_on="family",
)

In [130]:
res_df_301_all_expanded.head(10)

Unnamed: 0,TCR,coef,se,pvalue,tvalue,pvalue_fdr,family,usage_percent_mean,usage_percent_std
0,TRAV13-1,0.004144,0.00072,1.176778e-08,5.757018,2e-06,TRAV13-1,0.0539,0.010539
1,TRAV8-5,-0.021526,0.003945,6.295699e-08,-5.456378,6e-06,TRAV8-5,0.179731,0.06583
2,TRBV10-3,0.001753,0.000386,6.407027e-06,4.539622,0.000438,TRBV10-3,0.020613,0.005558
3,TRAV8-4,0.001788,0.000451,7.943084e-05,3.964494,0.004071,TRAV8-4,0.029961,0.006522
4,TRAV20,0.001341,0.000374,0.0003495513,3.589352,0.014332,TRAV20,0.015075,0.005323
5,TRAV17,0.001796,0.000521,0.0005954209,3.446027,0.020344,TRAV17,0.024982,0.007902
6,TRAJ7,0.001064,0.000336,0.00157597,3.170067,0.044612,TRAJ7,0.00912,0.004869
7,TRBV12-3,0.000856,0.000273,0.001751016,3.138987,0.044612,TRBV12-3,0.010844,0.003919
8,TRAV23/DV6,0.003243,0.001044,0.001958559,3.105649,0.044612,TRAV23/DV6,0.020505,0.014751
9,TRBV12-4,0.002808,0.000957,0.003426502,2.934492,0.070243,TRBV12-4,0.051195,0.013557


In [75]:
# Number of rows with a value > 0 in each column whose name starts with "DQA".
HLA_df.loc[:, [col for col in HLA_df.columns if col.startswith("DQA")]].gt(0).astype("int").sum()

DQA1*01:01    182
DQA1*01:02    300
DQA1*01:03    105
DQA1*01:04     47
DQA1*01:05     14
DQA1*02:01    250
DQA1*03:01    185
DQA1*03:02     12
DQA1*03:03    119
DQA1*04:01     40
DQA1*05:01    187
DQA1*05:03      2
DQA1*05:05    214
DQA1*05:09      2
DQA1*06:01      7
dtype: int64

In [26]:
#top_TCR_families = set(step_res_df_301_all['TCR'].values)

In [None]:
# filt_HLA_df = HLA_df[HLA_df["HLA_DQA1_0101"] > 0.5]
# step_res_df_301_all_DQA10101 = do_stepwise_ols(filt_HLA_df, counts_df, "HLA_DQB1_0301", want_families=top_TCR_families, fdr=False)

In [None]:
# filt_HLA_df = HLA_df[HLA_df["HLA_DQA1_0102"] > 0.5]
# step_res_df_301_all_DQA10102 = do_stepwise_ols(filt_HLA_df, counts_df, "HLA_DQB1_0301", want_families=top_TCR_families, fdr=False)

In [None]:
# filt_HLA_df = HLA_df[HLA_df["HLA_DQA1_0501"] > 0.5]
# step_res_df_301_all_DQA10501 = do_stepwise_ols(filt_HLA_df, counts_df, "HLA_DQB1_0301", want_families=top_TCR_families, fdr=False)

In [None]:
# DQA_columns = [col for col in HLA_df.columns if col.startswith("DQA1")]
# DQB_columns = [col for col in HLA_df.columns if col.startswith("DQB1")]

# HLA_df[HLA_df["DQB1*03:01"] > 0][DQB_columns].sum()

# HLA_df[HLA_df["DQB1*03:01"] == 0][DQB_columns].sum()

In [133]:
# Restrict to patients positive for exactly one of DQB1*03:01 / DQB1*03:02
# (exclusive or), then rerun the stepwise regression on that subset.
cond_A = HLA_df['DQB1*03:01'] > 0
cond_B = HLA_df['DQB1*03:02'] > 0
filt_HLA_df = HLA_df[cond_A ^ cond_B]
step_res_df_301_302 = do_stepwise_ols(
    filt_HLA_df, counts_df, cov_df,
    ind_var="DQB1*03:01", cov_cols=[], fdr=True,
)
step_res_df_301_302_expanded = step_res_df_301_302.merge(
    mean_std_usages, how="inner", left_on="TCR", right_on="family",
)

doing 91 tests. out of 418 rows, 266 are DQB1*03:01 > 0
TRBV20-1 0.0008037224697827661
doing 90 tests. out of 418 rows, 266 are DQB1*03:01 > 0
TRBV15 0.0005788162382212386
doing 89 tests. out of 418 rows, 266 are DQB1*03:01 > 0
TRAV29/DV5 0.005443220023340737
doing 88 tests. out of 418 rows, 266 are DQB1*03:01 > 0
TRAV23/DV6 0.014942410435343074
doing 87 tests. out of 418 rows, 266 are DQB1*03:01 > 0
TRBV5-6 0.20995037228756946


In [134]:
step_res_df_301_302_expanded

Unnamed: 0,TCR,coef,se,pvalue,tvalue,pvalue_fdr,family,usage_percent_mean,usage_percent_std
0,TRBV20-1,0.704269,0.001565,9e-06,4.499977,0.000804,TRBV20-1,7.704853,1.705944
1,TRBV15,0.189745,0.000415,6e-06,4.57025,0.000579,TRBV15,1.326454,0.676415
2,TRAV29/DV5,-0.310851,0.000768,6.1e-05,-4.049778,0.005443,TRAV29/DV5,3.493069,0.80418
3,TRAV23/DV6,0.904373,0.002383,0.00017,3.794656,0.014942,TRAV23/DV6,2.501602,1.673355


In [176]:
# Stepwise regression on DQB1*03:01 with every DQA* column of HLA_df included
# as a covariate.
dqa_cols = list(filter(lambda name: name.startswith("DQA"), HLA_df.columns))
step_res_df_301_all_cov_dqa = do_stepwise_ols(
    HLA_df, counts_df, cov_df,
    ind_var="DQB1*03:01", cov_cols=dqa_cols, fdr=True,
)

doing 91 tests. out of 895 rows, 306 are DQB1*03:01 > 0
                            OLS Regression Results                            
Dep. Variable:                TRAV1-1   R-squared:                       0.029
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     1.618
Date:                Sun, 21 Jun 2020   Prob (F-statistic):             0.0581
Time:                        21:03:52   Log-Likelihood:                 3539.5
No. Observations:                 895   AIC:                            -7045.
Df Residuals:                     878   BIC:                            -6963.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------

                            OLS Regression Results                            
Dep. Variable:             TRAV14/DV4   R-squared:                       0.027
Model:                            OLS   Adj. R-squared:                  0.010
Method:                 Least Squares   F-statistic:                     1.551
Date:                Sun, 21 Jun 2020   Prob (F-statistic):             0.0758
Time:                        21:03:52   Log-Likelihood:                 2663.7
No. Observations:                 895   AIC:                            -5293.
Df Residuals:                     878   BIC:                            -5212.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0303      0.002     13.158      0.0

                            OLS Regression Results                            
Dep. Variable:                  TRAV2   R-squared:                       0.058
Model:                            OLS   Adj. R-squared:                  0.041
Method:                 Least Squares   F-statistic:                     3.394
Date:                Sun, 21 Jun 2020   Prob (F-statistic):           7.25e-06
Time:                        21:03:52   Log-Likelihood:                 3393.4
No. Observations:                 895   AIC:                            -6753.
Df Residuals:                     878   BIC:                            -6671.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0258      0.001     25.324      0.0

                            OLS Regression Results                            
Dep. Variable:                 TRAV24   R-squared:                       0.016
Model:                            OLS   Adj. R-squared:                 -0.002
Method:                 Least Squares   F-statistic:                    0.9129
Date:                Sun, 21 Jun 2020   Prob (F-statistic):              0.554
Time:                        21:03:53   Log-Likelihood:                 3108.2
No. Observations:                 895   AIC:                            -6182.
Df Residuals:                     878   BIC:                            -6101.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0108      0.001      7.703      0.0

                            OLS Regression Results                            
Dep. Variable:             TRAV29/DV5   R-squared:                       0.044
Model:                            OLS   Adj. R-squared:                  0.026
Method:                 Least Squares   F-statistic:                     2.501
Date:                Sun, 21 Jun 2020   Prob (F-statistic):           0.000940
Time:                        21:03:53   Log-Likelihood:                 3079.3
No. Observations:                 895   AIC:                            -6125.
Df Residuals:                     878   BIC:                            -6043.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0339      0.001     23.415      0.0

                            OLS Regression Results                            
Dep. Variable:             TRAV36/DV7   R-squared:                       0.042
Model:                            OLS   Adj. R-squared:                  0.024
Method:                 Least Squares   F-statistic:                     2.383
Date:                Sun, 21 Jun 2020   Prob (F-statistic):            0.00171
Time:                        21:03:53   Log-Likelihood:                 3820.5
No. Observations:                 895   AIC:                            -7607.
Df Residuals:                     878   BIC:                            -7526.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0109      0.001     17.157      0.0

                            OLS Regression Results                            
Dep. Variable:                 TRAV40   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                 -0.012
Method:                 Least Squares   F-statistic:                    0.3193
Date:                Sun, 21 Jun 2020   Prob (F-statistic):              0.995
Time:                        21:03:53   Log-Likelihood:                 4605.1
No. Observations:                 895   AIC:                            -9176.
Df Residuals:                     878   BIC:                            -9095.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0019      0.000      7.364      0.0

                            OLS Regression Results                            
Dep. Variable:                TRAV8-1   R-squared:                       0.049
Model:                            OLS   Adj. R-squared:                  0.032
Method:                 Least Squares   F-statistic:                     2.826
Date:                Sun, 21 Jun 2020   Prob (F-statistic):           0.000169
Time:                        21:03:53   Log-Likelihood:                 3789.3
No. Observations:                 895   AIC:                            -7545.
Df Residuals:                     878   BIC:                            -7463.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0134      0.001     20.505      0.0

                            OLS Regression Results                            
Dep. Variable:                TRAV9-1   R-squared:                       0.022
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     1.248
Date:                Sun, 21 Jun 2020   Prob (F-statistic):              0.225
Time:                        21:03:54   Log-Likelihood:                 6044.4
No. Observations:                 895   AIC:                        -1.205e+04
Df Residuals:                     878   BIC:                        -1.197e+04
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0003   5.28e-05      5.882      0.0

                            OLS Regression Results                            
Dep. Variable:               TRBV11-1   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                 -0.011
Method:                 Least Squares   F-statistic:                    0.4131
Date:                Sun, 21 Jun 2020   Prob (F-statistic):              0.980
Time:                        21:03:54   Log-Likelihood:                 3558.7
No. Observations:                 895   AIC:                            -7083.
Df Residuals:                     878   BIC:                            -7002.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0035      0.001      4.148      0.0

                            OLS Regression Results                            
Dep. Variable:                 TRBV13   R-squared:                       0.020
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     1.103
Date:                Sun, 21 Jun 2020   Prob (F-statistic):              0.347
Time:                        21:03:54   Log-Likelihood:                 2984.0
No. Observations:                 895   AIC:                            -5934.
Df Residuals:                     878   BIC:                            -5852.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0085      0.002      5.302      0.0

                            OLS Regression Results                            
Dep. Variable:                  TRBV2   R-squared:                       0.029
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     1.633
Date:                Sun, 21 Jun 2020   Prob (F-statistic):             0.0547
Time:                        21:03:54   Log-Likelihood:                 3100.5
No. Observations:                 895   AIC:                            -6167.
Df Residuals:                     878   BIC:                            -6085.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0290      0.001     20.518      0.0

                            OLS Regression Results                            
Dep. Variable:                 TRBV28   R-squared:                       0.017
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.9607
Date:                Sun, 21 Jun 2020   Prob (F-statistic):              0.499
Time:                        21:03:55   Log-Likelihood:                 1951.9
No. Observations:                 895   AIC:                            -3870.
Df Residuals:                     878   BIC:                            -3788.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0376      0.005      7.359      0.0

                            OLS Regression Results                            
Dep. Variable:                TRBV4-2   R-squared:                       0.031
Model:                            OLS   Adj. R-squared:                  0.013
Method:                 Least Squares   F-statistic:                     1.733
Date:                Sun, 21 Jun 2020   Prob (F-statistic):             0.0360
Time:                        21:03:55   Log-Likelihood:                 4185.1
No. Observations:                 895   AIC:                            -8336.
Df Residuals:                     878   BIC:                            -8255.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0055      0.000     13.056      0.0

                            OLS Regression Results                            
Dep. Variable:                TRBV5-6   R-squared:                       0.050
Model:                            OLS   Adj. R-squared:                  0.032
Method:                 Least Squares   F-statistic:                     2.875
Date:                Sun, 21 Jun 2020   Prob (F-statistic):           0.000130
Time:                        21:03:55   Log-Likelihood:                 3656.5
No. Observations:                 895   AIC:                            -7279.
Df Residuals:                     878   BIC:                            -7197.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0167      0.001     22.006      0.0

                            OLS Regression Results                            
Dep. Variable:                TRBV6-4   R-squared:                       0.023
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     1.297
Date:                Sun, 21 Jun 2020   Prob (F-statistic):              0.192
Time:                        21:03:55   Log-Likelihood:                 3563.7
No. Observations:                 895   AIC:                            -7093.
Df Residuals:                     878   BIC:                            -7012.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0047      0.001      5.530      0.0

                            OLS Regression Results                            
Dep. Variable:                TRBV7-3   R-squared:                       0.020
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     1.101
Date:                Sun, 21 Jun 2020   Prob (F-statistic):              0.349
Time:                        21:03:55   Log-Likelihood:                 2947.6
No. Observations:                 895   AIC:                            -5861.
Df Residuals:                     878   BIC:                            -5780.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0219      0.002     13.023      0.0

                            OLS Regression Results                            
Dep. Variable:                  TRBV9   R-squared:                       0.023
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     1.274
Date:                Sun, 21 Jun 2020   Prob (F-statistic):              0.207
Time:                        21:03:56   Log-Likelihood:                 2796.7
No. Observations:                 895   AIC:                            -5559.
Df Residuals:                     878   BIC:                            -5478.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0247      0.002     12.424      0.0

In [186]:
# Single-pass OLS of every TCR usage column on DQB1*03:01, with three DQA1
# allele columns included as covariates.
res_df_301_all_cov_dqa = do_ols(pat_info_df, vdj_table, ind_var="DQB1*03:01", y_cols=tcr_columns, cov_cols=['DQA1*05:05', 'DQA1*03:03', 'DQA1*06:01'])

doing 91 tests. out of 895 rows, 306 are DQB1*03:01 > 0
                            OLS Regression Results                            
Dep. Variable:                TRAV1-1   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     1.601
Date:                Sun, 21 Jun 2020   Prob (F-statistic):              0.172
Time:                        21:17:26   Log-Likelihood:                 3529.7
No. Observations:                 895   AIC:                            -7049.
Df Residuals:                     890   BIC:                            -7025.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------

                            OLS Regression Results                            
Dep. Variable:               TRAV12-3   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.004
Method:                 Least Squares   F-statistic:                   0.09220
Date:                Sun, 21 Jun 2020   Prob (F-statistic):              0.985
Time:                        21:17:27   Log-Likelihood:                 2853.9
No. Observations:                 895   AIC:                            -5698.
Df Residuals:                     890   BIC:                            -5674.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0366      0.000     88.395      0.0

                            OLS Regression Results                            
Dep. Variable:                 TRAV21   R-squared:                       0.018
Model:                            OLS   Adj. R-squared:                  0.013
Method:                 Least Squares   F-statistic:                     4.015
Date:                Sun, 21 Jun 2020   Prob (F-statistic):            0.00311
Time:                        21:17:27   Log-Likelihood:                 2789.7
No. Observations:                 895   AIC:                            -5569.
Df Residuals:                     890   BIC:                            -5545.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0424      0.000     95.300      0.0

                            OLS Regression Results                            
Dep. Variable:                 TRAV30   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                 -0.002
Method:                 Least Squares   F-statistic:                    0.6199
Date:                Sun, 21 Jun 2020   Prob (F-statistic):              0.648
Time:                        21:17:27   Log-Likelihood:                 3935.4
No. Observations:                 895   AIC:                            -7861.
Df Residuals:                     890   BIC:                            -7837.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0061      0.000     49.659      0.0

                            OLS Regression Results                            
Dep. Variable:                  TRAV5   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.9452
Date:                Sun, 21 Jun 2020   Prob (F-statistic):              0.437
Time:                        21:17:27   Log-Likelihood:                 3497.6
No. Observations:                 895   AIC:                            -6985.
Df Residuals:                     890   BIC:                            -6961.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0151      0.000     75.074      0.0

                            OLS Regression Results                            
Dep. Variable:               TRBV10-1   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.004
Method:                 Least Squares   F-statistic:                    0.2090
Date:                Sun, 21 Jun 2020   Prob (F-statistic):              0.934
Time:                        21:17:27   Log-Likelihood:                 4103.5
No. Observations:                 895   AIC:                            -8197.
Df Residuals:                     890   BIC:                            -8173.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0024      0.000     23.339      0.0

                            OLS Regression Results                            
Dep. Variable:                 TRBV13   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     2.173
Date:                Sun, 21 Jun 2020   Prob (F-statistic):             0.0702
Time:                        21:17:28   Log-Likelihood:                 2979.4
No. Observations:                 895   AIC:                            -5949.
Df Residuals:                     890   BIC:                            -5925.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0081      0.000     22.453      0.0

                            OLS Regression Results                            
Dep. Variable:                 TRBV28   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     2.403
Date:                Sun, 21 Jun 2020   Prob (F-statistic):             0.0483
Time:                        21:17:28   Log-Likelihood:                 1948.9
No. Observations:                 895   AIC:                            -3888.
Df Residuals:                     890   BIC:                            -3864.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0431      0.001     37.903      0.0

                            OLS Regression Results                            
Dep. Variable:                TRBV5-4   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     1.353
Date:                Sun, 21 Jun 2020   Prob (F-statistic):              0.248
Time:                        21:17:28   Log-Likelihood:                 3818.6
No. Observations:                 895   AIC:                            -7627.
Df Residuals:                     890   BIC:                            -7603.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0146      0.000    103.939      0.0

                            OLS Regression Results                            
Dep. Variable:                TRBV6-8   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                 -0.003
Method:                 Least Squares   F-statistic:                    0.4126
Date:                Sun, 21 Jun 2020   Prob (F-statistic):              0.800
Time:                        21:17:28   Log-Likelihood:                 6516.9
No. Observations:                 895   AIC:                        -1.302e+04
Df Residuals:                     890   BIC:                        -1.300e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0001   6.91e-06     21.059      0.0

In [187]:
# Show the results table from the DQB1*03:01 run that used the DQA1 alleles as covariates.
res_df_301_all_cov_dqa

Unnamed: 0,TCR,coef,se,pvalue,tvalue,pvalue_fdr
55,TRBV14,-0.002686,0.000965,0.005488,-2.783708,0.468746
44,TRAV9-2,-0.005595,0.002341,0.017058,-2.389957,0.468746
17,TRAV23/DV6,0.007599,0.003233,0.018964,2.350541,0.468746
54,TRBV13,-0.003843,0.001673,0.021802,-2.297833,0.468746
41,TRAV8-4,0.003195,0.001430,0.025755,2.233629,0.468746
...,...,...,...,...,...,...
24,TRAV3,0.000057,0.001252,0.963484,0.045795,0.993431
80,TRBV6-5,0.000071,0.001772,0.967984,0.040148,0.993431
48,TRBV11-1,-0.000031,0.000878,0.971597,-0.035615,0.993431
3,TRAV12-1,-0.000022,0.002811,0.993805,-0.007767,0.996673


In [None]:
# Combine the three DQB1*03:01 comparisons into one table keyed by TCR.
# Columns that collide between the 03:02 and 06:01 result frames are tagged
# with a cohort suffix; the result is then joined onto the all-cohort frame.
df1 = step_res_df_301_302.merge(
    step_res_df_301_601,
    on="TCR",
    how="inner",
    suffixes=('_302', '_601'),
)
df2 = step_res_df_301_all.merge(df1, on="TCR", how="inner")

# Keep one coefficient and one p-value column per comparison.
res_df = df2[[
    'TCR',
    'coef', 'pvalue_fdr',
    'coef_302', 'pvalue_302',
    'coef_601', 'pvalue_601',
]]

# Two-level column header: the comparison on top, the statistic underneath.
header = [
    np.array([
        "",
        "03:01 vs non-03:01", "03:01 vs non-03:01",
        "03:01 vs 03:02", "03:01 vs 03:02",
        "03:01 vs 06:01", "03:01 vs 06:01",
    ]),
    np.array([
        "TCR",
        "coef", "p-value (FDR)",
        "coef", "p-value (non-FDR)",
        "coef", "p-value (non-FDR)",
    ]),
]
res_df.columns = header

In [None]:
# Save the combined coefficient / p-value table to a CSV file in the working
# directory. to_csv is called with its defaults, so the row index is written too.
res_df.to_csv("3_group_coef_pvalues.csv")

In [None]:
# Heatmap of the DQB1*03:01 coefficient for every TCR family in each cohort,
# with each cell annotated by the matching p-value.
#
# Values are read from `df2`, the merged frame with flat column names.
# `res_df` cannot be used here: it was built without a plain 'pvalue' column
# and its columns were then replaced by a two-level header, so res_df['TCR'],
# res_df['pvalue'] and res_df['coef'] all raise KeyError.
suffixes = ['', '_601', '_302']
cohort_labels = ['ALL', '03:01 XOR 06:01', '03:01 XOR 03:02']  # same order as suffixes

tcrs = df2['TCR'].values
pvalues_mat = df2[['pvalue' + suffix for suffix in suffixes]].to_numpy(dtype=float)
coefs_mat = df2[['coef' + suffix for suffix in suffixes]].to_numpy(dtype=float)
num_rows, num_cols = coefs_mat.shape

# signed_log_pvalues_mat = -np.log(pvalues_mat)*np.where(coefs_mat > 0, 1, -1)

# Width grows with the number of cohorts and height with the number of TCR
# families; the max() keeps the height positive for a table with < 2 rows.
fig, ax1 = plt.subplots(figsize=((num_cols // 2 + 3) * 2, max(num_rows // 2, 1) * 2))

# Passing the labels to heatmap() ties every label to its own row/column, so
# seaborn cannot thin out the ticks and leave the names misaligned.
ax1 = sns.heatmap(
    coefs_mat,
    cmap='seismic',
    center=0,
    cbar_kws={'label': "coef"},
    ax=ax1,
    annot=pvalues_mat,
    xticklabels=cohort_labels,
    yticklabels=tcrs,
)
plt.setp(ax1.get_yticklabels(), rotation=0)
ax1.set_title("DQB1*03:01 status predicting TCR family expression")
ax1.set_xlabel("Cohort")
ax1.set_ylabel("TCR family")

# Widen the y-limits by half a cell at each end.
# NOTE(review): presumably a workaround for the matplotlib release that clipped
# the first and last heatmap rows — confirm it is still needed.
bottom, top = ax1.get_ylim()
ax1.set_ylim(bottom + 0.5, top - 0.5)

# Run the layout pass last so it accounts for the title and tick labels.
fig.tight_layout(pad=3.0)

In [None]:
# Coefficient heatmap (TCR family x cohort), annotated with p-values.
# NOTE(review): this cell repeats the heatmap cell directly above it; consider
# keeping only one of the two.
#
# The data comes from `df2` rather than `res_df`: res_df was given a two-level
# column header and has no plain 'pvalue' column, so the lookups below would
# raise KeyError against it. df2 still has the flat 'TCR', 'coef*' and
# 'pvalue*' columns.
num_rows, num_cols = len(df2), 3
pvalues_mat = np.zeros((num_rows, num_cols))
coefs_mat = np.zeros((num_rows, num_cols))
tcrs = df2['TCR'].values

suffixes = ['', '_601', '_302']
cohort_labels = ['ALL', '03:01 XOR 06:01', '03:01 XOR 03:02']  # same order as suffixes

# One matrix column per cohort.
for i, suffix in enumerate(suffixes):
    pvalues_mat[:, i] = df2['pvalue' + suffix].values
    coefs_mat[:, i] = df2['coef' + suffix].values

# signed_log_pvalues_mat = -np.log(pvalues_mat)*np.where(coefs_mat > 0, 1, -1)

# max() keeps the figure height positive when there are fewer than 2 rows.
fig_width = (pvalues_mat.shape[1] // 2 + 3) * 2
fig_height = max(pvalues_mat.shape[0] // 2, 1) * 2
fig, ax1 = plt.subplots(figsize=(fig_width, fig_height))

# Labels are handed to heatmap() so each one stays attached to its row/column.
ax1 = sns.heatmap(coefs_mat, cmap='seismic', center=0, cbar_kws={'label': "coef"}, ax=ax1,
                  annot=pvalues_mat, xticklabels=cohort_labels, yticklabels=tcrs)
plt.setp(ax1.get_yticklabels(), rotation=0)
ax1.set_title("DQB1*03:01 status predicting TCR family expression")
ax1.set_xlabel("Cohort")
ax1.set_ylabel("TCR family")

# Extend the y-limits by half a cell on both ends.
# NOTE(review): looks like the workaround for clipped first/last heatmap rows
# in an older matplotlib — confirm it is still required.
bottom, top = ax1.get_ylim()
ax1.set_ylim(bottom + 0.5, top - 0.5)

# Layout is computed after drawing so the labels and title are accounted for.
fig.tight_layout(pad=3.0)