## This notebook converts VDJCounts.txt into allVDJRatios.csv. Calculates V-usage and J-usage from counts. Filters out Pseudogenes. Only extract patids that Sharon used (N=895)

In [2]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import pickle
import scipy
%matplotlib inline

In [3]:
def counts_to_usage_df(counts_df, drop_families=[]):

    # doesn't do anything now but could be used to extract only TRA, for example
    df = counts_df[counts_df['transcript'].str.startswith("TR")]

    # split transcripts names (TRAJ24*01) into family (TRAJ24) and subtype (01)
    transcript_family_subtype_df = df["transcript"].str.split("*", n = 1, expand = True).rename(columns={0:"family", 1:"subtype"})
    df = df.copy()
    df.loc[:,'count'] = df['count'].astype(int)
    df.loc[:,"family"] = transcript_family_subtype_df["family"]
    df.loc[:,"subtype"] = transcript_family_subtype_df["subtype"]
    df = df.reset_index()
    df = df[['patid', 'family', 'subtype', 'transcript', 'count']] # we don't need to the genotype information for now
    
    # drop any families (so that they are not included in total count)
    if len(drop_families) > 0:
        df = df[~df['family'].isin(drop_families)]
    
    # get total counts for every patient, every family (adding subtype counts together)
    family_count_df = df.groupby(["patid", "family"]).sum().rename(columns={"count":"family_count"}).reset_index()

    # take out the segment (AJ) out of the family (TRAJ1), and groupby patid+segment to get total AJ, total BJ, AV, BV for each patient
    family_count_df['segment'] = family_count_df['family'].str.slice(start=2, stop=4)
    segment_count_df = family_count_df.groupby(["patid", "segment"]).sum().rename(columns={"family_count":"segment_count"}).reset_index()

    # merge the family count and segment count dfs, allowing us to calculate usage ratio
    df_with_total = pd.merge(family_count_df, segment_count_df, how="inner", on=["patid", "segment"])
    df_with_total["usage_ratio"] = df_with_total["family_count"]/df_with_total["segment_count"]
    # df_with_total["patid"] = df_with_total["patid"].apply(lambda x: int(x[2:])) # get rid of "LD" in front of patid
    vdj_df = df_with_total[["patid", "family", "usage_ratio"]]

    # Filter out pseudogenes and non-functional genes
    valid_TCR_genes = set()
    with open("valid_TCR_genes.txt", "r") as f:
        for line in f:
            valid_TCR_genes.add(line.strip())
    vdj_df = vdj_df[vdj_df['family'].isin(valid_TCR_genes)]

    # make both long and wide formats for the usage_ratio table
    vdj_table = vdj_df.pivot(index="patid", columns="family", values="usage_ratio").fillna(0)
    vdj_table = vdj_table.reindex(sorted(vdj_table.columns), axis=1).fillna(0)
    vdj_table = vdj_table.reset_index()

    # only want patids in Sharon's paper, N=895, due to quality control
    sharon_counts_df = pd.read_csv("gene_counts.tsv", delimiter="\t")
    sharon_counts_df = sharon_counts_df.reset_index()
    sharon_ids = set(sharon_counts_df.columns[1:])
    vdj_table = vdj_table[vdj_table['patid'].isin(sharon_ids)] 

    return vdj_df, vdj_table

In [None]:
# transcripts counts to usage
def counts_to_usage_df(counts_df, drop_families=[]):
    df = counts_df
    # split transcripts names (TRAJ24*01) into family (TRAJ24) and subtype (01)
    transcript_family_subtype_df = df["transcript"].str.split("*", n = 1, expand = True).rename(columns={0:"family", 1:"subtype"})
    df = df.copy()
    df.loc[:,'count'] = df['count'].astype(int)
    df.loc[:,"family"] = transcript_family_subtype_df["family"]
    df.loc[:,"subtype"] = transcript_family_subtype_df["subtype"]
    df = df.reset_index()
    df = df[['patid', 'family', 'subtype', 'transcript', 'count']] # we don't need to the genotype information for now
    
    # drop any families (so that they are not included in total count)
    if len(drop_families) > 0:
        df = df[~df['family'].isin(drop_families)]

#     # Filter out pseudogenes and non-functional genes
#     valid_TCR_genes = set()
#     with open("valid_TCR_genes.txt", "r") as f:
#         for line in f:
#             valid_TCR_genes.add(line.strip())
#     valid_TCR_genes.add("TRAV8-5")
#     df = df[df['family'].isin(valid_TCR_genes)]
    
    # get total counts for every patient, every family (adding subtype counts together)
    family_count_df = df.groupby(["patid", "family"]).sum().rename(columns={"count":"family_count"}).reset_index()

    # take out the segment (AJ) out of the family (TRAJ1), and groupby patid+segment to get total AJ, total BJ, AV, BV for each patient
    family_count_df['segment'] = family_count_df['family'].str.slice(start=2, stop=4)
    segment_count_df = family_count_df.groupby(["patid", "segment"]).sum().rename(columns={"family_count":"segment_count"}).reset_index()

    # merge the family count and segment count dfs, allowing us to calculate usage ratio
    df_with_total = pd.merge(family_count_df, segment_count_df, how="inner", on=["patid", "segment"])
    df_with_total["usage_ratio"] = df_with_total["family_count"]/df_with_total["segment_count"]
    # df_with_total["patid"] = df_with_total["patid"].apply(lambda x: int(x[2:])) # get rid of "LD" in front of patid
    vdj_df = df_with_total[["patid", "family", "usage_ratio"]]

    # make both long and wide formats for the usage_ratio table
    vdj_table = vdj_df.pivot(index="patid", columns="family", values="usage_ratio").fillna(0)
    vdj_table = vdj_table.reindex(sorted(vdj_table.columns), axis=1).fillna(0)
    vdj_table = vdj_table.reset_index()

    # only want patids in Sharon's paper, N=895, due to quality control
    sharon_counts_df = pd.read_csv("gene_counts.tsv", delimiter="\t")
    sharon_counts_df = sharon_counts_df.reset_index()
    sharon_ids = set(sharon_counts_df.columns[1:])
    vdj_table = vdj_table[vdj_table['patid'].isin(sharon_ids)].set_index("patid")

    return vdj_df, vdj_table

In [4]:
df = pd.read_csv("VDJCounts.txt", index_col=0, low_memory=False)

In [5]:
to_drop = []

"""
['TRAV13-1']
['TRAV13-1', 'TRAV17']
['TRAV13-1', 'TRAV17', 'TRAV13-2']
['TRAV13-1', 'TRAV17', 'TRAV13-2', 'TRAJ13']
"""

"\n['TRAV13-1']\n['TRAV13-1', 'TRAV17']\n['TRAV13-1', 'TRAV17', 'TRAV13-2']\n['TRAV13-1', 'TRAV17', 'TRAV13-2', 'TRAJ13']\n"

In [6]:
vdj_df, vdj_table = counts_to_usage_df(df, drop_families=to_drop)

In [7]:
# # plotting gene expression
# f, ax = plt.subplots(figsize=(5,50))
# sns.boxplot(y="family", x="usage_ratio", data=vdj_df, fliersize=1.0)
# plt.xlim((0,0.5))
# plt.title("Protein Coding TCR Genes, without {}".format(to_drop))

In [9]:
# save both long and wide formats to disk
vdj_df.to_csv("DGN_vdj_usages.csv", index=None)
vdj_table.to_csv("DGN_vdj_usages_table.csv", index=None)

## repurposed code used to plot before & after drop usages

In [9]:
cd4_mean = vdj_table.mean().to_frame("cd4_mean")
cd8_mean = vdj_table_orig.mean().to_frame("cd8_mean")
merged_df = pd.merge(cd4_mean, cd8_mean, left_index=True, right_index=True, how="inner").fillna(0.0) # merge on TCR
merged_df = merged_df.reset_index()
merged_df['segment'] = merged_df['family'].str.slice(start=0, stop=4)

In [10]:
#segment = "TRAV" # change this flag for different plots
plot_df = merged_df
#plot_df = merged_df[merged_df['segment']==segment]
r, p = scipy.stats.pearsonr(plot_df['cd4_mean'], plot_df['cd8_mean'])

sns.set()
fig, ax = plt.subplots(figsize=(20,60))
sns.scatterplot(x="cd4_mean", y="cd8_mean", hue="segment", data=plot_df)
plt.xlim(0.00, 0.06)
plt.ylim(0.00, 0.06)
ax.set_aspect('equal', 'box')
plt.ylabel("No Dropped")
plt.xlabel("Dropped {}".format(to_drop))
plt.title("Pearson's R: {:4f}, (two-tailed p-value {:4f})".format(r, p))

lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
    np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
]
# now plot both limits against eachother
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)

def label_point(x, y, val, ax):
    a = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for i, point in a.iterrows():
        if abs(point['x'] - point['y']) > 0.001:
        #if str(point['val']) in ['TRAV23/DV6', 'TRAV13-1', 'TRAV10-1', 'TRAV17']:
            ax.text(point['x']+.001, point['y'], str(point['val']))

label_point(plot_df['cd4_mean'], plot_df['cd8_mean'], plot_df['family'], plt.gca())

NameError: name 'merged_df' is not defined