## This notebook converts VDJCounts.txt into allVDJRatios.csv. It calculates V-usage and J-usage from the counts, filters out pseudogenes, and keeps only the patids that Sharon used (N=895).

In [3]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import pickle
%matplotlib inline

In [4]:
def counts_to_usage_df(counts_df, valid_genes_path="valid_TCR_genes.txt", qc_counts_path="gene_counts.tsv"):
    """Convert per-transcript TCR counts into per-patient gene-family usage ratios.

    For every patient, allele-level counts (e.g. ``TRAJ24*01``, ``TRAJ24*02``)
    are summed per gene family (``TRAJ24``). Each family count is then divided
    by the patient's total for the segment the family belongs to, where the
    segment is characters 2-3 of the family name (``AJ``, ``AV``, ``BJ``, ...).

    Parameters
    ----------
    counts_df : pandas.DataFrame
        Must provide ``transcript`` and ``count`` columns, and ``patid`` either
        as a column or as the (named) index. Other columns are ignored.
    valid_genes_path : str or None
        Text file with one allowed gene-family name per line. Families not
        listed are dropped from both outputs. Pass ``None`` to skip this filter.
    qc_counts_path : str or None
        Tab-separated file whose header names the patient ids to keep in the
        wide table. Only the header is read. Pass ``None`` to keep all patients.

    Returns
    -------
    vdj_df : pandas.DataFrame
        Long format with columns ``patid``, ``family``, ``usage_ratio``.
    vdj_table : pandas.DataFrame
        Wide format: a ``patid`` column followed by one column per family
        (sorted by name); missing patient/family combinations are 0.

    Notes
    -----
    * Usage ratios are computed BEFORE the valid-gene filter, so the segment
      totals used as denominators still include counts of filtered-out families.
    * The patient-id filter is applied to the wide table only; the long table
      keeps every patient.
      NOTE(review): confirm both behaviours are intended.
    """
    # Keep only transcripts whose name starts with "TR". na=False treats a
    # missing name as "no match" so the boolean mask never contains NaN.
    is_tcr = counts_df["transcript"].str.startswith("TR", na=False)
    tr_df = counts_df[is_tcr].reset_index()

    # "TRAJ24*01" -> family "TRAJ24". Taking element 0 of the split also works
    # for names without "*", where expand=True would yield no subtype column.
    tr_df = tr_df.assign(
        count=tr_df["count"].astype(int),
        family=tr_df["transcript"].str.split("*", n=1).str[0],
    )

    # Total count per patient and family (allele counts added together).
    # Only the count column is aggregated: summing the whole frame concatenates
    # the string columns on pandas >= 2.0 and breaks the merge further down.
    family_count_df = (
        tr_df.groupby(["patid", "family"])["count"]
        .sum()
        .rename("family_count")
        .reset_index()
    )

    # Segment = characters 2-3 of the family name ("TRAJ1" -> "AJ"); summing per
    # patient and segment gives the denominator for the usage ratio.
    family_count_df["segment"] = family_count_df["family"].str.slice(start=2, stop=4)
    segment_count_df = (
        family_count_df.groupby(["patid", "segment"])["family_count"]
        .sum()
        .rename("segment_count")
        .reset_index()
    )

    # Attach the segment total to every family row and compute the ratio.
    usage_df = family_count_df.merge(segment_count_df, how="inner", on=["patid", "segment"])
    usage_df["usage_ratio"] = usage_df["family_count"] / usage_df["segment_count"]
    vdj_df = usage_df[["patid", "family", "usage_ratio"]]

    # Filter out pseudogenes and non-functional genes
    if valid_genes_path is not None:
        with open(valid_genes_path, "r") as f:
            valid_TCR_genes = {line.strip() for line in f}
        vdj_df = vdj_df[vdj_df["family"].isin(valid_TCR_genes)]

    # Wide format: one row per patient, one column per family, 0 where absent.
    vdj_table = vdj_df.pivot(index="patid", columns="family", values="usage_ratio").fillna(0)
    vdj_table = vdj_table.reindex(sorted(vdj_table.columns), axis=1).fillna(0)
    vdj_table = vdj_table.reset_index()

    # Keep only the patient ids named in the header of the QC'd counts file.
    # nrows=0 parses just the header; a non-patient header column (e.g. a gene
    # id column) is harmless because it never equals a patid.
    if qc_counts_path is not None:
        qc_ids = set(pd.read_csv(qc_counts_path, delimiter="\t", nrows=0).columns)
        vdj_table = vdj_table[vdj_table["patid"].isin(qc_ids)]

    return vdj_df, vdj_table

In [5]:
# Load the raw per-transcript counts. The first CSV column becomes the index;
# low_memory=False parses the file in one pass so column dtypes are inferred
# consistently instead of chunk by chunk.
df = pd.read_csv("VDJCounts.txt", index_col=0, low_memory=False)

In [6]:
# a: long-format usage ratios (patid, family, usage_ratio)
# b: wide-format table (one row per patid, one column per gene family)
a, b = counts_to_usage_df(df)

In [8]:
# Display the wide-format usage table (one row per patid, one column per gene family).
b

family,patid,TRAJ1,TRAJ10,TRAJ11,TRAJ12,TRAJ13,TRAJ14,TRAJ16,TRAJ17,TRAJ18,...,TRBV6-6,TRBV6-8,TRBV6-9,TRBV7-2,TRBV7-3,TRBV7-6,TRBV7-7,TRBV7-8,TRBV7-9,TRBV9
0,LD0001,0.001676,0.031014,0.010897,0.011735,0.022632,0.000000,0.015088,0.018441,0.026823,...,0.021311,0.000320,0.003410,0.059350,0.023015,0.007246,0.004156,0.017794,0.053490,0.023015
1,LD0002,0.003399,0.028895,0.007932,0.015297,0.028329,0.000567,0.011898,0.022096,0.017564,...,0.023866,0.000288,0.003317,0.050328,0.026750,0.008724,0.003966,0.019035,0.058187,0.030139
2,LD0003,0.004228,0.041579,0.009866,0.010571,0.021846,0.000000,0.011276,0.018323,0.017618,...,0.048448,0.000000,0.003263,0.051414,0.010085,0.008206,0.005339,0.016017,0.063674,0.030848
3,LD0006,0.000546,0.021846,0.010923,0.014200,0.024577,0.000546,0.015292,0.014746,0.025669,...,0.021145,0.000076,0.001737,0.065247,0.016689,0.007174,0.002719,0.017444,0.051880,0.018351
4,LD0007,0.002127,0.032430,0.009569,0.013822,0.023392,0.001063,0.010633,0.016481,0.020734,...,0.025584,0.000247,0.001973,0.075519,0.017385,0.006905,0.002096,0.015720,0.054929,0.016090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
917,LD1357,0.001006,0.030181,0.013078,0.018109,0.034205,0.001006,0.008048,0.014085,0.018109,...,0.022377,0.000101,0.001519,0.064399,0.025213,0.009822,0.003848,0.017618,0.045059,0.032199
918,LD1361,0.002919,0.028021,0.015762,0.008757,0.030940,0.000584,0.009924,0.017513,0.020432,...,0.025029,0.000056,0.003066,0.045543,0.016668,0.007135,0.003177,0.018563,0.059925,0.018117
919,LD1362,0.004488,0.024237,0.008977,0.020646,0.029623,0.000000,0.013465,0.026032,0.017056,...,0.016566,0.000000,0.001788,0.035015,0.014213,0.007248,0.002447,0.017131,0.068806,0.018919
920,LD1364,0.001487,0.031227,0.009665,0.012639,0.023048,0.000000,0.015613,0.024535,0.016357,...,0.021641,0.000214,0.003357,0.075637,0.014285,0.009071,0.010999,0.024998,0.059924,0.028569


In [None]:
# save both long and wide formats to disk
# `a` (long) and `b` (wide) are the names bound at notebook scope by
# `a, b = counts_to_usage_df(df)`; `vdj_df` / `vdj_table` exist only inside the
# function, so referencing them here raised NameError on a fresh kernel.
a.to_csv("DGN_vdj_usages.csv", index=False)
b.to_csv("DGN_vdj_usages_table.csv", index=False)