In [1]:
import pandas as pd
import numpy as np

## Transcript Counts to Family Counts

In [2]:
# Load the per-patient transcript counts; the first CSV column becomes the index.
counts_df = pd.read_csv("DGN_transcript_counts_df.csv", index_col=0, low_memory=False)

# Currently a no-op filter, but the prefix could be narrowed (e.g. "TRA") to extract a single locus.
df = counts_df[counts_df['transcript'].str.startswith("TR")]

# Split transcript names (TRAJ24*01) into family (TRAJ24) and subtype (01).
transcript_family_subtype_df = (
    df["transcript"]
    .str.split("*", n=1, expand=True)
    .rename(columns={0: "family", 1: "subtype"})
)

# Work on an explicit copy and use plain column assignment: `df.loc[:, col] = values`
# writes into the existing column in place on pandas >= 2.0, so the integer cast
# would silently keep the column's original dtype.
df = df.copy()
df["count"] = df["count"].astype(int)
df["family"] = transcript_family_subtype_df["family"]
df["subtype"] = transcript_family_subtype_df["subtype"]
df = df.reset_index()
df = df[['patid', 'family', 'subtype', 'transcript', 'count']]  # the genotype information is not needed for now

# Total count for every patient and family (subtype counts added together).
# Only "count" is aggregated: summing the string columns (subtype, transcript) is
# meaningless, and on pandas >= 2.0 it concatenates them or fails instead of dropping them.
family_count_df = df.groupby(["patid", "family"])[["count"]].sum().reset_index()

# Keep only functional genes: pseudogenes and non-functional genes are not listed
# in valid_TCR_genes.txt (one gene name per line).
with open("valid_TCR_genes.txt", "r") as gene_file:
    valid_TCR_genes = {gene_line.strip() for gene_line in gene_file}
is_valid_family = family_count_df['family'].isin(valid_TCR_genes)
family_count_df = family_count_df[is_valid_family]

# Keep only the patids used in Sharon's paper (N=895, selected by quality control).
# The ids are taken from the column labels of gene_counts.tsv, skipping the first
# label left after reset_index().
sharon_counts_df = pd.read_csv("gene_counts.tsv", delimiter="\t").reset_index()
sharon_ids = set(sharon_counts_df.columns[1:])
in_sharon_cohort = family_count_df['patid'].isin(sharon_ids)
family_count_df = family_count_df[in_sharon_cohort]

# Long format: one row per (patid, family).
family_count_df.to_csv("DGN_family_counts_df.csv", index=None)

# Wide format: one row per patid, one column per family; missing combinations become 0.
family_counts_table = family_count_df.pivot(index="patid", columns="family", values="count")
family_counts_table = family_counts_table.fillna(0.0)

family_counts_table.to_csv("DGN_family_counts_table.csv")

## Amino Acid Position Counts

In [39]:
# Gene-segment prefix for this run: it selects which "{TRV}_AA_sequences.csv" file is
# read and is embedded in the "DGN_{TRV}_CDR1_CDR2_position_*" output file names.
# The trailing comment records the alternative value.
TRV = "TRBV" # "TRAV"

In [40]:
# Per-patient family counts written by the previous section, indexed by patid.
family_counts_table = pd.read_csv("DGN_family_counts_table.csv")
family_counts_table = family_counts_table.set_index("patid")

CDR_NUM_AA = 22 # 22 positions, first 12 are CDR1, then 10 are CDR2
DGN_NUM_PATIENTS = 895

# CDR1/CDR2 amino-acid sequences for each family of the selected gene segment.
aa_columns = ['family', 'CDR1_sequence', 'CDR2_sequence']
AA_seq_df = pd.read_csv("{}_AA_sequences.csv".format(TRV))[aa_columns]

# Concatenate CDR1 and CDR2, then turn the string into a list with one residue per position.
cdr_concatenated = AA_seq_df['CDR1_sequence'] + AA_seq_df['CDR2_sequence']
AA_seq_df['CDR_seq'] = cdr_concatenated.apply(list)

# family name -> list of CDR residues (a later duplicate family overrides an earlier one).
family_sequence_dict = dict(zip(AA_seq_df['family'], AA_seq_df['CDR_seq']))

# When True, each family's count is divided by the patient's total count before accumulation.
normalize_by_total = False
patid_total_counts = family_counts_table.sum(axis=1).to_dict()
patid_position_AA_dict = dict()
# Iterate over the rows actually present in the table instead of a hard-coded patient
# count, so a table with fewer rows does not raise IndexError and extra rows are not skipped.
for patid, family_counts in family_counts_table.iterrows():
    patid_total_count = patid_total_counts[patid]
    position_AA_dict = dict()
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for family, count in family_counts.items(): # this patient's count for each family
        if family not in family_sequence_dict: # no AA sequence available for this family
            continue
        if np.isnan(count): # skip null counts
            continue
        # The contribution is the same for every position of the family, so compute it once.
        weight = count / patid_total_count if normalize_by_total else count
        for pos, AA in enumerate(family_sequence_dict[family]): # for CDR AA at each position
            AA_counts = position_AA_dict.setdefault(pos, dict())
            AA_counts[AA] = AA_counts.get(AA, 0) + weight
    patid_position_AA_dict[patid] = position_AA_dict

# position_AA_dict structure for each patid: 22 positions, different AA counts at each position
# {0: {'T': 1278.0,
#   'V': 629.0,
#   'N': 1146.0,
#   'D': 1102.0,
#   'Y': 366.0,
#   'S': 776.0,
#   'K': 44.0,
#   'A': 409.0},
#  1: {'S': 3398.0,
#   'R': 507.0,
#   'T': 752.0,
#   'I': 506.0,
#   'V': 106.0,
#   'A': 27.0,
#   'G': 376.0,
#   'Y': 78.0},
# ...}

# Collect, for every CDR position, one Series per patient (index: amino acid, name: patid).
position_lists = [[] for _ in range(CDR_NUM_AA)]
for patid, patid_position_counts in patid_position_AA_dict.items():
    # Columns are positions, rows are amino acids; an amino acid that does not occur
    # at a given position is NaN in that column.
    patid_df = pd.DataFrame.from_dict(patid_position_counts)
    for position in range(CDR_NUM_AA):
        position_lists[position].append(patid_df[position].rename(patid))

# One frame per position: rows are amino acids, columns are patids.
position_dfs = [pd.concat(position_list, axis=1, sort=True) for position_list in position_lists]

for i in range(CDR_NUM_AA):
    # Drop amino acids that no patient has at this position. dropna(how="all") replaces
    # the comparison against the hard-coded patient count, which only removed these
    # rows when the number of patient columns was exactly DGN_NUM_PATIENTS.
    position_dfs[i] = position_dfs[i].dropna(how="all").fillna(0.0).transpose()
    position_dfs[i].index.name = "patid"

# Write one table per position: rows are patids, columns are amino acids.
for i in range(CDR_NUM_AA):
    position_dfs[i].to_csv("DGN_{}_CDR1_CDR2_position_{}_AA_counts_table.csv".format(TRV, i))