In [1]:
import pandas as pd
import numpy as np

## Transcript Counts to Family Counts

In [2]:
# Load the long-format transcript counts; the first CSV column becomes the index.
counts_df = pd.read_csv("DGN_transcript_counts_df.csv", index_col=0, low_memory=False)

# Keep transcripts whose name starts with "TR". Per the original note this does not
# drop anything at the moment, but a narrower prefix (e.g. "TRA") could be used here.
# na=False treats a missing transcript name as "no match" instead of leaving NaN in
# the boolean mask, which boolean indexing rejects.
is_tr_transcript = counts_df['transcript'].str.startswith("TR", na=False)
df = counts_df[is_tr_transcript]

In [3]:
# Display the filtered transcript-level counts.
df

Unnamed: 0_level_0,genotype,dose,transcript,count
patid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LD0691,CC,2,TRAJ1*01,1
LD0691,CC,2,TRAJ10*01,33
LD0691,CC,2,TRAJ11*01,23
LD0691,CC,2,TRAJ12*01,19
LD0691,CC,2,TRAJ13*01,1
...,...,...,...,...
LD0910,GG,0,TRBV7-9*07,305
LD0910,GG,0,TRBV9*01,47
LD0910,GG,0,TRBV9*02,28
LD0910,GG,0,TRBV9*03,266


In [4]:
# split transcripts names (TRAJ24*01) into family (TRAJ24) and subtype (01)
# Split transcript names (e.g. "TRAJ24*01") into family ("TRAJ24") and subtype ("01").
# reindex guarantees that both columns exist even when no transcript carries a
# "*subtype" suffix (split would then return a single column).
transcript_family_subtype_df = (
    df["transcript"]
    .str.split("*", n=1, expand=True)
    .reindex(columns=[0, 1])
    .rename(columns={0: "family", 1: "subtype"})
)
# assign() replaces/creates whole columns on a new frame, so the int cast really
# changes the dtype. `df.loc[:, 'count'] = ...` writes into the existing column
# instead and can keep the old dtype on recent pandas versions.
df = df.assign(
    count=df['count'].astype(int),
    family=transcript_family_subtype_df["family"],
    subtype=transcript_family_subtype_df["subtype"],
).reset_index()  # turn the patid index back into a regular column
df = df[['patid', 'family', 'subtype', 'transcript', 'count']] # we don't need the genotype information for now

In [5]:
# Display the counts with transcript names split into family and subtype.
df

Unnamed: 0,patid,family,subtype,transcript,count
0,LD0691,TRAJ1,01,TRAJ1*01,1
1,LD0691,TRAJ10,01,TRAJ10*01,33
2,LD0691,TRAJ11,01,TRAJ11*01,23
3,LD0691,TRAJ12,01,TRAJ12*01,19
4,LD0691,TRAJ13,01,TRAJ13*01,1
...,...,...,...,...,...
275728,LD0910,TRBV7-9,07,TRBV7-9*07,305
275729,LD0910,TRBV9,01,TRBV9*01,47
275730,LD0910,TRBV9,02,TRBV9*02,28
275731,LD0910,TRBV9,03,TRBV9*03,266


In [6]:
# get total counts for every patient, every family (adding subtype counts together)
# Total count for every patient and every family (adding subtype counts together).
# "count" is selected explicitly so that the string columns ("subtype", "transcript")
# are not aggregated along with it.
family_count_df = df.groupby(["patid", "family"])["count"].sum().reset_index()

In [7]:
# Display the per-patient, per-family totals.
family_count_df

Unnamed: 0,patid,family,count
0,LD0001,TRAJ1,2
1,LD0001,TRAJ10,37
2,LD0001,TRAJ11,13
3,LD0001,TRAJ12,14
4,LD0001,TRAJ13,27
...,...,...,...
165720,LD1366,TRBV7-8,366
165721,LD1366,TRBV7-9,1079
165722,LD1366,TRBV8-2,2
165723,LD1366,TRBV9,391


In [8]:
# Gene names listed in valid_TCR_genes.txt (one per line); anything else is treated
# as a pseudogene / non-functional gene and filtered out.
with open("valid_TCR_genes.txt", "r") as genes_file:
    valid_TCR_genes = {line.strip() for line in genes_file}
is_valid_gene = family_count_df['family'].isin(valid_TCR_genes)

# Only keep patids that appear in Sharon's paper (N=895, after quality control).
# Every column of gene_counts.tsv after the first is taken as a patient id.
sharon_counts_df = pd.read_csv("gene_counts.tsv", sep="\t").reset_index()
sharon_ids = set(sharon_counts_df.columns[1:])
in_sharon_cohort = family_count_df['patid'].isin(sharon_ids)

family_count_df = family_count_df[is_valid_gene & in_sharon_cohort]

# Long format: one row per (patid, family).
family_count_df.to_csv("DGN_family_counts_df.csv", index=None)

# Wide format: one row per patid, one column per family, missing combinations as 0.
family_counts_table = (
    family_count_df
    .pivot(index="patid", columns="family", values="count")
    .fillna(0.0)
)

family_counts_table.to_csv("DGN_family_counts_table.csv")

In [9]:
# Display the patid x family count table that was written to DGN_family_counts_table.csv.
family_counts_table

family,TRAJ1,TRAJ10,TRAJ11,TRAJ12,TRAJ13,TRAJ14,TRAJ16,TRAJ17,TRAJ18,TRAJ2,...,TRBV6-6,TRBV6-8,TRBV6-9,TRBV7-2,TRBV7-3,TRBV7-6,TRBV7-7,TRBV7-8,TRBV7-9,TRBV9
patid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LD0001,2.0,37.0,13.0,14.0,27.0,0.0,18.0,22.0,32.0,1.0,...,200.0,3.0,32.0,557.0,216.0,68.0,39.0,167.0,502.0,216.0
LD0002,6.0,51.0,14.0,27.0,50.0,1.0,21.0,39.0,31.0,1.0,...,331.0,4.0,46.0,698.0,371.0,121.0,55.0,264.0,807.0,418.0
LD0003,6.0,59.0,14.0,15.0,31.0,0.0,16.0,26.0,25.0,2.0,...,490.0,0.0,33.0,520.0,102.0,83.0,54.0,162.0,644.0,312.0
LD0006,1.0,40.0,20.0,26.0,45.0,1.0,28.0,27.0,47.0,7.0,...,280.0,1.0,23.0,864.0,221.0,95.0,36.0,231.0,687.0,243.0
LD0007,4.0,61.0,18.0,26.0,44.0,2.0,20.0,31.0,39.0,2.0,...,415.0,4.0,32.0,1225.0,282.0,112.0,34.0,255.0,891.0,261.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LD1357,1.0,30.0,13.0,18.0,34.0,1.0,8.0,14.0,18.0,0.0,...,221.0,1.0,15.0,636.0,249.0,97.0,38.0,174.0,445.0,318.0
LD1361,5.0,48.0,27.0,15.0,53.0,1.0,17.0,30.0,35.0,10.0,...,449.0,1.0,55.0,817.0,299.0,128.0,57.0,333.0,1075.0,325.0
LD1362,5.0,27.0,10.0,23.0,33.0,0.0,15.0,29.0,19.0,4.0,...,176.0,0.0,19.0,372.0,151.0,77.0,26.0,182.0,731.0,201.0
LD1364,2.0,42.0,13.0,17.0,31.0,0.0,21.0,33.0,22.0,0.0,...,303.0,3.0,47.0,1059.0,200.0,127.0,154.0,350.0,839.0,400.0


## Amino Acid Position Counts

In [2]:
# Gene-segment prefix for this run: it selects the "<TRV>_AA_sequences.csv" input
# and is embedded in the file names of the per-position output tables.
TRV = "TRAV" # "TRAV"

In [3]:
# Reload the saved patid x family table, using the patid column as the row index.
family_counts_table = pd.read_csv("DGN_family_counts_table.csv", index_col="patid")

In [4]:
# Display the family count table reloaded from DGN_family_counts_table.csv.
family_counts_table

Unnamed: 0_level_0,TRAJ1,TRAJ10,TRAJ11,TRAJ12,TRAJ13,TRAJ14,TRAJ16,TRAJ17,TRAJ18,TRAJ2,...,TRBV6-6,TRBV6-8,TRBV6-9,TRBV7-2,TRBV7-3,TRBV7-6,TRBV7-7,TRBV7-8,TRBV7-9,TRBV9
patid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LD0001,2.0,37.0,13.0,14.0,27.0,0.0,18.0,22.0,32.0,1.0,...,200.0,3.0,32.0,557.0,216.0,68.0,39.0,167.0,502.0,216.0
LD0002,6.0,51.0,14.0,27.0,50.0,1.0,21.0,39.0,31.0,1.0,...,331.0,4.0,46.0,698.0,371.0,121.0,55.0,264.0,807.0,418.0
LD0003,6.0,59.0,14.0,15.0,31.0,0.0,16.0,26.0,25.0,2.0,...,490.0,0.0,33.0,520.0,102.0,83.0,54.0,162.0,644.0,312.0
LD0006,1.0,40.0,20.0,26.0,45.0,1.0,28.0,27.0,47.0,7.0,...,280.0,1.0,23.0,864.0,221.0,95.0,36.0,231.0,687.0,243.0
LD0007,4.0,61.0,18.0,26.0,44.0,2.0,20.0,31.0,39.0,2.0,...,415.0,4.0,32.0,1225.0,282.0,112.0,34.0,255.0,891.0,261.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LD1357,1.0,30.0,13.0,18.0,34.0,1.0,8.0,14.0,18.0,0.0,...,221.0,1.0,15.0,636.0,249.0,97.0,38.0,174.0,445.0,318.0
LD1361,5.0,48.0,27.0,15.0,53.0,1.0,17.0,30.0,35.0,10.0,...,449.0,1.0,55.0,817.0,299.0,128.0,57.0,333.0,1075.0,325.0
LD1362,5.0,27.0,10.0,23.0,33.0,0.0,15.0,29.0,19.0,4.0,...,176.0,0.0,19.0,372.0,151.0,77.0,26.0,182.0,731.0,201.0
LD1364,2.0,42.0,13.0,17.0,31.0,0.0,21.0,33.0,22.0,0.0,...,303.0,3.0,47.0,1059.0,200.0,127.0,154.0,350.0,839.0,400.0


In [10]:
# Run-wide settings for the amino-acid position analysis.
CDR_NUM_AA = 22 # 22 positions, first 12 are CDR1, then 10 are CDR2
DGN_NUM_PATIENTS = 895
normalize_by_total = False

# CDR1/CDR2 amino-acid sequences per family for the selected gene segment.
sequence_path = "{}_AA_sequences.csv".format(TRV)
sequence_columns = ['family', 'CDR1_sequence', 'CDR2_sequence']
AA_seq_df = pd.read_csv(sequence_path)[sequence_columns]

# Concatenate CDR1 and CDR2 and explode the string into a list of single characters.
combined_cdr = AA_seq_df['CDR1_sequence'] + AA_seq_df['CDR2_sequence']
AA_seq_df['CDR_seq'] = combined_cdr.apply(list)

# family name -> list of characters of its combined CDR sequence
family_sequence_dict = dict(zip(AA_seq_df['family'], AA_seq_df['CDR_seq']))

# patid -> sum of that patient's counts over every column of the family table
patid_total_counts = family_counts_table.sum(axis=1).to_dict()

In [9]:
# Preview the first rows, including the per-character CDR_seq lists.
AA_seq_df.head()

Unnamed: 0,family,CDR1_sequence,CDR2_sequence,CDR_seq
0,TRAV1-1,TSG......FYG,NAL....DGL,"[T, S, G, ., ., ., ., ., ., F, Y, G, N, A, L, ..."
1,TRAV1-2,TSG......FNG,NVL....DGL,"[T, S, G, ., ., ., ., ., ., F, N, G, N, V, L, ..."
2,TRAV2,VSN......AYN,GS......KP,"[V, S, N, ., ., ., ., ., ., A, Y, N, G, S, ., ..."
3,TRAV3,VSG......NPY,YITG..DNLV,"[V, S, G, ., ., ., ., ., ., N, P, Y, Y, I, T, ..."
4,TRAV4,NIAT.....NDY,GYK.....TK,"[N, I, A, T, ., ., ., ., ., N, D, Y, G, Y, K, ..."


In [24]:
# Family names on each side, for comparing coverage between counts and sequences.
set1 = set(family_counts_table.columns.tolist())  # families with a column in the count table
set2 = set(AA_seq_df["family"].tolist())  # families listed in the sequence file

In [25]:
# Families in the count table that have no entry in the sequence file.
set1 - set2

{'TRAJ1',
 'TRAJ10',
 'TRAJ11',
 'TRAJ12',
 'TRAJ13',
 'TRAJ14',
 'TRAJ16',
 'TRAJ17',
 'TRAJ18',
 'TRAJ2',
 'TRAJ20',
 'TRAJ21',
 'TRAJ22',
 'TRAJ23',
 'TRAJ24',
 'TRAJ25',
 'TRAJ26',
 'TRAJ27',
 'TRAJ28',
 'TRAJ29',
 'TRAJ3',
 'TRAJ30',
 'TRAJ31',
 'TRAJ32',
 'TRAJ33',
 'TRAJ34',
 'TRAJ35',
 'TRAJ36',
 'TRAJ37',
 'TRAJ38',
 'TRAJ39',
 'TRAJ4',
 'TRAJ40',
 'TRAJ41',
 'TRAJ42',
 'TRAJ43',
 'TRAJ44',
 'TRAJ45',
 'TRAJ46',
 'TRAJ47',
 'TRAJ48',
 'TRAJ49',
 'TRAJ5',
 'TRAJ50',
 'TRAJ52',
 'TRAJ53',
 'TRAJ54',
 'TRAJ56',
 'TRAJ57',
 'TRAJ58',
 'TRAJ6',
 'TRAJ61',
 'TRAJ7',
 'TRAJ9',
 'TRBJ1-1',
 'TRBJ1-2',
 'TRBJ1-3',
 'TRBJ1-4',
 'TRBJ1-5',
 'TRBJ1-6',
 'TRBJ2-1',
 'TRBJ2-2',
 'TRBJ2-3',
 'TRBJ2-4',
 'TRBJ2-5',
 'TRBJ2-6',
 'TRBJ2-7',
 'TRBV10-1',
 'TRBV10-2',
 'TRBV10-3',
 'TRBV11-1',
 'TRBV11-2',
 'TRBV11-3',
 'TRBV12-3',
 'TRBV12-4',
 'TRBV12-5',
 'TRBV13',
 'TRBV14',
 'TRBV15',
 'TRBV18',
 'TRBV19',
 'TRBV2',
 'TRBV20-1',
 'TRBV24-1',
 'TRBV25-1',
 'TRBV27',
 'TRBV28',
 'TRBV29-1',
 '

In [26]:
# Families in the sequence file that do not appear as a column of the count table.
set2 - set1

{'TRAV11', 'TRAV8-7'}

In [13]:
# Step 1: for every patient, accumulate {position: {amino acid: count}} over the CDR
# positions, weighting each family's sequence by that patient's count for the family.
patid_position_AA_dict = dict()
# Iterate over the rows actually present in the table instead of a hard-coded
# number of patients, so a table of a different size neither crashes nor gets truncated.
for patid, family_counts in family_counts_table.iterrows():
    patid_total_count = patid_total_counts[patid]
    position_AA_dict = dict()
    for family, count in family_counts.items(): # this patient's count for each family
        # skip families without a sequence in family_sequence_dict, and null counts
        if family not in family_sequence_dict or np.isnan(count):
            continue
        # this family's contribution, optionally as a fraction of the patient's total
        weight = count / patid_total_count if normalize_by_total else count
        for pos, AA in enumerate(family_sequence_dict[family]): # for CDR AA at each position
            AA_counts = position_AA_dict.setdefault(pos, dict())
            AA_counts[AA] = AA_counts.get(AA, 0) + weight
    patid_position_AA_dict[patid] = position_AA_dict

# position_AA_dict structure for each patid: 22 positions, different AA counts at each position
# {0: {'T': 1278.0,
#   'V': 629.0,
#   'N': 1146.0,
#   'D': 1102.0,
#   'Y': 366.0,
#   'S': 776.0,
#   'K': 44.0,
#   'A': 409.0},
#  1: {'S': 3398.0,
#   'R': 507.0,
#   'T': 752.0,
#   'I': 506.0,
#   'V': 106.0,
#   'A': 27.0,
#   'G': 376.0,
#   'Y': 78.0},
# ...}

# Step 2: regroup by position. Each position collects one Series per patient
# (index = amino acid, name = patid).
position_lists = [[] for _ in range(CDR_NUM_AA)]
for patid in patid_position_AA_dict:
    patid_df = pd.DataFrame.from_dict(patid_position_AA_dict[patid]) # columns = positions, index = amino acids
    for position in range(CDR_NUM_AA):
        series = patid_df[position]
        series.name = patid
        position_lists[position].append(series)
position_dfs = [pd.concat(position_list, axis=1, sort=True) for position_list in position_lists]

# Step 3: per position, drop amino acids that are null for every patient (they only
# occur at other positions), treat the remaining gaps as zero counts, and put
# patients on the rows.
for i in range(CDR_NUM_AA):
    position_dfs[i] = position_dfs[i].dropna(how="all").fillna(0.0).transpose()
    position_dfs[i].index.name = "patid"

# Step 4: write one patid x amino-acid table per CDR position.
for i in range(CDR_NUM_AA):
    position_dfs[i].to_csv("DGN_{}_CDR1_CDR2_position_{}_AA_counts_table.csv".format(TRV, i))

KeyboardInterrupt: 