### Demo 

- The `reconstruct_vseq` function is also available in [./utils.py](./utils.py).

In [2]:
# Reconstruct TCR variable sequence
def reconstruct_vseq(data):
    """
    Reconstruct a TCR variable sequence from V-gene, CDR3 and J-gene parts.

    The CDR3 is spliced between the two gene sequences:
        - the V-gene sequence is cut at the LAST occurrence of the first two
          CDR3 residues, eg. CA: ..YL|CAVT
        - the J-gene sequence is cut right after the FIRST occurrence of the
          last two CDR3 residues, eg. LQ: GKLQ|FG..

    Input: "vgene_seq-cdr3-jgene_seq"
        (fields beyond the third are ignored)
    Return:
        - If succeed, return "reconstructed sequence"
        - If fail, return "(UNK)". Failure covers: input that is not a string
          (e.g. NaN produced by a missing column value), fewer than three
          '-'-separated fields, an empty CDR3, or a CDR3 anchor that cannot
          be found in the corresponding gene sequence.
    """
    unknown = '(UNK)'

    # Missing values typically arrive as float NaN / None rather than str.
    if not isinstance(data, str):
        return unknown

    parts = data.split('-')
    if len(parts) < 3:
        return unknown
    vgene_seq, cdr3, jgene_seq = parts[:3]

    # An empty CDR3 gives empty anchors, which str.find/rfind always "match";
    # there is nothing meaningful to splice in that case.
    if not cdr3:
        return unknown

    # cut vgene seq, eg. CA: ..YL|CAVT
    cdr3_start = cdr3[:2]
    vgene_seq_end = vgene_seq.rfind(cdr3_start)     # last position
    if vgene_seq_end == -1:
        return unknown

    # cut jgene seq, eg. LQ: GKLQ|FG..
    cdr3_end = cdr3[-2:]
    jgene_anchor = jgene_seq.find(cdr3_end)         # first position
    # Test the raw find() result BEFORE adding the anchor length, so the
    # "not found" sentinel (-1) is never shifted into a valid-looking index.
    if jgene_anchor == -1:
        return unknown
    jgene_seq_start = jgene_anchor + len(cdr3_end)

    return vgene_seq[:vgene_seq_end] + cdr3 + jgene_seq[jgene_seq_start:]

In [22]:
import json
import pandas as pd

# Lookup table keyed by the gene names found in the alpha/beta V and J columns;
# each entry provides a 'source gene' and an 'amino acid sequence'.
# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open('../data/Hi-TpH-tcr_gene2seq.json', "r", encoding="utf-8") as f:
    seq_dict = json.load(f)

level4_df = pd.read_csv("../data/Hi-TpH-level-IV.csv")

# Demo on the first few cases only. Take an explicit copy so the column
# assignments below write to an independent frame instead of a slice of the
# full table (avoids chained-assignment / SettingWithCopyWarning).
level4_df = level4_df.head(3).copy()   # cases

for col in ['alpha.v', 'alpha.j', 'beta.v', 'beta.j']:
    # The .imgt.gene column is the authoritative gene name
    level4_df[col+'.imgt.gene'] = level4_df[col].apply(lambda x: seq_dict[x]['source gene'])
    level4_df[col+'.imgt.seq'] = level4_df[col].apply(lambda x: seq_dict[x]['amino acid sequence'])

# Build the "vgene_seq-cdr3-jgene_seq" input strings expected by reconstruct_vseq.
# They are stored temporarily in the *.vseq.reconstructed columns and replaced
# by the reconstructed sequences at the end of this cell.
level4_df['alpha.vseq.reconstructed'] = level4_df['alpha.v.imgt.seq']+'-'+level4_df['alpha.cdr3']+'-'+level4_df['alpha.j.imgt.seq']
level4_df['beta.vseq.reconstructed'] = level4_df['beta.v.imgt.seq']+'-'+level4_df['beta.cdr3']+'-'+level4_df['beta.j.imgt.seq']

# Show, per case, the gene / CDR3 / gene triple followed by the reconstruction.
print('Alpha')
for ind, row in level4_df.iterrows():
    alpha_vseq = reconstruct_vseq(row['alpha.vseq.reconstructed'])
    print(row['alpha.v.imgt.gene'], row['alpha.cdr3'], row['alpha.j.imgt.gene'])
    print(alpha_vseq)

print('Beta')
for ind, row in level4_df.iterrows():
    beta_vseq = reconstruct_vseq(row['beta.vseq.reconstructed'])
    print(row['beta.v.imgt.gene'], row['beta.cdr3'], row['beta.j.imgt.gene'])
    print(beta_vseq)

# Replace the joined input strings with the reconstructed sequences.
level4_df['alpha.vseq.reconstructed'] = level4_df['alpha.vseq.reconstructed'].apply(reconstruct_vseq)
level4_df['beta.vseq.reconstructed'] = level4_df['beta.vseq.reconstructed'].apply(reconstruct_vseq)

Alpha
TRAV26-1*01 IVVRSSNTGKLI TRAJ37*01
DAKTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQYIIHGLKNNETNEMASLIITEDRKSSTLILPHATLRDTAVYYCIVVRSSNTGKLIFGQGTTLQVKP
TRAV26-1*01 IVVRSSNTGKLI TRAJ37*01
DAKTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQYIIHGLKNNETNEMASLIITEDRKSSTLILPHATLRDTAVYYCIVVRSSNTGKLIFGQGTTLQVKP
TRAV21*01 AVRPLLDGTYIPT TRAJ6*01
KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTSLLLIQSSQREQTSGRLNASLDKSSGRSTLYIAASQPGDSATYLCAVRPLLDGTYIPTFGRGTSLIVHP
Beta
TRBV14*01 ASSQDRDTQY TRBJ2-3*01
EAGVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLHFVKESKQDESGMPNNRFLAERTGGTYSTLKVQPAELEDSGVYFCASSQDRDTQYFGPGTRLTVL
TRBV14*01 ASSQDRDTQY TRBJ2-3*01
EAGVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLHFVKESKQDESGMPNNRFLAERTGGTYSTLKVQPAELEDSGVYFCASSQDRDTQYFGPGTRLTVL
TRBV6-5*01 ASSYLGNTGELF TRBJ2-2*01
NAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLIHYSVGAGITDQGEVPNGYNVSRSTTEDFPLRLLSAAPSQTSVYFCASSYLGNTGELFFGEGSRLTVL
