In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import glob 

Load TCR-Seq output

In [2]:
TCR_files = glob.glob("../raw_data/TCR/*.tsv")

data = []

for file in TCR_files:
    dataframe = pd.read_csv(file,sep='\t')
    dataframe['sample'] = os.path.basename(file)
    data.append(dataframe)

TCR = pd.concat(data, sort=False)

In [3]:
# rename TCR sample names based on 1:1 matchup in python dictionary
Sample_name_matchup_RNA_TCR_dict = {
    "ESO1-sorted-T-cells_S13_L007" : "RearrangementDetails_ESO1_sorted_infusion.tsv",
    "INY1-sorted-T-cells_S14_L007" : "RearrangementDetails_INY1_sorted_infusion.tsv",
    "INY2-sorted-T-cells_S15_L007" : "RearrangementDetails_INY2_sorted_infusion.tsv",
    "TR2-PBMC_S12" : "RearrangementDetails_TR-PBMC.tsv",
    "SAR-11-14-12RNA_S1" : "SAR_11-14_PBMC.tsv",
    "MP-11-28-12RNA_S2" : "MP_11-28_PBMC.tsv",
    "CMT-baseline1C_CAGATC" : "Pt204_Baseline_TCR_seq.tsv",
    "HM-baseline1C_CGATGT" : "Pt310_baseline_TCRseq.tsv",
    "PT0310_S9" : "Pt310_on-tx_TCR_seq.tsv",
    "LEK-baseline_CGATGT" : "Pt294_baseline_TCR_seq.tsv",
    "LEK-OT110712A_CCGTCC" : "Pt294_on-tx_TCR_seq.tsv",
    "JSSBaseline-RNA_GTGAAA" : "Pt_308_baseline_TCR_seq.tsv",
    "RAS-baseline_TGACCA" : "Pt_325_baseline_TCR_seq.tsv",
    "PT0112-B_S3" : "Pt_112_baseline_TCR_seq.tsv",
    "PT0285-B_S5" : "Pt_285_baseline_TCR_seq.tsv"
}

for matchup in Sample_name_matchup_RNA_TCR_dict:
    TCR.loc[(TCR['sample'] == Sample_name_matchup_RNA_TCR_dict[matchup]), 'sample' ] = matchup

In [4]:
# rename and select the columns that needed for analysis
TCR.rename(columns = {'reads':'reads_tool','rearrangement':'nucleotide','v_resolved':'V','d_resolved':'D','j_resolved':'J'},inplace=True)
TCR = TCR[['reads_tool','V','J','sample']]
TCR['tool'] = "TCR-Seq"

# calculate frequency
samples = ['CMT-baseline1C_CAGATC' , 'ESO1-sorted-T-cells_S13_L007',
           'HM-baseline1C_CGATGT' , 'INY1-sorted-T-cells_S14_L007',
           'INY2-sorted-T-cells_S15_L007' , 'JSSBaseline-RNA_GTGAAA',
           'LEK-OT110712A_CCGTCC' , 'LEK-baseline_CGATGT' , 'MP-11-28-12RNA_S2',
           'PT0112-B_S3' , 'PT0285-B_S5' , 'PT0310_S9' , 'RAS-baseline_TGACCA',
           'SAR-11-14-12RNA_S1' , 'TR2-PBMC_S12']

TCR_frequency = pd.DataFrame()

for sample in samples:
    sample_frequency = TCR.loc[(TCR['sample'] == sample)]
    sample_frequency.loc[:,'frequency_tool'] = sample_frequency['reads_tool']/sum(sample_frequency['reads_tool'])
    TCR_frequency = pd.concat([sample_frequency,TCR_frequency],sort=False)

# select rows with reads greater than 1
TCR_frequency = TCR_frequency[TCR_frequency['reads_tool'] > 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [5]:
# filter out the unknown V genes and more than one V genes
TCR_V = TCR_frequency[(TCR_frequency['V'] != 'unresolved')&(TCR_frequency['V'] != 'unknown')]

# consider V gene only, not allele  
TCR_V.loc[:,'V'] = TCR_V['V'].str.replace("(-).*","")

# match the naming of gene 
TCR_V.loc[:,'V'] = TCR_V['V'].str.replace("TCRBV0","TRBV")
TCR_V.loc[:,'V'] = TCR_V['V'].str.replace("TCRBV","TRBV")

# calculate V gene usage 
TCR_V_frequency = TCR_V.groupby(['sample','V'], as_index=False)['frequency_tool'].agg({'frequency_tool':'sum'})
TCR_V_frequency['tool'] = "TCR-Seq"

In [6]:
# filter out the unknown J genes
TCR_J = TCR_frequency[(TCR_frequency['J'] != 'unresolved')&(TCR_frequency['J'] != 'unknown')]

# consider J gene only, not allele  
TCR_J.loc[:,'J'] = TCR_J['J'].str.replace("(-).*","")

# match the naming of gene 
TCR_J.loc[:,'J'] = TCR_J['J'].str.replace("TCRBJ0","TRBJ")
TCR_J.loc[:,'J'] = TCR_J['J'].str.replace("TCRBJ","TRBJ")

# calculate J gene usage 
TCR_J_frequency = TCR_J.groupby(['sample','J'], as_index=False)['frequency_tool'].agg({'frequency_tool':'sum'})
TCR_J_frequency['tool'] = "TCR-Seq"

Load MIXCR output

In [7]:
MIXCR_files = glob.glob("../raw_data/MIXCR/*.txt")

MIXCR = pd.DataFrame()
for sample in MIXCR_files:
    df_sample = pd.read_csv(sample, delimiter = "\t")
    for sample_name in Sample_name_matchup_RNA_TCR_dict.keys():
        if sample_name in sample:
            df_sample['sample'] = sample_name
    MIXCR = MIXCR.append(df_sample)

In [8]:
# rename and select the columns that needed for analysis
MIXCR['allVHitsWithScore']=MIXCR['allVHitsWithScore'].str.replace(r"\(.*\)","")
MIXCR['allDHitsWithScore']=MIXCR['allDHitsWithScore'].str.replace(r"\(.*\)","")
MIXCR['allJHitsWithScore']=MIXCR['allJHitsWithScore'].str.replace(r"\(.*\)","")
MIXCR.rename(columns = {'cloneFraction':'frequency_tool','cloneCount':'reads_tool','aaSeqCDR3':'amino_acid','targetSequences':'nucleotide','allVHitsWithScore':'V','allDHitsWithScore':'D','allJHitsWithScore':'J'},inplace=True)
MIXCR = MIXCR[['frequency_tool','reads_tool','V','J','sample']]
MIXCR['tool'] = "MIXCR"

# select TRB 
MIXCR = MIXCR[MIXCR['V'].str.contains("TRB") & MIXCR['J'].str.contains("TRB")]

# select rows with reads greater than 1
MIXCR = MIXCR[MIXCR['reads_tool'] > 1]

In [9]:
# consider V gene only, not allele  
MIXCR['V'] = MIXCR['V'].str.replace("(-).*","") 
MIXCR['V'] = MIXCR['V'].str.replace("\*.*","")

# calculate V gene usage 
MIXCR_V_frequency = MIXCR.groupby(['sample','V'], as_index=False)['frequency_tool'].agg({'frequency_tool':'sum'})
MIXCR_V_frequency['tool'] = "MIXCR"

In [10]:
# consider J gene only, not allele  
MIXCR['J'] = MIXCR['J'].str.replace("(-).*","")
MIXCR['J'] = MIXCR['J'].str.replace("\*.*","")

# calculate J gene usage 
MIXCR_J_frequency = MIXCR.groupby(['sample','J'], as_index=False)['frequency_tool'].agg({'frequency_tool':'sum'})
MIXCR_J_frequency['tool'] = "MIXCR"

Load TRUST4 output

In [11]:
TRUST4_files = glob.glob("../raw_data/TRUST4/*.tsv")

TRUST4 = pd.DataFrame()
for sample in TRUST4_files:
    df_sample = pd.read_csv(sample,sep='\t')
    for sample_name in Sample_name_matchup_RNA_TCR_dict.keys():
        if sample_name in sample:
            df_sample['sample'] = sample_name
    TRUST4 = TRUST4.append(df_sample)

In [12]:
# rename and select the columns that needed for analysis
TRUST4.rename(columns = {'frequency':'frequency_tool','#count':'reads_tool','CDR3aa':'amino_acid','CDR3nt':'nucleotide'},inplace=True)
TRUST4 = TRUST4[['frequency_tool','reads_tool','V','J','sample']]
TRUST4['tool'] = "TRUST4"

# select TRB 
TRUST4 = TRUST4[TRUST4['V'].str.contains("TRB") & TRUST4['J'].str.contains("TRB")]

# select rows with reads greater than 1
TRUST4 = TRUST4[TRUST4['reads_tool'] > 1]

In [13]:
# consider V gene only, not allele  
TRUST4['V'] = TRUST4['V'].str.replace("(-).*","") 
TRUST4['V'] = TRUST4['V'].str.replace("\*.*","")

# calculate V gene usage 
TRUST4_V_frequency = TRUST4.groupby(['sample','V'], as_index=False)['frequency_tool'].agg({'frequency_tool':'sum'})
TRUST4_V_frequency['tool'] = "TRUST4"

In [14]:
# consider J gene only, not allele  
TRUST4['J'] = TRUST4['J'].str.replace("(-).*","")
TRUST4['J'] = TRUST4['J'].str.replace("\*.*","")

# calculate J gene usage 
TRUST4_J_frequency = TRUST4.groupby(['sample','J'], as_index=False)['frequency_tool'].agg({'frequency_tool':'sum'})
TRUST4_J_frequency['tool'] = "TRUST4"

Load IMREP output

In [15]:
IMREP_files = glob.glob("../raw_data/IMREP/*.cdr3")

IMREP = pd.DataFrame()
for sample in IMREP_files:
    df_sample = pd.read_csv(sample)
    for sample_name in Sample_name_matchup_RNA_TCR_dict.keys():
        if sample_name in sample:
            df_sample['sample'] = sample_name
    IMREP = IMREP.append(df_sample)

In [16]:
# rename the columns 
IMREP.rename(columns = {'CDR3_AA_Seq':'amino_acid','Read_count':'reads_tool','V_chains':'V','D_chains':'D','J_chains':'J'},inplace=True)

# select TRB 
IMREP = IMREP[IMREP['Chain_type'] == 'TRB']

# calculate frequency
IMREP['frequency_tool'] = IMREP['reads_tool']/sum(IMREP['reads_tool'])

# select the columns that needed for analysis
IMREP = IMREP[['frequency_tool','reads_tool','V','J','sample']]
IMREP['tool'] = "IMREP"

# select rows with reads greater than 1
IMREP = IMREP[IMREP['reads_tool'] > 1]

# drop the rows with more than one inferred V, J genes
IMREP = IMREP[-IMREP['V'].str.contains(";")]
IMREP = IMREP[-IMREP['J'].str.contains(";")]
IMREP = IMREP[-IMREP['V'].str.contains("OR")]

In [17]:
# calculate V gene usage 
IMREP_V_frequency = IMREP.groupby(['sample','V'], as_index=False)['frequency_tool'].agg({'frequency_tool':'sum'})
IMREP_V_frequency['tool'] = "IMREP"

In [18]:
# calculate J gene usage 
IMREP_J_frequency = IMREP.groupby(['sample','J'], as_index=False)['frequency_tool'].agg({'frequency_tool':'sum'})
IMREP_J_frequency['tool'] = "IMREP"

Merge dataframe 

In [19]:
df_V = pd.concat([TCR_V_frequency,MIXCR_V_frequency,TRUST4_V_frequency,IMREP_V_frequency],sort=False)

# add sample type
df_V.loc[df_V['sample']=='CMT-baseline1C_CAGATC','sample_type'] = 'melanoma'
df_V.loc[df_V['sample']=='ESO1-sorted-T-cells_S13_L007','sample_type'] = 'PBMC'
df_V.loc[df_V['sample']=='HM-baseline1C_CGATGT','sample_type'] = 'melanoma'
df_V.loc[df_V['sample']=='INY1-sorted-T-cells_S14_L007','sample_type'] = 'PBMC'
df_V.loc[df_V['sample']=='INY2-sorted-T-cells_S15_L007','sample_type'] = 'PBMC'
df_V.loc[df_V['sample']=='JSSBaseline-RNA_GTGAAA','sample_type'] = 'melanoma'
df_V.loc[df_V['sample']=='LEK-OT110712A_CCGTCC','sample_type'] = 'melanoma'
df_V.loc[df_V['sample']=='LEK-baseline_CGATGT','sample_type'] = 'melanoma'
df_V.loc[df_V['sample']=='MP-11-28-12RNA_S2','sample_type'] = 'PBMC'
df_V.loc[df_V['sample']=='PT0112-B_S3','sample_type'] = 'melanoma'
df_V.loc[df_V['sample']=='PT0285-B_S5','sample_type'] = 'melanoma'
df_V.loc[df_V['sample']=='PT0310_S9','sample_type'] = 'melanoma'
df_V.loc[df_V['sample']=='RAS-baseline_TGACCA','sample_type'] = 'melanoma'
df_V.loc[df_V['sample']=='SAR-11-14-12RNA_S1','sample_type'] = 'PBMC'
df_V.loc[df_V['sample']=='TR2-PBMC_S12','sample_type'] = 'PBMC'

# add clonal type 
df_V.loc[df_V['sample']=='CMT-baseline1C_CAGATC','clonal_type'] = 'polyclonal'
df_V.loc[df_V['sample']=='ESO1-sorted-T-cells_S13_L007','clonal_type'] = 'monoclonal'
df_V.loc[df_V['sample']=='HM-baseline1C_CGATGT','clonal_type'] = 'polyclonal'
df_V.loc[df_V['sample']=='INY1-sorted-T-cells_S14_L007','clonal_type'] = 'monoclonal'
df_V.loc[df_V['sample']=='INY2-sorted-T-cells_S15_L007','clonal_type'] = 'monoclonal'
df_V.loc[df_V['sample']=='JSSBaseline-RNA_GTGAAA','clonal_type'] = 'polyclonal'
df_V.loc[df_V['sample']=='LEK-OT110712A_CCGTCC','clonal_type'] = 'polyclonal'
df_V.loc[df_V['sample']=='LEK-baseline_CGATGT','clonal_type'] = 'polyclonal'
df_V.loc[df_V['sample']=='MP-11-28-12RNA_S2','clonal_type'] = 'polyclonal'
df_V.loc[df_V['sample']=='PT0112-B_S3','clonal_type'] = 'polyclonal'
df_V.loc[df_V['sample']=='PT0285-B_S5','clonal_type'] = 'polyclonal'
df_V.loc[df_V['sample']=='PT0310_S9','clonal_type'] = 'polyclonal'
df_V.loc[df_V['sample']=='RAS-baseline_TGACCA','clonal_type'] = 'polyclonal'
df_V.loc[df_V['sample']=='SAR-11-14-12RNA_S1','clonal_type'] = 'polyclonal'
df_V.loc[df_V['sample']=='TR2-PBMC_S12','clonal_type'] = 'polyclonal'

df_V.to_csv('../summary_data/150bp/all_tools_TRB_V_usage.csv', index=False)

df_V

Unnamed: 0,sample,V,frequency_tool,tool,sample_type,clonal_type
0,CMT-baseline1C_CAGATC,TRBV1,0.000387,TCR-Seq,melanoma,polyclonal
1,CMT-baseline1C_CAGATC,TRBV10,0.069356,TCR-Seq,melanoma,polyclonal
2,CMT-baseline1C_CAGATC,TRBV11,0.034109,TCR-Seq,melanoma,polyclonal
3,CMT-baseline1C_CAGATC,TRBV12,0.002180,TCR-Seq,melanoma,polyclonal
4,CMT-baseline1C_CAGATC,TRBV13,0.004151,TCR-Seq,melanoma,polyclonal
...,...,...,...,...,...,...
152,TR2-PBMC_S12,TRBV30,0.000473,IMREP,PBMC,polyclonal
153,TR2-PBMC_S12,TRBV4,0.000560,IMREP,PBMC,polyclonal
154,TR2-PBMC_S12,TRBV5,0.001732,IMREP,PBMC,polyclonal
155,TR2-PBMC_S12,TRBV6,0.001110,IMREP,PBMC,polyclonal


In [20]:
df_J = pd.concat([TCR_J_frequency,MIXCR_J_frequency,TRUST4_J_frequency,IMREP_J_frequency],sort=False)

# add sample type
df_J.loc[df_J['sample']=='CMT-baseline1C_CAGATC','sample_type'] = 'melanoma'
df_J.loc[df_J['sample']=='ESO1-sorted-T-cells_S13_L007','sample_type'] = 'PBMC'
df_J.loc[df_J['sample']=='HM-baseline1C_CGATGT','sample_type'] = 'melanoma'
df_J.loc[df_J['sample']=='INY1-sorted-T-cells_S14_L007','sample_type'] = 'PBMC'
df_J.loc[df_J['sample']=='INY2-sorted-T-cells_S15_L007','sample_type'] = 'PBMC'
df_J.loc[df_J['sample']=='JSSBaseline-RNA_GTGAAA','sample_type'] = 'melanoma'
df_J.loc[df_J['sample']=='LEK-OT110712A_CCGTCC','sample_type'] = 'melanoma'
df_J.loc[df_J['sample']=='LEK-baseline_CGATGT','sample_type'] = 'melanoma'
df_J.loc[df_J['sample']=='MP-11-28-12RNA_S2','sample_type'] = 'PBMC'
df_J.loc[df_J['sample']=='PT0112-B_S3','sample_type'] = 'melanoma'
df_J.loc[df_J['sample']=='PT0285-B_S5','sample_type'] = 'melanoma'
df_J.loc[df_J['sample']=='PT0310_S9','sample_type'] = 'melanoma'
df_J.loc[df_J['sample']=='RAS-baseline_TGACCA','sample_type'] = 'melanoma'
df_J.loc[df_J['sample']=='SAR-11-14-12RNA_S1','sample_type'] = 'PBMC'
df_J.loc[df_J['sample']=='TR2-PBMC_S12','sample_type'] = 'PBMC'

# add clonal type 
df_J.loc[df_J['sample']=='CMT-baseline1C_CAGATC','clonal_type'] = 'polyclonal'
df_J.loc[df_J['sample']=='ESO1-sorted-T-cells_S13_L007','clonal_type'] = 'monoclonal'
df_J.loc[df_J['sample']=='HM-baseline1C_CGATGT','clonal_type'] = 'polyclonal'
df_J.loc[df_J['sample']=='INY1-sorted-T-cells_S14_L007','clonal_type'] = 'monoclonal'
df_J.loc[df_J['sample']=='INY2-sorted-T-cells_S15_L007','clonal_type'] = 'monoclonal'
df_J.loc[df_J['sample']=='JSSBaseline-RNA_GTGAAA','clonal_type'] = 'polyclonal'
df_J.loc[df_J['sample']=='LEK-OT110712A_CCGTCC','clonal_type'] = 'polyclonal'
df_J.loc[df_J['sample']=='LEK-baseline_CGATGT','clonal_type'] = 'polyclonal'
df_J.loc[df_J['sample']=='MP-11-28-12RNA_S2','clonal_type'] = 'polyclonal'
df_J.loc[df_J['sample']=='PT0112-B_S3','clonal_type'] = 'polyclonal'
df_J.loc[df_J['sample']=='PT0285-B_S5','clonal_type'] = 'polyclonal'
df_J.loc[df_J['sample']=='PT0310_S9','clonal_type'] = 'polyclonal'
df_J.loc[df_J['sample']=='RAS-baseline_TGACCA','clonal_type'] = 'polyclonal'
df_J.loc[df_J['sample']=='SAR-11-14-12RNA_S1','clonal_type'] = 'polyclonal'
df_J.loc[df_J['sample']=='TR2-PBMC_S12','clonal_type'] = 'polyclonal'


df_J.to_csv('../summary_data/150bp/all_tools_TRB_J_usage.csv', index=False)

df_J

Unnamed: 0,sample,J,frequency_tool,tool,sample_type,clonal_type
0,CMT-baseline1C_CAGATC,TRBJ1,0.439541,TCR-Seq,melanoma,polyclonal
1,CMT-baseline1C_CAGATC,TRBJ2,0.560148,TCR-Seq,melanoma,polyclonal
2,ESO1-sorted-T-cells_S13_L007,TRBJ1,0.041007,TCR-Seq,PBMC,monoclonal
3,ESO1-sorted-T-cells_S13_L007,TRBJ2,0.864367,TCR-Seq,PBMC,monoclonal
4,HM-baseline1C_CGATGT,TRBJ1,0.290288,TCR-Seq,melanoma,polyclonal
...,...,...,...,...,...,...
22,RAS-baseline_TGACCA,TRBJ2,0.000073,IMREP,melanoma,polyclonal
23,SAR-11-14-12RNA_S1,TRBJ1,0.000904,IMREP,PBMC,polyclonal
24,SAR-11-14-12RNA_S1,TRBJ2,0.000991,IMREP,PBMC,polyclonal
25,TR2-PBMC_S12,TRBJ1,0.004334,IMREP,PBMC,polyclonal
