In [3]:
import pandas as pd
import seaborn as sns

FEATURES_DIR = "../summary_data/complete_sample"
MIN_READS = 1  # rows are kept only when nReads is strictly greater than this


def _load_tool_features(tool, features_dir=FEATURES_DIR, min_reads=MIN_READS):
    """Load one tool's merged feature table and prepare it for merging.

    Reads ``<features_dir>/<tool>_merged_extracted_features.csv``, keeps the rows
    whose ``nReads`` is greater than ``min_reads``, adds a ``Frequency`` column and
    renames ``nReads`` / ``Frequency`` to ``nReads_<tool>`` / ``Frequency_<tool>``
    so the per-tool tables can be merged without column-name clashes.

    Parameters
    ----------
    tool : str
        Prefix of the CSV file name; also used as the column suffix.
    features_dir : str
        Directory that holds the ``*_merged_extracted_features.csv`` files.
    min_reads : int
        Exclusive lower bound on ``nReads``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the original row labels of the kept rows are preserved.
    """
    csv_path = "{}/{}_merged_extracted_features.csv".format(features_dir, tool)
    return (
        pd.read_csv(csv_path)
        # Select rows with nReads greater than the threshold.
        .loc[lambda df: df["nReads"] > min_reads]
        # Frequency is computed with respect to the rows kept above.  .assign()
        # returns a new frame, so no column is ever written into a filtered slice
        # (the pattern that triggers pandas' SettingWithCopyWarning).
        # NOTE(review): the denominator is the nReads total over ALL rows in the
        # file, not a per-"Sample" total -- confirm this is the intended scope.
        .assign(Frequency=lambda df: df["nReads"] / float(df["nReads"].sum()))
        # Rename nReads and Frequency according to the tool for proper merging.
        .rename(columns={
            "nReads": "nReads_{}".format(tool),
            "Frequency": "Frequency_{}".format(tool),
        })
    )


IMREP_df = _load_tool_features("IMREP")
TRUST4_df = _load_tool_features("TRUST4")
MIXCR_df = _load_tool_features("MIXCR")
TCR_df = _load_tool_features("TCR")

# Reference only (kept commented out): sample names for the RNA-seq tool tables.
# Presumably these are the values of the "Sample" column in the IMREP / TRUST4 /
# MIXCR frames -- verify with the commented-out print calls further down.
# IMREP_TRUST4_MIXCR_sample_names = ['CMT-baseline1C_CAGATC' , 'ESO1-sorted-T-cells_S13_L007',
#                                    'HM-baseline1C_CGATGT' , 'INY1-sorted-T-cells_S14_L007',
#                                    'INY2-sorted-T-cells_S15_L007' , 'JSSBaseline-RNA_GTGAAA',
#                                    'LEK-OT110712A_CCGTCC' , 'LEK-baseline_CGATGT' , 'MP-11-28-12RNA_S2',
#                                    'PT0310_S9' , 'RAS-baseline_TGACCA',
#                                    'PT0112-B_S3' , 'PT0285-B_S5' ,
#                                    'SAR-11-14-12RNA_S1' , 'TR2-PBMC_S12']


# Reference only (kept commented out): sample names for the TCR table.
# NOTE(review): these do not match the RNA-seq names above string-for-string, so a
# merge on "Sample" would need a name mapping -- confirm before merging TCR_df.
# TCR_SAMPLE_NAMES = ['MP_11-28_PBMC' , 'Pt204_Baseline_TCR_seq' , 'Pt294_baseline_TCR_seq', 
#                     'Pt294_on-tx_TCR_seq' , 'Pt310_baseline_TCRseq' , 'Pt310_on-tx_TCR_seq',
#                     'Pt_112_baseline_TCR_seq' , 'Pt_285_baseline_TCR_seq', 'Pt_308_baseline_TCR_seq',
#                     'Pt_325_baseline_TCR_seq', 'RearrangementDetails_ESO1_sorted_infusion',
#                     'RearrangementDetails_INY1_sorted_infusion', 'RearrangementDetails_INY2_sorted_infusion',
#                     'RearrangementDetails_TR-PBMC' , 'SAR_11-14_PBMC']


# Debug helpers: uncomment to list the distinct sample names in each table.
# print(TCR_df["Sample"].unique())

# print(MIXCR_df["Sample"].unique())

# Show the first rows of the TCR table as the cell's output.
TCR_df.head()


Unnamed: 0,Sample,CDR3,nReads_TCR,Frequency_TCR
0,MP_11-28_PBMC,CAISENVLYGYTF,2,2.172569e-07
2,MP_11-28_PBMC,CASSEAQGFEQYF,2,2.172569e-07
10,MP_11-28_PBMC,CASSSPLGRYEQYF,3,3.258854e-07
13,MP_11-28_PBMC,CASSSTSGSPDTQYF,3,3.258854e-07
16,MP_11-28_PBMC,CSAPWQGGEKLFF,6,6.517707e-07


In [None]:
# Lookup from a sample name on the RNA side to the corresponding sample name on the
# TCR side, written as (RNA name, TCR name) pairs.
# NOTE(review): the pairing is maintained by hand; confirm each match against the
# sample sheet before relying on it.
Sample_name_matchup_RNA_TCR_dict = dict((
    ("ESO1-sorted-T-cells_S13_L007", "RearrangementDetails_ESO1_sorted_infusion"),
    ("INY1-sorted-T-cells_S14_L007", "RearrangementDetails_INY1_sorted_infusion"),
    ("INY2-sorted-T-cells_S15_L007", "RearrangementDetails_INY2_sorted_infusion"),
    ("TR2-PBMC_S12", "RearrangementDetails_TR-PBMC"),
    ("SAR-11-14-12RNA_S1", "SAR_11-14_PBMC"),
    ("MP-11-28-12RNA_S2", "MP_11-28_PBMC"),
    ("CMT-baseline1C_CAGATC", "Pt204_Baseline_TCR_seq"),
    ("HM-baseline1C_CGATGT", "Pt310_baseline_TCRseq"),
    ("PT0310_S9", "Pt310_on-tx_TCR_seq"),
    ("LEK-baseline_CGATGT", "Pt294_baseline_TCR_seq"),
    ("LEK-OT110712A_CCGTCC", "Pt294_on-tx_TCR_seq"),
    ("JSSBaseline-RNA_GTGAAA", "Pt_308_baseline_TCR_seq"),
    ("RAS-baseline_TGACCA", "Pt_325_baseline_TCR_seq"),
    ("PT0112-B_S3", "Pt_112_baseline_TCR_seq"),
    ("PT0285-B_S5", "Pt_285_baseline_TCR_seq"),
))

In [4]:
# Combine the per-tool tables on the composite key (Sample, CDR3).  An outer join
# keeps every (Sample, CDR3) pair reported by at least one tool; the columns of a
# tool that did not report the pair come back as NaN and are replaced with 0.
MERGE_KEYS = ['Sample', 'CDR3']

merge_IMREP_TRUST4 = IMREP_df.merge(TRUST4_df, how='outer', on=MERGE_KEYS)
merge_IMREP_TRUST4 = merge_IMREP_TRUST4.fillna(0)

merge_IMREP_TRUST4_MIXCR = MIXCR_df.merge(merge_IMREP_TRUST4, how='outer', on=MERGE_KEYS)
merge_IMREP_TRUST4_MIXCR = merge_IMREP_TRUST4_MIXCR.fillna(0)
# merge_IMREP_TRUST4_MIXCR_TCR = pd.merge(TCR_df, merge_IMREP_TRUST4_MIXCR, how='outer', on=['Sample', 'CDR3']).fillna(0)

merge_IMREP_TRUST4_MIXCR.head()

Unnamed: 0,Sample,CDR3,nReads_MIXCR,Frequency_MIXCR,nReads_IMREP,Frequency_IMREP,nReads_TRUST4,Frequency_TRUST4
0,CMT-baseline1C_CAGATC,CQSYDRSLSGWVF,5.0,7e-06,11.0,1.2e-05,10.0,7e-06
1,CMT-baseline1C_CAGATC,CQQSYSRLYTF,4.0,5e-06,0.0,0.0,8.0,5e-06
2,CMT-baseline1C_CAGATC,CQQYHNWPPWAF,9.0,1.2e-05,2.0,2e-06,25.0,1.7e-05
3,CMT-baseline1C_CAGATC,CSSYAGSNNFVF,3.0,4e-06,0.0,0.0,6.0,4e-06
4,CMT-baseline1C_CAGATC,CHRYTSK_**GKFF,2.0,3e-06,0.0,0.0,0.0,0.0
