In [30]:
import warnings

# Silence every warning of category FutureWarning for the rest of the session.
# NOTE(review): this presumably hides pandas deprecation notices triggered by
# the cells below (e.g. DataFrame.append) -- confirm, and prefer fixing the
# calls over muting the warning.
warnings.filterwarnings("ignore", category=FutureWarning)

In [31]:
import pandas as pd
import os
import fnmatch

'''
This code extracts CDR3 amino acid sequences from ***MiXCR*** outputs.
CDR3s are extracted for the beta chain (TRB); the chain variable can be changed.
Only CDR3s that start with amino acid C and end with amino acid F are kept.
'''


def extract_from_mixcr(mixcr_data_file_path, chain='TRB'):
    """Collect CDR3 amino-acid sequences and read counts from MiXCR clone tables.

    Every sub-folder of ``mixcr_data_file_path`` is treated as one sample.
    Inside each sample folder, every file whose name matches
    ``*clones_<chain>.tsv`` is read as a tab-separated table and filtered:

    * ``aaSeqCDR3`` must start with ``C`` and end with ``F``;
    * ``allJHitsWithScore`` must be present and contain ``chain``.

    Parameters
    ----------
    mixcr_data_file_path : str
        Directory that holds one folder per sample. If the path contains
        ``'rep_seq'`` the count column is named ``Count_REP_SEQ_MiXCR``,
        otherwise ``Count_MiXCR``.
    chain : str, default 'TRB'
        Chain label used both in the file-name pattern and as the literal
        substring searched for in ``allJHitsWithScore``.

    Returns
    -------
    pandas.DataFrame
        Columns ``['Sample', 'CDR3', <count column>]``, one row per retained
        clone. The frame is also printed. Columns are kept as ``object``
        dtype so that a later outer merge + ``fillna(0)`` still shows integer
        counts instead of floats.
    """
    output_file_pattern = f"*clones_{chain}.tsv"
    count_column = 'Count_REP_SEQ_MiXCR' if 'rep_seq' in mixcr_data_file_path else 'Count_MiXCR'
    columns = ['Sample', 'CDR3', count_column]

    # Rows are gathered in a plain list and turned into a frame once at the
    # end; DataFrame.append no longer exists in pandas >= 2.0 and was quadratic.
    records = []
    for sample_folder in os.listdir(mixcr_data_file_path):
        folder_path = os.path.join(mixcr_data_file_path, sample_folder)
        if not os.path.isdir(folder_path):
            continue
        # Drop the last four characters of the folder name.
        # NOTE(review): presumably a "_v_4" suffix; this is unconditional, so a
        # folder with a different suffix is truncated as well -- confirm.
        sample_name = sample_folder[:-4]
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            if not (os.path.isfile(file_path) and fnmatch.fnmatch(file_name, output_file_pattern)):
                continue
            # `sep` must be passed by keyword on pandas >= 2.0.
            clones = pd.read_csv(file_path, sep='\t')
            # Missing values become '' so they simply fail the filters instead
            # of producing NaN inside the boolean mask.
            cdr3 = clones['aaSeqCDR3'].fillna('').astype(str)
            j_hits = clones['allJHitsWithScore'].fillna('').astype(str)
            keep = (
                cdr3.str.startswith('C')
                & cdr3.str.endswith('F')
                & j_hits.str.contains(chain, regex=False)
            )
            selected = clones.loc[keep, ['aaSeqCDR3', 'readCount']]
            # Counts are cast to int to match the format of the other tools.
            records.extend(
                {'Sample': sample_name, 'CDR3': sequence, count_column: int(read_count)}
                for sequence, read_count in zip(selected['aaSeqCDR3'], selected['readCount'])
            )

    mixcr_df = pd.DataFrame(records, columns=columns).astype(object)
    print(mixcr_df)
    return mixcr_df

# Dataset selection for MiXCR: exactly one path is active at a time.
# Later cells test whether this path contains '150' to decide if the REP-SEQ
# comparison is run.
##mixcr_data_file_path='/dsi/sbm/linoym/Benchmarking_RNASeq/mixcr/rna_seq_150bp_pe_no_editing'
mixcr_data_file_path='/dsi/sbm/linoym/Benchmarking_RNASeq/mixcr/rna_seq_75bp_se_no_editing'
# Build the (Sample, CDR3, count) table for the selected dataset; the function also prints it.
mixcr_df = extract_from_mixcr(mixcr_data_file_path)

   Sample              CDR3 Count_MiXCR
0   15_S5   CASTPLPASSYEQYF           1
1    5_S5    CASKMDSYTGELFF           1
2    5_S5      CSVVGDRGGYTF           1
3   19_S9     CASSFDGKETQYF           1
4   16_S6    CASSPPTGSNEQFF           2
5   16_S6     CASRRTETYEQYF           1
6   16_S6   CASSSGTSTLETQYF           1
7   16_S6    CASSYSNQGSEAFF           1
8   16_S6      CASSRSGNEQFF           1
9   16_S6  CASSFDVPPLNTEAFF           1
10  16_S6       CASSLDTEAFF           1
11  16_S6   CASN*QG_PAHEQYF           1
12  16_S6    CASGDRGRTDTQYF           1
13  16_S6    CASSQDQSSYEQYF           1
14  16_S6    CSARDPPVNEKLFF           1
15  11_S1      CASSEGAREQFF           1
16  17_S7   CASSLALAEGDEQYF           2
17  17_S7    CASSAPPGFGGYTF           1
18   4_S4    CASSTGVVYNEQFF           1
19  18_S8   CASRYFLSSYNEQFF           1
20  18_S8  CSVKTQDISSYNEQFF           1
21  14_S4    CASSLELARETQYF           1


In [32]:
### TRUST4

import pandas as pd
import os
import fnmatch

'''
This code extracts CDR3 amino acid sequences from ***TRUST4*** outputs.
CDR3s are extracted for the beta chain (TRB); the chain variable can be changed.
Only CDR3s that start with amino acid C and end with amino acid F are kept.
'''


def extract_from_trust4(trust4_data_file_path, chain='TRB'):
    """Collect CDR3 amino-acid sequences and counts from TRUST4 report tables.

    Every sub-folder of ``trust4_data_file_path`` is treated as one sample.
    Inside each sample folder, every file whose name matches ``*_report.tsv``
    is read as a tab-separated table and filtered:

    * ``CDR3aa`` must start with ``C`` and end with ``F``;
    * the ``C`` column must be present and contain ``chain``.

    Parameters
    ----------
    trust4_data_file_path : str
        Directory that holds one folder per sample; the folder name is used
        unchanged as the sample name.
    chain : str, default 'TRB'
        Literal substring searched for in the ``C`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``['Sample', 'CDR3', 'Count_TRUST4']``, one row per retained
        report line, with the count taken from ``#count``. The frame is also
        printed. Columns are kept as ``object`` dtype so that a later outer
        merge + ``fillna(0)`` still shows integer counts instead of floats.
    """
    output_file_pattern = "*_report.tsv"
    columns = ['Sample', 'CDR3', 'Count_TRUST4']

    # Rows are gathered in a plain list and turned into a frame once at the
    # end; DataFrame.append no longer exists in pandas >= 2.0 and was quadratic.
    records = []
    for sample_folder in os.listdir(trust4_data_file_path):
        folder_path = os.path.join(trust4_data_file_path, sample_folder)
        if not os.path.isdir(folder_path):
            continue
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            if not (os.path.isfile(file_path) and fnmatch.fnmatch(file_name, output_file_pattern)):
                continue
            # `sep` must be passed by keyword on pandas >= 2.0.
            report = pd.read_csv(file_path, sep='\t')
            # Missing values become '' so they simply fail the filters instead
            # of producing NaN inside the boolean mask.
            cdr3 = report['CDR3aa'].fillna('').astype(str)
            constant_gene = report['C'].fillna('').astype(str)
            keep = (
                cdr3.str.startswith('C')
                & cdr3.str.endswith('F')
                & constant_gene.str.contains(chain, regex=False)
            )
            selected = report.loc[keep, ['CDR3aa', '#count']]
            records.extend(
                {'Sample': sample_folder, 'CDR3': sequence, 'Count_TRUST4': count}
                for sequence, count in zip(selected['CDR3aa'].tolist(), selected['#count'].tolist())
            )

    trust4_df = pd.DataFrame(records, columns=columns).astype(object)
    print(trust4_df)
    return trust4_df

# Dataset selection for TRUST4: exactly one path is active at a time.
# Later cells test whether this path contains '150' to decide if the REP-SEQ
# comparison is run.
##trust4_data_file_path='/dsi/sbm/linoym/Benchmarking_RNASeq/trust4/rna_seq_150bp_pe_no_editing'
trust4_data_file_path='/dsi/sbm/linoym/Benchmarking_RNASeq/trust4/rna_seq_75bp_se_no_editing'
# Build the (Sample, CDR3, count) table for the selected dataset; the function also prints it.
trust4_df = extract_from_trust4(trust4_data_file_path)


   Sample              CDR3 Count_TRUST4
0   19_S9     CASSFDGKETQYF            1
1   14_S4    CASSLELARETQYF            1
2   18_S8   CASRYFLSSYNEQFF            1
3   11_S1      CASSEGAREQFF            1
4   17_S7   CASSLALAEGDEQYF            5
5   17_S7    CASSLLTGSVEQYF            2
6   17_S7   CATSPGVSGANVLTF            1
7   17_S7   CASSTGNPGGDTQYF            1
8   17_S7    CASSAPPGFGGYTF            1
9   17_S7   CASSATSSRSYEQYF            1
10  16_S6    CASSPPTGSNEQFF            2
11  16_S6    CASSQDQSSYEQYF            2
12  16_S6  CATSRDEGLTYNEQFF            1
13  16_S6   CASSQEGQGENEQYF            1
14  16_S6  CASSFDVPPLNTEAFF            1
15  16_S6       CASSLDTEAFF            1
16  16_S6    CSARDPPVNEKLFF            1
17  16_S6      CASSRSGNEQFF            1
18  16_S6    CASGDRGRTDTQYF            1
19  16_S6    CASSYSNQGSEAFF            1
20  16_S6   CASSSGTSTLETQYF            1
21  16_S6     CASRRTETYEQYF            1
22  16_S6    CASTAPTSSYEQYF            1
23  16_S6   CASS

In [33]:
### MiXCR REP-SEQ
# REP-SEQ results are only comparable with the 150 bp runs, so nothing is
# loaded unless both tool paths point at a 150 bp dataset.
if all('150' in tool_path for tool_path in (mixcr_data_file_path, trust4_data_file_path)):
    rep_seq_data_file_path = '/dsi/sbm/linoym/Benchmarking_RNASeq/mixcr/rep_seq_mixcr'
    rep_seq_mixcr_df = extract_from_mixcr(rep_seq_data_file_path)

In [36]:
### JOIN df
# Outer join on ('CDR3', 'Sample'): a sequence reported by only one tool is
# kept, with a missing count for the other tool.
merged_df = mixcr_df.merge(trust4_df, on=['CDR3', 'Sample'], how='outer')
# REP-SEQ results can be compared only for 150 bp.
if all('150' in tool_path for tool_path in (mixcr_data_file_path, trust4_data_file_path)):
    merged_df = merged_df.merge(rep_seq_mixcr_df, on=['CDR3', 'Sample'], how='outer')

# Sort while the missing counts are still NaN, then replace NaN with zero:
# a NaN cell means that the tool did not find the sequence.
merged_df = (
    merged_df
    .sort_values(by=['Sample', 'Count_MiXCR', 'Count_TRUST4'], ascending=True)
    .fillna(0)
)
# Keep the complete table for the next section.
all_data_df = merged_df
# Drop rows that neither MiXCR nor TRUST4 reported (both counts equal to 0).
merged_df = merged_df[~((merged_df['Count_MiXCR'] == 0) & (merged_df['Count_TRUST4'] == 0))]
print(merged_df)
merged_df.to_csv('extract_cdr3_from_all_tools.csv', index=False)

   Sample              CDR3  Count_MiXCR  Count_TRUST4
15  11_S1      CASSEGAREQFF            1             1
21  14_S4    CASSLELARETQYF            1             1
0   15_S5   CASTPLPASSYEQYF            1             1
5   16_S6     CASRRTETYEQYF            1             1
6   16_S6   CASSSGTSTLETQYF            1             1
7   16_S6    CASSYSNQGSEAFF            1             1
8   16_S6      CASSRSGNEQFF            1             1
9   16_S6  CASSFDVPPLNTEAFF            1             1
10  16_S6       CASSLDTEAFF            1             1
12  16_S6    CASGDRGRTDTQYF            1             1
14  16_S6    CSARDPPVNEKLFF            1             1
13  16_S6    CASSQDQSSYEQYF            1             2
11  16_S6   CASN*QG_PAHEQYF            1             0
4   16_S6    CASSPPTGSNEQFF            2             2
26  16_S6  CATSRDEGLTYNEQFF            0             1
27  16_S6   CASSQEGQGENEQYF            0             1
28  16_S6    CASTAPTSSYEQYF            0             1
29  16_S6 

In [37]:
print('########## FOR TABLE 4 #########')
# Number of distinct CDR3 sequences per sample, reported tool by tool.
# For example, if the sequence CDR3: CA....FF appears 10 times in Sample X,
# it is counted as one unique CDR3.
tool_count_columns = [('MiXCR', 'Count_MiXCR'), ('TRUST4', 'Count_TRUST4')]
# REP-seq counts exist only for the 150 bp runs.
if all('150' in tool_path for tool_path in (mixcr_data_file_path, trust4_data_file_path)):
    tool_count_columns.append(('REP_SEQ_MiXCR', 'Count_REP_SEQ_MiXCR'))

for tool_label, count_column in tool_count_columns:
    # Keep only the rows where this tool found the sequence (non-zero count).
    found_by_tool = all_data_df[all_data_df[count_column] != 0]
    unique_count_rep_seq_df = found_by_tool.groupby('Sample')['CDR3'].nunique()
    print(tool_label)
    print(unique_count_rep_seq_df)
    print()

########## FOR TABLE 3 #########
MiXCR
Sample
11_S1     1
14_S4     1
15_S5     1
16_S6    11
17_S7     2
18_S8     2
19_S9     1
4_S4      1
5_S5      2
Name: CDR3, dtype: int64

TRUST4
Sample
11_S1     1
14_S4     1
15_S5     1
16_S6    15
17_S7     6
18_S8     1
19_S9     1
4_S4      3
5_S5      3
Name: CDR3, dtype: int64

