In [1]:
import numpy as np  # arithmetic operations
import pandas as pd  # handling dataframes
from Bio.Seq import translate  # protein translation


In [2]:

def adjust_reading_frame(row, mut_seq):
    """
    Locates the window of `row` that lines up with `mut_seq` and returns it.

    `mut_seq` is split on the literal 'NNK'. Every non-empty piece is a constant
    fragment that must occur in `row` at the same offset it has inside `mut_seq`;
    the three bases under each 'NNK' are not compared.

    Parameters:
    row (str): The sequence to be searched.
    mut_seq (str): The mutation sequence containing fragments separated by 'NNK'.

    Returns:
    str: `row[start : start + len(mut_seq)]` when exactly one start position
         satisfies every constant fragment. None when no start position does,
         or when `mut_seq` contains no constant fragment at all.

    Raises:
    Exception: If more than one start position satisfies every fragment.
    """
    # None means "no constant fragment searched yet"; an empty set means
    # "searched, and nothing fits". Keeping the two states apart prevents a
    # later fragment from re-seeding the candidates after an earlier one failed.
    candidate_starts = None

    # Offset of the current fragment inside mut_seq.
    offset = 0

    for fragment in mut_seq.split('NNK'):
        if fragment:
            starts = set()
            hit = row.find(fragment)
            while hit != -1:
                # A start below zero would put part of the template before
                # row[0]; slicing with it would silently wrap around.
                if hit >= offset:
                    starts.add(hit - offset)
                # Step by one base so overlapping occurrences are also seen.
                hit = row.find(fragment, hit + 1)

            if candidate_starts is None:
                candidate_starts = starts
            else:
                candidate_starts &= starts

            # Once no start fits, no later fragment can bring one back.
            if not candidate_starts:
                return None

        # Each piece (empty or not) is followed by one 3-base 'NNK' in mut_seq.
        # The value after the final piece is never used.
        offset += len(fragment) + 3

    if not candidate_starts:
        return None
    if len(candidate_starts) > 1:
        raise Exception('Enter longer sequence')

    (start,) = candidate_starts
    return row[start : start + len(mut_seq)]


In [3]:

def translate_mutations(row, mut_seq):
    """
    Translates the codons of `row` that sit under the 'NNK' placeholders of `mut_seq`.

    Parameters:
    row (str): The sequence to be translated, aligned to the start of `mut_seq`.
    mut_seq (str): The mutation sequence containing fragments separated by 'NNK'.

    Returns:
    str: The translations of the 'NNK' codons, concatenated in order, or
         'bad sequencing' if `row` is empty or None.
    """
    if not row:
        return 'bad sequencing'

    # Every piece except the last one is followed by exactly one 'NNK' codon.
    flanks = mut_seq.split('NNK')[:-1]

    protein = ""
    cursor = 0
    for flank in flanks:
        codon_start = cursor + len(flank)
        cursor = codon_start + 3
        triplet = row[codon_start:cursor]
        # Nothing to translate when `row` ends before this codon.
        if not triplet:
            continue
        protein = protein + translate(triplet)

    return protein


In [4]:

def find_recording_frame(row, rec_seq):
    """
    Finds the recording frame of a given sequence based on a recording region sequence.

    `rec_seq` is split on 'C'. Every non-empty piece is a constant fragment that
    must occur in `row` at the same offset it has inside `rec_seq`; the positions
    holding 'C' in `rec_seq` are not compared directly. A candidate window is
    kept only if its combined count of 'C' and 'T' equals the combined count of
    'C' and 'T' in `rec_seq`.

    Parameters:
    row (str): The sequence to be examined.
    rec_seq (str): The recording region sequence containing fragments separated by 'C'.

    Returns:
    str: The substring of `row` (of length `len(rec_seq)`) that matches the
         recording region if a unique match is found. None if no match is found
         or if `rec_seq` contains no constant fragment.

    Raises:
    Exception: If multiple possible recording frames are found.
    """
    # Combined number of 'C' and 'T' characters in the recording region.
    num_CT = rec_seq.count('C') + rec_seq.count('T')
    window = len(rec_seq)

    # None means "no constant fragment searched yet"; an empty set means
    # "searched, and nothing fits". Keeping the two states apart prevents a
    # later fragment from re-seeding the candidates after an earlier one failed.
    candidate_starts = None

    # Offset of the current fragment inside rec_seq.
    offset = 0

    for fragment in rec_seq.split('C'):
        if fragment:
            starts = set()
            hit = row.find(fragment)
            while hit != -1:
                # A start below zero would put part of the region before
                # row[0]; slicing with it would silently wrap around.
                if hit >= offset:
                    starts.add(hit - offset)
                # Step by one base so overlapping occurrences are also seen.
                hit = row.find(fragment, hit + 1)

            if candidate_starts is None:
                candidate_starts = starts
            else:
                candidate_starts &= starts

            # Once no start fits, no later fragment can bring one back.
            if not candidate_starts:
                return None

        # Each piece (empty or not) is followed by one 'C' in rec_seq.
        # The value after the final piece is never used.
        offset += len(fragment) + 1

    if not candidate_starts:
        return None

    # Keep only the windows with the expected combined number of 'C' and 'T'.
    matches = []
    for start in sorted(candidate_starts):
        region = row[start : start + window]
        if region.count('C') + region.count('T') == num_CT:
            matches.append(region)

    if not matches:
        return None
    if len(matches) > 1:
        raise Exception('Enter longer sequence')
    return matches[0]


In [5]:

def main(df, filename, targetSeq, recording='CCACCCGCaaaa',
         mut_rec_filename='2NNK_Mut_Rec_1.csv', bad_filename='2NNK_bad_1.csv'):
    """
    Processes mutation data and records it to CSV files.

    Three files are written: a per-read table (`mut_rec_filename`), the reads
    flagged 'bad sequencing' (`bad_filename`), and a per-mutation summary
    (`filename`). The head of each table is printed.

    Parameters:
    df (pandas.DataFrame): Input table. The columns 'TargetSequence', 'Reads'
        and 'Type' are read. The columns 'FixedSequence',
        'Mutation Reading Frame', 'Mutations' and 'Recording' are added to
        this frame in place.
    filename (str): The name of the output CSV file holding the per-mutation summary.
    targetSeq (str): The target sequence to process; upper-cased before use.
    recording (str, optional): The recording region sequence; upper-cased
        before use. Default is 'CCACCCGCaaaa'.
    mut_rec_filename (str, optional): Output CSV for the per-read mutation and
        recording table. Default is '2NNK_Mut_Rec_1.csv'.
    bad_filename (str, optional): Output CSV for the reads flagged
        'bad sequencing'. Default is '2NNK_bad_1.csv'.

    Returns:
    None
    """
    mutStrand = targetSeq.upper()
    recRegion = recording.upper()

    # Remove dashes and convert to uppercase
    df.loc[:, 'FixedSequence'] = df['TargetSequence'].apply(lambda seq: seq.replace("-", "").upper())
    # Adjust the reading frame according to the mutation strand
    df.loc[:, 'Mutation Reading Frame'] = df['FixedSequence'].apply(lambda x: adjust_reading_frame(x, mutStrand))
    # Translate mutations based on the reading frame and mutation strand
    df.loc[:, 'Mutations'] = df['Mutation Reading Frame'].apply(lambda x: translate_mutations(x, mutStrand))
    # Find the recording frame within the fixed sequence
    df.loc[:, 'Recording'] = df['FixedSequence'].apply(lambda x: find_recording_frame(x, recRegion))

    # Select relevant columns for mutation and recording
    df_mut_rec = df[['TargetSequence', 'Mutations', 'Recording', 'Reads', 'Type']]
    df_mut_rec.to_csv(mut_rec_filename)

    # Filter sequences marked as 'bad sequencing'
    df_bad_seq = df[df['Mutations'] == 'bad sequencing']
    df_bad_seq.to_csv(bad_filename)

    # Total reads per mutation (includes the 'bad sequencing' group)
    df0 = df[['Mutations', 'Recording', 'Reads']]
    grouped_df = (
        df0.groupby('Mutations')['Reads'].sum()
        .reset_index()
        .rename(columns={'Reads': 'Reads sum'})
    )

    # Keep reads that translated, have a recording region, and whose recording
    # region differs from the reference recording sequence.
    keep = (
        (df0['Mutations'] != 'bad sequencing')
        & (df0['Recording'] != recRegion)
        & df0['Recording'].notna()
    )
    df1 = df0[keep]

    # Calculate T count and its squared value in the recording sequence,
    # each also weighted by the read count
    df1 = df1.assign(T_count=df1['Recording'].apply(lambda seq: seq.count("T")),
                     T_squared=lambda x: x['T_count'] ** 2,
                     T_total=lambda x: x['T_count'] * x['Reads'],
                     T_squared_tot=lambda x: x['T_squared'] * x['Reads'])

    # Per-mutation sums over the kept reads, computed in one pass. The column
    # names are not valid identifiers, hence the dict unpacking.
    edit_summary = (
        df1.groupby('Mutations')
        .agg(**{
            'Edit reads': ('Reads', 'sum'),
            'T_sum': ('T_total', 'sum'),
            'T^2_sum': ('T_squared_tot', 'sum'),
        })
        .reset_index()
    )

    # Left merge keeps every mutation from the totals, even without kept reads
    merged = pd.merge(grouped_df, edit_summary, on='Mutations', how='left')
    # Save the merged DataFrame to a CSV file
    merged.to_csv(filename)

    # Print the DataFrames for inspection, labelled with the files actually written
    print(f"Part of {mut_rec_filename}:")
    print(df_mut_rec.head())
    print(f"\nPart of {bad_filename}:")
    print(df_bad_seq.head())
    print(f"\nPart of {filename}:")
    print(merged.head())


In [6]:

# if file format is '.xlsx'
# The workbook is opened in a `with` block so its file handle is closed as soon
# as the sheet has been read. With keep_default_na=False, only the values listed
# in na_values ('_') are parsed as NaN.
input_filename = 'Plate1_abundance.xlsx'
with pd.ExcelFile(input_filename) as file_obj:
    df = pd.read_excel(file_obj, sheet_name='Sheet1', keep_default_na=False, na_values=['_'])

# if file format is '.csv'
#df = pd.read_csv("data/4NNK.csv")

# Execute the main function
output_filename = '2NNK_summary_1.csv'
target_seq = 'gtcgaagagNNKNNKggcaacaaac'
main(df, output_filename, target_seq)


Part of 2NNK_Mut_Rec.csv:
                                      TargetSequence Mutations     Recording  \
0  ATAGAAGTATTTTAATGCTTCCCGAAGAAGTCGAAGAGGTGATTGG...        VI  TTATTTGCAAAA   
1  ATAGAAGTATTTTAATGCTTCCCGAAGAAGTCGAAGAGGTGATTGG...        VI  CCACCCGCAAAA   
2  ATAGAAGTATTTTAATGCTTCCCGAAGAAGTCGAAGAGGTGATTGG...        VI  TTATTCGCAAAA   
3  ATAGAAGTATTTTAATGCTTCCCGAAGAAGTCGAAGAGcgGTG--G...        RW  TTATTCGCAAAA   
4  ATAGAAGTATTTTAATGCTTCCCGAAGAAGTCGAAGAGGTGATTGG...        VI  CCATTCGCAAAA   

   Reads                    Type  
0     37          5 Base Changes  
1     33                      WT  
2     26          4 Base Changes  
3     20  Insertion and Deletion  
4     19          2 Base Changes  

Part of 2NNK_bad.csv:
                                         TargetSequence  Reads  AvgQScore  \
675   ATAGAAGTATTTTAATGCTTCCCGAAGAAGTCGAAGAGCAG-TTGg...      5         32   
775   ATAGAAGTATTTTAATGCTTCCCGAAGAAGTCGAAGAGG-----GG...      5         32   
954   ATAGAAGTATTTTAATGCTTCCC

In [7]:
import pandas.testing as pdt

In [8]:
# Compare the freshly written summary with the reference summary file.
table1 = pd.read_csv('2NNK_summary.csv')
table2 = pd.read_csv('2NNK_summary_1.csv')

try:
    pdt.assert_frame_equal(table1, table2)
except AssertionError as mismatch:
    # Report the first difference instead of stopping the notebook.
    print("The DataFrames are not equal:")
    print(mismatch)
else:
    print("The DataFrames are equal.")


The DataFrames are equal.


In [9]:
# Compare the freshly written per-read table with the reference file.
table1 = pd.read_csv('2NNK_Mut_Rec.csv')
table2 = pd.read_csv('2NNK_Mut_Rec_1.csv')

try:
    pdt.assert_frame_equal(table1, table2)
except AssertionError as mismatch:
    # Report the first difference instead of stopping the notebook.
    print("The DataFrames are not equal:")
    print(mismatch)
else:
    print("The DataFrames are equal.")


The DataFrames are equal.


In [10]:
# Compare the freshly written 'bad sequencing' table with the reference file.
table1 = pd.read_csv('2NNK_bad.csv')
table2 = pd.read_csv('2NNK_bad_1.csv')

try:
    pdt.assert_frame_equal(table1, table2)
except AssertionError as mismatch:
    # Report the first difference instead of stopping the notebook.
    print("The DataFrames are not equal:")
    print(mismatch)
else:
    print("The DataFrames are equal.")


The DataFrames are equal.
