# lookingIntoReadTerminalEnds.ipynb
## Marcus Viscardi,    November 01, 2022

General goal here is to take the assigned reads from 221031_RNAStds_Nano3P_Assignments.csv and assess how accurate those assignments are!

In [2]:
import mappy as mp

import sys
sys.path.insert(0, '/data16/marcus/scripts/nanoporePipelineScripts')
from nanoporePipelineCommon import *

import numpy as np
import pandas as pd
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', None)

print("imports done.")

imports done.


In [16]:
import_csv = "221109_RNAStds_Nano3P_Assignments.trimmed.csv"
raw_df = pd.read_csv(import_csv)

In [4]:
path_to_std_ref_fasta = "220902_version2.0_releventSequences_wOutTails.fasta"
stds_ref_dict = {}
from tqdm import tqdm
read_iterator = tqdm(mp.fastx_read(path_to_std_ref_fasta, read_comment=True),
                         desc=f"Reading fastx entries and assigning to standards"
                         )
for read_id, sequence, _, comments in read_iterator:
    cutdown_read_id = read_id[-6:-4]
    stds_ref_dict[cutdown_read_id] = sequence

def print_mappy_hit_alignment_for_stds(read_id,
                                       cigar_str,
                                       r_st, r_en,
                                       q_st, q_en,
                                       strand,
                                       read_seq: str,
                                       std_assignment: str,
                                       stds_ref_dict,
                                       line_print_width=None) -> None:
    ref_seq = stds_ref_dict[std_assignment].upper()
    print()
    print(f"read_id={read_id}; assignment={std_assignment}")
    import re
    parsed_cigar = re.findall(rf'(\d+)([MDNSIX])', cigar_str)
    parsed_cigar = [(int(num), char) for num, char in parsed_cigar]
    ref_seq = ref_seq[r_st: r_en]
    ref_pos = 0
    read_seq = read_seq[q_st: q_en]
    read_pos = 0
    if strand == "-":
        read_seq = mp.revcomp(read_seq)
    top_line = ""
    middle_line = ""
    bottom_line = ""
    for length, code in parsed_cigar:
        if code == "M":  # Map (Read & Ref Match)
            read_map_piece = read_seq[read_pos:read_pos + length]
            ref_map_piece = ref_seq[ref_pos:ref_pos + length]
            perfect_matches = ""
            for index, char in enumerate(read_map_piece):
                try:
                    if char == ref_map_piece[index]:
                        perfect_matches += "|"
                    else:
                        perfect_matches += "•"
                except IndexError:
                    perfect_matches += " "
            top_line += read_map_piece
            middle_line += perfect_matches
            bottom_line += ref_map_piece
            ref_pos += length
            read_pos += length
        elif code == "I":  # Insert (Gap in Ref)
            top_line += read_seq[read_pos:read_pos + length]
            middle_line += " " * length
            bottom_line += " " * length
            read_pos += length
        elif code == "D" or code == "N":  # Delete (Gap in Read)
            top_line += " " * length
            middle_line += " " * length
            bottom_line += ref_seq[ref_pos:ref_pos + length]
            ref_pos += length
    if isinstance(line_print_width, int):
        num_blocks = int(np.ceil(len(top_line) / line_print_width))
        print_blocks = []
        for block_index in range(num_blocks):
            print_blocks.append([
                top_line[block_index * line_print_width:(block_index + 1) * line_print_width],
                middle_line[block_index * line_print_width:(block_index + 1) * line_print_width],
                bottom_line[block_index * line_print_width:(block_index + 1) * line_print_width],
            ])
        for top, mid, bot in print_blocks:
            print()
            print(f"Read: {top}")
            print(f"      {mid}")
            print(f"Ref:  {bot}")
    else:
        print(f"Read: {top_line}", f"      {middle_line}", f"Ref:  {bottom_line}", sep='\n')

Reading fastx entries and assigning to standards: 6it [00:00, 18752.48it/s]


In [5]:
df = raw_df.copy(deep=True)

In [19]:
def mappy_cols_to_cols(mappy_obj_column):
    """
    The main reason to have this as its own script is to handle the errors that come from failed splits (which result from failed maps!)
    
    :param mappy_obj_column: 
    :return: 
    """
    if isinstance(mappy_obj_column, str):
        return_values = mappy_obj_column.split("\t")
    else:
        return_values = [None for _ in range(13)]
    if len(return_values) == 13:
        return return_values
    else:
        print(return_values)

def check_for_perfect_matches(rc_barcode_dict, **row):
    assignment = row['assignment']
    sequence = row['sequence']
    barcode = rc_barcode_dict[assignment]
    perfect_barcode = barcode in sequence
    return perfect_barcode

# Mapping info
tqdm.pandas(desc=f'Extracting mapping information')
df[[
    'q_st',
    'q_en',
    'strand',
    'ctg',
    'ctg_len',
    'r_st',
    'r_en',
    'mlen',
    'blen',
    'mapq',
    'tp', 'ts',
    'cigar',
]] = df.progress_apply(lambda row: mappy_cols_to_cols(row['mappy_hit_obj']), axis=1, result_type='expand')

# Isolate misses
miss_df = df[~df.assignment.isin(['00', '05', '10', '15', '30', '60'])]

# Isolate hits
hit_df = df[df.assignment.isin(['00', '05', '10', '15', '30', '60'])]
hit_df['cleaned_cigar'] = hit_df.cigar.str.rsplit(":", 1).str[1]
hit_df = hit_df.astype({"r_st": int,
                        "r_en": int,
                        "q_st": int,
                        "q_en": int,
                        })

# Perfect barcodes among hits
barcode_dict = {
'00': 'GGTGTTGTT',
'05': 'CGGCAATAA',
'10': 'TAATCGTCC',
'15': 'CCTTCTAGG',
'30': 'ACACACACC',
'60': 'AAGAGGAGG',
}
rc_barcode_dict = {key: mp.revcomp(value) for key, value in barcode_dict.items()}

tqdm.pandas(desc='Checking for perfect barcode matches')
hit_df['perfect_barcode'] = hit_df.progress_apply(lambda row: check_for_perfect_matches(rc_barcode_dict, **row), axis=1)

# Make a merged dataframe that actual has all the contained information spaced out!
new_df = pd.concat([hit_df, miss_df])
new_df.rename(columns={"Unnamed: 0": "read_id"}, inplace=True)
new_df.drop(columns=["mappy_hit_obj"], inplace=True)
new_df.fillna(pd.NA, inplace=True)

# Show hit_df b/c that's the one I really care about lol
hit_df

Extracting mapping information: 100%|██████████| 233693/233693 [00:10<00:00, 22138.41it/s] 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hit_df['cleaned_cigar'] = hit_df.cigar.str.rsplit(":", 1).str[1]
Checking for perfect barcode matches: 100%|██████████| 193100/193100 [00:08<00:00, 22373.64it/s]


                                read_id assignment                                           sequence q_st q_en strand                        ctg ctg_len r_st r_en mlen blen mapq      tp      ts  \
2  ce3360d6-89ac-44fd-8182-00369a35f346         60  TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...  219  723      -  ENO2_finalStandard_60Tail     662  117  653  472  532    1  tp:A:P  ts:A:.   
3  cd9bc58b-1f35-4b34-9462-0c1e13410cde         60  TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAG...   44  563      -  ENO2_finalStandard_60Tail     662  127  662  485  536   60  tp:A:P  ts:A:.   
4  9c62cb37-e358-4e6b-87a6-608324c0d44c         15  TTTTTTTTAGCACAGCAGCACCCTAGAAGGGCTGAGCTGATTTTAC...    8  522      -  ENO2_finalStandard_15Tail     662  127  657  488  530   36  tp:A:P  ts:A:.   
5  089403b0-5366-4327-8bea-212fd9670aa3         60  TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...   58  559      -  ENO2_finalStandard_60Tail     662  141  662  461  518   27  tp:A:P  ts:A:.   
6  cfa66ca

Unnamed: 0.1,Unnamed: 0,assignment,sequence,mappy_hit_obj,q_st,q_en,strand,ctg,ctg_len,r_st,r_en,mlen,blen,mapq,tp,ts,cigar,cleaned_cigar,perfect_barcode
2,ce3360d6-89ac-44fd-8182-00369a35f346,60,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...,219\t723\t-\tENO2_finalStandard_60Tail\t662\t1...,219,723,-,ENO2_finalStandard_60Tail,662,117,653,472,532,1,tp:A:P,ts:A:.,cg:Z:83M1D33M3D9M3D12M1I49M1D12M2I1M1D18M1D7M1...,83M1D33M3D9M3D12M1I49M1D12M2I1M1D18M1D7M1I15M1...,False
3,cd9bc58b-1f35-4b34-9462-0c1e13410cde,60,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAG...,44\t563\t-\tENO2_finalStandard_60Tail\t662\t12...,44,563,-,ENO2_finalStandard_60Tail,662,127,662,485,536,60,tp:A:P,ts:A:.,cg:Z:8M1D4M1D21M2I64M1I65M1D58M1I1M2I23M2D25M1...,8M1D4M1D21M2I64M1I65M1D58M1I1M2I23M2D25M1D2M1D...,True
4,9c62cb37-e358-4e6b-87a6-608324c0d44c,15,TTTTTTTTAGCACAGCAGCACCCTAGAAGGGCTGAGCTGATTTTAC...,8\t522\t-\tENO2_finalStandard_15Tail\t662\t127...,8,522,-,ENO2_finalStandard_15Tail,662,127,657,488,530,36,tp:A:P,ts:A:.,cg:Z:54M3D11M2D50M4D1M2D125M3D1M1D63M1I4M1D5M1...,54M3D11M2D50M4D1M2D125M3D1M1D63M1I4M1D5M1I7M4D...,True
5,089403b0-5366-4327-8bea-212fd9670aa3,60,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...,58\t559\t-\tENO2_finalStandard_60Tail\t662\t14...,58,559,-,ENO2_finalStandard_60Tail,662,141,662,461,518,27,tp:A:P,ts:A:.,cg:Z:29M1D17M6D83M1D2M2D51M2D50M3I9M1D2M1D49M2...,29M1D17M6D83M1D2M2D51M2D50M3I9M1D2M1D49M2D19M3...,True
6,cfa66ca1-8e2e-43b1-bb6c-6391c5f47084,60,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...,49\t592\t-\tENO2_finalStandard_60Tail\t662\t10...,49,592,-,ENO2_finalStandard_60Tail,662,104,662,512,557,5,tp:A:P,ts:A:.,cg:Z:57M1D2M2I2M2D50M1D5M1D31M1D10M8D33M2D29M3...,57M1D2M2I2M2D50M1D5M1D31M1D10M8D33M2D29M3D23M1...,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233688,cdc54287-5724-470c-8459-fe12e96846f5,15,CAACTTTTTTTTTTTTTTTTTTTAGCACAGCACAGCACAGCACCTA...,23\t656\t-\tENO2_finalStandard_15Tail\t662\t30...,23,656,-,ENO2_finalStandard_15Tail,662,30,662,595,631,46,tp:A:P,ts:A:.,cg:Z:80M1D8M1I29M1I5M3D89M1I1M1I45M1D67M1I33M2...,80M1D8M1I29M1I5M3D89M1I1M1I45M1D67M1I33M2I248M...,True
233689,c09242cf-9e03-4169-9fba-9159a2527346,15,TTTTTTTTTTTTTTTAGCACAGCACAGCACAGCACCTAGAAGGGCT...,15\t558\t-\tENO2_finalStandard_15Tail\t662\t11...,15,558,-,ENO2_finalStandard_15Tail,662,117,662,500,542,44,tp:A:P,ts:A:.,cg:Z:45M2D6M1D243M1I5M1I12M2I1M1D14M1I82M1D34M...,45M2D6M1D243M1I5M1I12M2I1M1D14M1I82M1D34M1D75M...,True
233690,e1fe0af7-f616-41fc-ad2b-f182d2a833a4,10,GTTTTAACTGGCAGCAGAGCACGGACGATAGCTGAGCTGAGTCGGT...,11\t470\t-\tENO2_finalStandard_10Tail\t662\t17...,11,470,-,ENO2_finalStandard_10Tail,662,176,653,409,486,10,tp:A:P,ts:A:.,cg:Z:36M1I12M1D21M6D14M1D24M1I19M1D54M1I8M1I6M...,36M1I12M1D21M6D14M1D24M1I19M1D54M1I8M1I6M3D6M1...,False
233691,4782e66e-e8ee-40a3-8b51-5dd078fe9324,05,TTTTTAGCACAGCACAGCACAGCACTTATTGCCATTTGCTGAATGG...,5\t491\t-\tENO2_finalStandard_05Tail\t662\t174...,5,491,-,ENO2_finalStandard_05Tail,662,174,662,447,492,32,tp:A:P,ts:A:.,cg:Z:21M2D54M1I33M1D29M3I2M1I35M3D66M1I36M2D29...,21M2D54M1I33M1D29M3I2M1I35M3D66M1I36M2D29M2I3M...,False


In [20]:
save_path_partial = import_csv.rstrip(".csv")
save_path = f"{get_dt(for_file=True)}_{save_path_partial}.parquet"
print(f"Saving re-merged dataframe to: {save_path}")
new_df.to_parquet(save_path)

Saving re-merged dataframe to: 221205_221109_RNAStds_Nano3P_Assignments.trimmed.parquet


In [11]:
# This code block was what I used before the previous code block to assess perfect barcode matches. It is no longer useful, but might be helpful for future code writing!
"""
for assignment in ['00', '05', '10', '15', '30', '60']:
    # I think something isn't working here? Based on the SampleAlignments file for 00, I should see AT LEAST 14 perfect matches in the 00 list... but this only counts 1!
    # It was b/c the SampleAlignments came out of a tool that reverse complimented them!! so I needed to do that for either the sequences or the standards!! 
    # Damn cDNA!! Fool me once, shame on cDNA. Fool me twice, won't get fooled again.
    print()
    print(assignment)
    print(hit_df[hit_df.assignment == assignment].sequence.str.contains(rc_barcode_dict[assignment], case=False).value_counts(normalize=True))
    # Takeaway for this is that all the standards have 20-45% perfect matches. While this isn't awesome, I think it indicates that the large # of 60 isn't a mapping failure!
hit_df
"""

"\nfor assignment in ['00', '05', '10', '15', '30', '60']:\n    # I think something isn't working here? Based on the SampleAlignments file for 00, I should see AT LEAST 14 perfect matches in the 00 list... but this only counts 1!\n    # It was b/c the SampleAlignments came out of a tool that reverse complimented them!! so I needed to do that for either the sequences or the standards!! \n    # Damn cDNA!! Fool me once, shame on cDNA. Fool me twice, won't get fooled again.\n    print()\n    print(assignment)\n    print(hit_df[hit_df.assignment == assignment].sequence.str.contains(rc_barcode_dict[assignment], case=False).value_counts(normalize=True))\n    # Takeaway for this is that all the standards have 20-45% perfect matches. While this isn't awesome, I think it indicates that the large # of 60 isn't a mapping failure!\nhit_df\n"

In [15]:
# Print some example alignments!!

hit_df[hit_df.assignment == '60'].head(5).apply(lambda row: print_mappy_hit_alignment_for_stds(
    row['Unnamed: 0'],
    row['cleaned_cigar'],
    row['r_st'],
    row['r_en'],
    row['q_st'],
    row['q_en'],
    row['strand'],
    row['sequence'],
    row['assignment'],
    stds_ref_dict,
    line_print_width=175,
),
                                                axis=1)
print("done.")


read_id=ce3360d6-89ac-44fd-8182-00369a35f346; assignment=60

Read: AAGGCCAACCTAGATGTTGTTGACCAAAAGGCCGTCGATGACTTCTTGTTGTTCTTGGATGGTACCGCCAACAAGTCCAAGTT GGTGCTAACGTCATCTTGGGAGGTTCCATGGCC   GCTAGAGCC   GCTGCTGAAAAGAAACGTCCCATTGTACCAACATTTGGCTGAC
      ||||||||||||||||||•••|||||||||||||||||||||||||||||||••||||||||||||||||||||||||||||| ||||||||||••||||||||•|••|||||||||   |||||||||   |||||||||||| ||||||||||||||||||||||||||||||
Ref:  AAGGCCAACCTAGATGTTAAGGACCAAAAGGCCGTCGATGACTTCTTGTTGTCTTTGGATGGTACCGCCAACAAGTCCAAGTTGGGTGCTAACGCTATCTTGGGTGTCTCCATGGCCGCTGCTAGAGCCGCTGCTGCTGAAAAG AACGTCCCATTGTACCAACATTTGGCTGAC

Read: TTGTCTAAGTCCAAGACCT TCCATACGTTTTAAG AAATTCCATTCTTGAACG TTTGAACGGGTGGTTCCCACGCT GCGGTGCTTAAGTTTTGCAAGAATTTATGATTGCTCCAACTGGTGCTAAGACCTTCGCTGAAGCTTTGAGAATTGGTTCCGAAGTTTACCACAACT
      ||||||||||||||||||| ||||||||||||  | •|•||||||||||||||| ||||||| ||||||||||||||| |•|||||||••|•||||||||||||•||||||||||||||||||||||||||||||||||||||••||||||||||||||||||||||||||||||
Ref:  TTGTCTAAGTCCAAGACCTCT

One interesting thing might be looking for perfect matches of barcodes within reads. This would be a way to avoid any partially mixed up reads!

In [12]:
hit_df_groupby = hit_df.groupby("assignment")

grouped_df = hit_df_groupby['assignment'].count().to_frame(name="total_count")
grouped_df['perfect_match_count'] = hit_df_groupby['perfect_barcode'].sum().to_frame(name="perfect_match_count")
grouped_df['mean_r_en'] = hit_df_groupby['r_en'].mean().to_frame(name="mean_r_en")

grouped_df['imperfect_match_count'] = grouped_df.total_count - grouped_df.perfect_match_count
grouped_df = grouped_df.reset_index()
grouped_df

Unnamed: 0,assignment,total_count,perfect_match_count,mean_r_en,imperfect_match_count
0,0,4765,3130,655.155509,1635
1,5,27553,13011,657.35546,14542
2,10,45563,35820,657.645831,9743
3,15,51368,31908,658.331043,19460
4,30,395,0,653.997468,395
5,60,63456,48094,657.595531,15362


In [14]:
import seaborn as sea
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
fig = px.bar(grouped_df, x='assignment',
             y=['perfect_match_count',
                'imperfect_match_count',
                ])
fig.update_layout(height=500, width=700)
fig.update_layout(legend=dict(
    orientation='h',
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01
))
fig.show()

In [17]:
fig = px.box(hit_df.sort_values("assignment")[hit_df.perfect_barcode],
             x='assignment',
             y='r_en',
             range_y=[642,665],
             points="outliers",
             )
fig.update_layout(height=500, width=700)
fig.add_hline(662)
fig.show(renderer="firefox")
fig.write_image(f"{get_dt(for_file=True)}_refEnd_BoxPlot.png")


Boolean Series key will be reindexed to match DataFrame index.



In [204]:
miss_df = df[~df.assignment.isin(['00', '05', '10', '15', '30', '60'])]

In [223]:
new_df = pd.concat([hit_df, miss_df])
new_df.rename(columns={"Unnamed: 0": "read_id"}, inplace=True)
new_df.drop(columns=["mappy_hit_obj"], inplace=True)
new_df.fillna(pd.NA, inplace=True)
print(new_df.head())
print(new_df.tail())

                                read_id assignment                                           sequence q_st q_en strand                        ctg ctg_len r_st r_en mlen blen mapq      tp      ts  \
1  1321805d-cbeb-4725-8122-b8d942775ff0         60  GATACTTCGTTCAGTTACGTATTGCTCTTCCGATCACTTGCCTGTC...  105  324      -  ENO2_finalStandard_60Tail     662  441  662  201  217   60  tp:A:P  ts:A:.   
2  ce3360d6-89ac-44fd-8182-00369a35f346         60  ATTGTACTTCGTTCCATTGTGTGCTCTTCCGATCACTTGCCTGGTG...  121  713      -  ENO2_finalStandard_60Tail     662   29  653  572  617    1  tp:A:P  ts:A:.   
3  cd9bc58b-1f35-4b34-9462-0c1e13410cde         60  AGTATGCTTCGTTCAGTTACGTATTGCTCTTCCGATCGCACGCGCA...  105  722      -  ENO2_finalStandard_60Tail     662   31  662  573  636   36  tp:A:P  ts:A:.   
4  9c62cb37-e358-4e6b-87a6-608324c0d44c         15  AGTACTTCGGTTCAGTTGCCGTACTCTTCCGATCACCTGTCGCTCT...   60  671      -  ENO2_finalStandard_15Tail     662   30  657  578  631    1  tp:A:P  ts:A:.   
5  089403b

In [225]:

print(pd.read_parquet("221107_expanded_221031_RNAStds_Nano3P_Assignments.parquet"))

                                     read_id     assignment                                           sequence   q_st   q_en strand                        ctg ctg_len   r_st   r_en  mlen  blen  \
1       1321805d-cbeb-4725-8122-b8d942775ff0             60  GATACTTCGTTCAGTTACGTATTGCTCTTCCGATCACTTGCCTGTC...  105.0  324.0      -  ENO2_finalStandard_60Tail     662  441.0  662.0   201   217   
2       ce3360d6-89ac-44fd-8182-00369a35f346             60  ATTGTACTTCGTTCCATTGTGTGCTCTTCCGATCACTTGCCTGGTG...  121.0  713.0      -  ENO2_finalStandard_60Tail     662   29.0  653.0   572   617   
3       cd9bc58b-1f35-4b34-9462-0c1e13410cde             60  AGTATGCTTCGTTCAGTTACGTATTGCTCTTCCGATCGCACGCGCA...  105.0  722.0      -  ENO2_finalStandard_60Tail     662   31.0  662.0   573   636   
4       9c62cb37-e358-4e6b-87a6-608324c0d44c             15  AGTACTTCGGTTCAGTTGCCGTACTCTTCCGATCACCTGTCGCTCT...   60.0  671.0      -  ENO2_finalStandard_15Tail     662   30.0  657.0   578   631   
5       089403b0-536

In [None]:
new_df