In [1]:
# Load (or reload) IPython's autoreload extension; mode 2 re-imports modules
# before each cell executes, so edits to imported project code are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
# Input data and generated results are resolved relative to the kernel's
# working directory, two levels up.
DATA_PATH = Path("../../data")
RESULTS_PATH = Path("../../results")

# Make sure the results directory exists; a no-op when it is already there.
RESULTS_PATH.mkdir(exist_ok=True, parents=True)

In [3]:
# Locations for the stratified NGS sample: the nucleotide FASTA input, the
# stored ground-truth annotations, and the CSV this run will produce.
NGS_SUBDIR = "ngs_stratified"

ngs_input_path = DATA_PATH / NGS_SUBDIR / "ngs_sample_human_nt.fasta"
ngs_ground_truth_path = DATA_PATH / NGS_SUBDIR / "ngs_sample_human_ground_truth.csv"

# NOTE(review): the file name is spelled "regresion" (sic). It is kept as-is so
# that any previously generated result file still resolves - confirm before renaming.
regression_results_path = RESULTS_PATH / NGS_SUBDIR / "ngs_sample_human_regresion_results.csv"

# Only RESULTS_PATH itself is created earlier in the notebook; create the
# dataset subdirectory too so writing the results CSV cannot fail on a missing
# parent directory.
regression_results_path.parent.mkdir(parents=True, exist_ok=True)


In [4]:
from riot_na.api.api_mp import GENE_DB_DIR, InputType, Organism, run_on_file_mp

# Annotate the nucleotide FASTA with the human gene database and write the
# result to `regression_results_path`, using 8 worker processes.
run_on_file_mp(
    input_fasta_path=ngs_input_path,
    result_path=regression_results_path,
    db_dir=GENE_DB_DIR,
    allowed_species=[Organism.HOMO_SAPIENS],
    input_type=InputType.NT,
    n_processes=8,
    extend_alignment=False,
)

100%|██████████| 392857/392857 [05:35<00:00, 1170.32it/s]


In [5]:
import pandas as pd
from notebooks.utils import base64_decode_series, validation_flags_comparison


def _load_with_decoded_flags(csv_path):
    """Read an annotation CSV and decode its validation-flags column.

    Prints the number of rows in the decoded frame and returns a tuple of
    (frame as read from disk, result of `base64_decode_series` for the
    "additional_validation_flags" column).
    """
    frame = pd.read_csv(csv_path, index_col=0, engine="pyarrow")
    decoded = base64_decode_series(frame, "additional_validation_flags")
    print(len(decoded))
    return frame, decoded


# New results are loaded first, then the stored ground truth.
new_df, new_df_flags = _load_with_decoded_flags(regression_results_path)
old_df, old_df_flags = _load_with_decoded_flags(ngs_ground_truth_path)

# Tabulate per-flag counts for old vs. new (rendered as the cell output).
validation_flags_comparison(old_df_flags, new_df_flags)

366228
366412


Unnamed: 0,False_old,False_new,False_old-new,True_old,True_new,True_old-new
regions_in_aligned_sequence,0.0,37.0,-37.0,327957.0,327920.0,37.0
regions_aa_in_aligned_sequence_aa,0.0,0.0,0.0,327957.0,327957.0,0.0
translated_regions_in_aligned_sequence_aa,0.0,3.0,-3.0,327957.0,327954.0,3.0
correct_vj_in_frame,0.0,0.0,0.0,327957.0,327957.0,0.0
cdr3_in_junction,0.0,0.0,0.0,327571.0,327606.0,-35.0
locus_as_in_v_gene,0.0,0.0,0.0,327957.0,327957.0,0.0
v_gene_alignment,0.0,0.0,0.0,327957.0,327957.0,0.0
j_gene_alignment,0.0,0.0,0.0,327957.0,327957.0,0.0
c_gene_alignment,0.0,0.0,0.0,321506.0,321505.0,1.0
no_negative_offsets_inside_v_alignment,298.0,300.0,-2.0,327659.0,327657.0,2.0


In [6]:
def _regions_check_failed(flags) -> bool:
    """Tell whether a decoded flags cell reports a failed regions check.

    Returns True only for a non-empty dict whose 'regions_in_aligned_sequence'
    entry compares equal to False (a missing key counts as False). Null cells,
    empty dicts and any non-dict value return False instead of raising, so the
    predicate is safe to apply to every row.
    """
    if not isinstance(flags, dict) or not flags:
        return False
    # `== False` is deliberate: it also matches numpy booleans and 0, but not None.
    return bool(flags.get('regions_in_aligned_sequence', False) == False)  # noqa: E712


# Keep only rows with non-empty flags where `regions_in_aligned_sequence` failed.
# One type-checking predicate is used because `&` evaluates every operand for
# all rows, so a separate `.notna()` mask cannot protect a `.get()` call from
# null cells.
failed_regions_mask = (
    new_df_flags['additional_validation_flags'].apply(_regions_check_failed).astype(bool)
)
new_df_flags_filtered = new_df_flags[failed_regions_mask]

# Restrict to productive sequences, then decode the residue mapping column.
new_df_flags_filtered = new_df_flags_filtered.query('productive == True')
new_df_flags_filtered = base64_decode_series(new_df_flags_filtered, "scheme_residue_mapping")

# Show the first matching record as a left-aligned vertical table.
new_df_flags_filtered.head(1).T.style.set_properties(**{'text-align': 'left'})

sequence_header,SRR7771923.535638
sequence,ACCCTGTCCCTCACCTGGGCTGCCTGGTGGCTCCATCACGAGTTACTTCTGGAGCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGGGTACATCTATCACAGTGGGAGCACCGACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCTCGAATGAGTTCTCCCTGAAGCTGACGTCTGTGACCGCTGCGGACACGGCCGTGTATTACTGTGCGAGGACAAATAGTATTCGGGCCTATGACTTCTGGGGCCGGGGAAGTCTGGTCATCGTCTCCGCAGCCTCCACCAAGGGCCCATCCGTCTTCCCCCTGGCGC
query_sequence,ACCCTGTCCCTCACCTGGGCTGCCTGGTGGCTCCATCACGAGTTACTTCTGGAGCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGGGTACATCTATCACAGTGGGAGCACCGACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCTCGAATGAGTTCTCCCTGAAGCTGACGTCTGTGACCGCTGCGGACACGGCCGTGTATTACTGTGCGAGGACAAATAGTATTCGGGCCTATGACTTCTGGGGCCGGGGAAGTCTGGTCATCGTCTCCGCAGCCTCCACCAAGGGCCCATCCGTCTTCCCCCTGGCGC
numbering_scheme,imgt
locus,igh
stop_codon,False
vj_in_frame,True
v_frameshift,True
j_frameshift,False
productive,True
rev_comp,False


In [1]:
from riot_na import Organism, Scheme, create_riot_nt
from riot_na.config import GENE_DB_DIR

# Nucleotide sequence used for a single-sequence check with IMGT numbering.
heavy_chain_nt = "CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGTCCTCGGTGAAGGTCTCCTGCAAGGCTTCTGGAGGCACCTTCAGCAGCTATGCTATCAGCTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGAGGGATCATCCCTATCTTTGGTACAGCAAACTACGCACAGAAGTTCCAGGGCAGAGTCACGATTACCGCGGACGAATCCACGAGCACAGCCTACATGGAGCTGAGCAGCCTGAGATCTGAGGACACGGCCGTGTATTACTGTGCGAGAGTCGGGGAGAGGGTAGTGGGAGCTACTTTCCTTGGGGGGTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCGGGGAGTGCATCCGCCCCAACCCTTT"

riot_na = create_riot_nt(db_dir=GENE_DB_DIR, allowed_species=[Organism.HOMO_SAPIENS])

# The header is left empty; `return_all_domains=True` yields an iterable of results.
airr_heavy = riot_na.run_on_sequence(
    "",
    heavy_chain_nt,
    scheme=Scheme.IMGT,
    return_all_domains=True,
)

# Dump every attribute of each result, then its validation flags on their own line.
for entry in airr_heavy:
    print(vars(entry))
    print(entry.additional_validation_flags)

{'sequence_header': '', 'sequence': 'CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGTCCTCGGTGAAGGTCTCCTGCAAGGCTTCTGGAGGCACCTTCAGCAGCTATGCTATCAGCTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGAGGGATCATCCCTATCTTTGGTACAGCAAACTACGCACAGAAGTTCCAGGGCAGAGTCACGATTACCGCGGACGAATCCACGAGCACAGCCTACATGGAGCTGAGCAGCCTGAGATCTGAGGACACGGCCGTGTATTACTGTGCGAGAGTCGGGGAGAGGGTAGTGGGAGCTACTTTCCTTGGGGGGTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCGGGGAGTGCATCCGCCCCAACCCTTT', 'query_sequence': 'CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGTCCTCGGTGAAGGTCTCCTGCAAGGCTTCTGGAGGCACCTTCAGCAGCTATGCTATCAGCTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGAGGGATCATCCCTATCTTTGGTACAGCAAACTACGCACAGAAGTTCCAGGGCAGAGTCACGATTACCGCGGACGAATCCACGAGCACAGCCTACATGGAGCTGAGCAGCCTGAGATCTGAGGACACGGCCGTGTATTACTGTGCGAGAGTCGGGGAGAGGGTAGTGGGAGCTACTTTCCTTGGGGGGTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCGGGGAGTGCATCCGCCCCAACCCTTT', 'numbering_scheme': 'imgt', 'locus': 'igh', 'stop_codon': False, 'vj_in_frame': True, 'v_frameshift': False,

In [9]:
# Lift pandas' row, column and column-width display limits for this session.
for display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# The two mapping columns are dropped from the view; show the first filtered
# record as a left-aligned vertical table.
mapping_columns = ['positional_scheme_mapping', 'scheme_residue_mapping']
first_record = new_df_flags_filtered.drop(columns=mapping_columns).head(1).T
first_record.style.set_properties(**{'text-align': 'left'})

sequence_header,SRR7771923.535638
sequence,ACCCTGTCCCTCACCTGGGCTGCCTGGTGGCTCCATCACGAGTTACTTCTGGAGCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGGGTACATCTATCACAGTGGGAGCACCGACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCTCGAATGAGTTCTCCCTGAAGCTGACGTCTGTGACCGCTGCGGACACGGCCGTGTATTACTGTGCGAGGACAAATAGTATTCGGGCCTATGACTTCTGGGGCCGGGGAAGTCTGGTCATCGTCTCCGCAGCCTCCACCAAGGGCCCATCCGTCTTCCCCCTGGCGC
query_sequence,ACCCTGTCCCTCACCTGGGCTGCCTGGTGGCTCCATCACGAGTTACTTCTGGAGCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGGGTACATCTATCACAGTGGGAGCACCGACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCTCGAATGAGTTCTCCCTGAAGCTGACGTCTGTGACCGCTGCGGACACGGCCGTGTATTACTGTGCGAGGACAAATAGTATTCGGGCCTATGACTTCTGGGGCCGGGGAAGTCTGGTCATCGTCTCCGCAGCCTCCACCAAGGGCCCATCCGTCTTCCCCCTGGCGC
numbering_scheme,imgt
locus,igh
stop_codon,False
vj_in_frame,True
v_frameshift,True
j_frameshift,False
productive,True
rev_comp,False


In [10]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Compare V gene calls only for sequences that are productive with a complete VDJ.
productive_complete_vdj = 'productive == True and complete_vdj == True'
new_df_productive = new_df.query(productive_complete_vdj)
old_df_productive = old_df.query(productive_complete_vdj)

# Left join on the index: every new row is kept, and rows with no match in the
# old productive set get NaN in `v_call_old`.
comparison_df = new_df_productive[["v_call"]].join(
    old_df_productive[["v_call"]], how="left", lsuffix="_new", rsuffix="_old"
)
comparison_df['same_v_call'] = comparison_df['v_call_new'] == comparison_df['v_call_old']

# A NaN `v_call_old` always compares unequal, so `same_v_call == False` mixes
# "no old call to compare against" with "the call actually changed". Report the
# two cases separately so the table below can be read correctly.
no_old_v_call = comparison_df['v_call_old'].isna()
changed_v_call = ~comparison_df['same_v_call'] & ~no_old_v_call
print(f"rows without an old v_call to compare against: {int(no_old_v_call.sum())}")
print(f"rows whose v_call differs between old and new: {int(changed_v_call.sum())}")

comparison_df.query("same_v_call == False")

Unnamed: 0_level_0,v_call_new,v_call_old,same_v_call
sequence_header,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SRR13385912.4330924,IGHV1-69*01,,False
SRR7771916.41045,IGHV3-53*01,,False
SRR7771938.210814,IGHV3-64D*08,,False
SRR7771943.600713,IGHV3-9*04,,False
SRR13385916.5551088,IGHV4-34*01,,False
...,...,...,...
SRR2925334.34745,IGHV6-1*01,,False
SRR5471276.93555,IGLV1-44*01,,False
SRR5471269.39356,IGKV1-12*01,,False
SRR3990900.144867,IGHV3-74*01,,False
