In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
DATA_PATH = Path(".") / "../../data"
RESULTS_PATH = Path(".") / "../../results"
RESULTS_PATH.mkdir(parents=True, exist_ok=True)

In [3]:
ngs_ground_truth_path = DATA_PATH / "ngs_stratified" / "ngs_sample_human_ground_truth_2_2_2.csv"
regression_results_path = RESULTS_PATH / "ngs_stratified" / "ngs_sample_human_regresion_results.csv"
ngs_input_path = DATA_PATH / "ngs_stratified" / "ngs_sample_human_nt.fasta"


In [4]:
from riot_na.api.api_mp import run_on_file_mp
from riot_na.api.api_mp import GENE_DB_DIR, InputType, Organism

run_on_file_mp(
    db_dir=GENE_DB_DIR,
    input_fasta_path=ngs_input_path,
    result_path=regression_results_path,
    input_type=InputType.NT,
    allowed_species=[Organism.HOMO_SAPIENS],
    extend_alignment=False,
    n_processes=8,
)

100%|██████████| 392857/392857 [05:31<00:00, 1186.82it/s]


In [5]:
from notebooks.utils import base64_decode_series, validation_flags_comparison
import pandas as pd

new_df = pd.read_csv(regression_results_path, index_col=0, engine="pyarrow")
new_df_flags = base64_decode_series(new_df, "additional_validation_flags")
print(len(new_df_flags))

old_df = pd.read_csv(ngs_ground_truth_path, index_col=0, engine="pyarrow") 
old_df_flags = base64_decode_series(old_df, "additional_validation_flags")
print(len(old_df_flags))

validation_flags_comparison(old_df_flags, new_df_flags)

364564
366284


Unnamed: 0,False_old,False_new,False_old-new,True_old,True_new,True_old-new
regions_in_aligned_sequence,0.0,57.0,-57.0,328601.0,328544.0,57.0
regions_aa_in_aligned_sequence_aa,0.0,0.0,0.0,328504.0,328522.0,-18.0
translated_regions_in_aligned_sequence_aa,12594.0,143.0,12451.0,316007.0,328458.0,-12451.0
correct_vj_in_frame,0.0,0.0,0.0,328601.0,328601.0,0.0
cdr3_in_junction,0.0,0.0,0.0,327251.0,327653.0,-402.0
locus_as_in_v_gene,0.0,0.0,0.0,328601.0,328601.0,0.0
v_gene_alignment,0.0,0.0,0.0,328601.0,328601.0,0.0
j_gene_alignment,0.0,0.0,0.0,328601.0,328601.0,0.0
c_gene_alignment,0.0,0.0,0.0,322155.0,322155.0,0.0
no_negative_offsets_inside_v_alignment,413.0,560.0,-147.0,328188.0,328041.0,147.0


In [6]:
# Merge dataframes
merged_df = old_df.merge(new_df, on='sequence_header', how='left', suffixes=('_x', '_y'))
merged_df = merged_df.fillna(False)[(merged_df['complete_vdj_x'] == True) & (merged_df['productive_x'] == True)]

# Define columns to exclude from comparison
exclude_cols = {"sequence_header", "additional_validation_flags", "exc", "c_alignment_start", "c_alignment_end"}

# Get columns to compare (vectorized approach)
compare_cols = [col for col in old_df.columns if col not in exclude_cols]

if compare_cols:
    # Vectorized comparison - do all columns at once
    x_cols = [f"{col}_x" for col in compare_cols]
    y_cols = [f"{col}_y" for col in compare_cols]
    
    # Create comparison dataframe in one operation
    comparison_data = merged_df[x_cols].values == merged_df[y_cols].values
    comparison_df = pd.DataFrame(
        comparison_data,
        columns=[f"{col}_comparison" for col in compare_cols],
        index=merged_df.index
    )
    
    # Concatenate once instead of multiple times
    merged_df = pd.concat([merged_df, comparison_df], axis=1)

# Get comparison columns and compute counts
comparison_columns = [f"{col}_comparison" for col in compare_cols]
if comparison_columns:
    # More efficient counting using value_counts with normalize=False
    comparison_counts = pd.DataFrame({
        col: merged_df[col].value_counts().reindex([False, True], fill_value=0)
        for col in comparison_columns
    }).T
    
    # Display results
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):
    display(comparison_counts)

  merged_df = merged_df.fillna(False)[(merged_df['complete_vdj_x'] == True) & (merged_df['productive_x'] == True)]


Unnamed: 0,False,True
sequence_comparison,153272,171777
numbering_scheme_comparison,0,325049
locus_comparison,210,324839
stop_codon_comparison,490,324559
vj_in_frame_comparison,211,324838
v_frameshift_comparison,22,325027
j_frameshift_comparison,4,325045
productive_comparison,701,324348
rev_comp_comparison,92,324957
complete_vdj_comparison,229,324820


Significant differences explanation:
* sequence, sequence_aa - sequence is truncated to the segment ranges, input sequence is in the query_sequence field 
* v_support - although v_score is not different it depeneds on the len(sequence), so after truncation to the segment ranges it has a different value of len(query) variable so the result changes
* regions _start, _end - regions ranges are related to the sequence, so it's moved by segment_start related to the query_sequence
