In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [71]:
import json
import os
from pathlib import Path

import pandas as pd
from IPython.display import Markdown

from notebooks.utils import base64_decode_series, validation_flags_comparison
from riot_na.data.model import Scheme

In [69]:
# Location of the mmseqs reference results (parquet file).
# The default is the original machine-specific absolute path; set the
# MMSEQS_RESULTS_PATH environment variable to run this notebook on another
# machine without editing the cell.
MMSEQS_RESULTS_PATH = Path(
    os.environ.get(
        "MMSEQS_RESULTS_PATH",
        "/home/bartosz/Documents/analyzer/projects/automation/tasks/numbering/databases/data/ngs_sample_stratified_clean_result/result.parquet",
    )
)

# Fail early with an actionable message instead of a bare reader error.
if not MMSEQS_RESULTS_PATH.is_file():
    raise FileNotFoundError(
        f"mmseqs results not found at {MMSEQS_RESULTS_PATH}; "
        "set the MMSEQS_RESULTS_PATH environment variable to the parquet file."
    )

# Columns read from each results table; reused by the later loading cells.
usecols = ["sequence_header", "v_call", "productive", "additional_validation_flags"]

# Read only the needed columns, index the frame by "sequence_header", and
# parse the JSON text in "additional_validation_flags" with json.loads.
# Built as one chained expression so the frame is never mutated in place.
mmseqs_df = (
    pd.read_parquet(MMSEQS_RESULTS_PATH, columns=usecols)
    .set_index("sequence_header")
    .assign(
        additional_validation_flags=lambda frame: frame["additional_validation_flags"].apply(json.loads)
    )
)

In [72]:
# Directory with the per-scheme riot CSV results: two levels above the
# current working directory, under results/mouse_database_nt.
RESULTS_DIR = Path().absolute().parent.parent.joinpath("results", "mouse_database_nt")

# For every member of Scheme, render a heading, the mmseqs-vs-riot validation
# flag comparison table, and a horizontal rule.
for scheme in Scheme:
    path = RESULTS_DIR.joinpath(f"ngs_sample_clean_{scheme.value}_human.csv")
    # Read only the shared columns; the first of them becomes the index.
    # base64_decode_series comes from notebooks.utils — presumably it decodes
    # the base64-encoded flags column; confirm against the helper.
    df = pd.read_csv(path, usecols=usecols, index_col=0, engine="pyarrow").pipe(
        base64_decode_series, "additional_validation_flags"
    )
    display(Markdown(f"## {scheme.upper()}"))
    display(
        validation_flags_comparison(mmseqs_df, df, df_1_name="mmseqs", df_2_name="riot"),
        Markdown("---"),
    )

## KABAT

Unnamed: 0,False_mmseqs,False_riot,False_mmseqs-riot,True_mmseqs,True_riot,True_mmseqs-riot
regions_in_aligned_sequence,4091.0,,,387307.0,391398.0,-4091.0
regions_aa_in_aligned_sequence_aa,3988.0,,,387410.0,391398.0,-3988.0
translated_regions_in_aligned_sequence_aa,6517.0,,,384881.0,391398.0,-6517.0
correct_vj_in_frame,,,,391398.0,391398.0,0.0
cdr3_in_junction,,,,386691.0,391036.0,-4345.0
locus_as_in_v_gene,,,,391398.0,391398.0,0.0
v_gene_alignment,,,,391398.0,391398.0,0.0
j_gene_alignment,,,,391398.0,391398.0,0.0
c_gene_alignment,,,,261762.0,381639.0,-119877.0
no_negative_offsets_inside_v_alignment,,55.0,,391398.0,391343.0,55.0


---

## CHOTHIA

Unnamed: 0,False_mmseqs,False_riot,False_mmseqs-riot,True_mmseqs,True_riot,True_mmseqs-riot
regions_in_aligned_sequence,4091.0,,,387307.0,391398.0,-4091.0
regions_aa_in_aligned_sequence_aa,3988.0,,,387410.0,391398.0,-3988.0
translated_regions_in_aligned_sequence_aa,6517.0,,,384881.0,391398.0,-6517.0
correct_vj_in_frame,,,,391398.0,391398.0,0.0
cdr3_in_junction,,,,386691.0,390984.0,-4293.0
locus_as_in_v_gene,,,,391398.0,391398.0,0.0
v_gene_alignment,,,,391398.0,391398.0,0.0
j_gene_alignment,,,,391398.0,391398.0,0.0
c_gene_alignment,,,,261762.0,381639.0,-119877.0
no_negative_offsets_inside_v_alignment,,52.0,,391398.0,391346.0,52.0


---

## IMGT

Unnamed: 0,False_mmseqs,False_riot,False_mmseqs-riot,True_mmseqs,True_riot,True_mmseqs-riot
regions_in_aligned_sequence,4091.0,,,387307.0,391398.0,-4091.0
regions_aa_in_aligned_sequence_aa,3988.0,,,387410.0,391398.0,-3988.0
translated_regions_in_aligned_sequence_aa,6517.0,,,384881.0,391398.0,-6517.0
correct_vj_in_frame,,,,391398.0,391398.0,0.0
cdr3_in_junction,,,,386691.0,391323.0,-4632.0
locus_as_in_v_gene,,,,391398.0,391398.0,0.0
v_gene_alignment,,,,391398.0,391398.0,0.0
j_gene_alignment,,,,391398.0,391398.0,0.0
c_gene_alignment,,,,261762.0,381639.0,-119877.0
no_negative_offsets_inside_v_alignment,,47.0,,391398.0,391351.0,47.0


---

## MARTIN

Unnamed: 0,False_mmseqs,False_riot,False_mmseqs-riot,True_mmseqs,True_riot,True_mmseqs-riot
regions_in_aligned_sequence,4091.0,,,387307.0,391398.0,-4091.0
regions_aa_in_aligned_sequence_aa,3988.0,,,387410.0,391398.0,-3988.0
translated_regions_in_aligned_sequence_aa,6517.0,,,384881.0,391398.0,-6517.0
correct_vj_in_frame,,,,391398.0,391398.0,0.0
cdr3_in_junction,,,,386691.0,390984.0,-4293.0
locus_as_in_v_gene,,,,391398.0,391398.0,0.0
v_gene_alignment,,,,391398.0,391398.0,0.0
j_gene_alignment,,,,391398.0,391398.0,0.0
c_gene_alignment,,,,261762.0,381639.0,-119877.0
no_negative_offsets_inside_v_alignment,,52.0,,391398.0,391346.0,52.0


---

# IMGT

In [4]:
# Compare mmseqs and riot validation flags for the IMGT scheme.
# NOTE(review): RIOT_RESULTS_DIR is not defined in any cell visible here and
# this cell's execution count predates the cells above — confirm it still
# runs after Restart Kernel -> Run All.
scheme = "imgt"
riot_results_path = RIOT_RESULTS_DIR / f"ngs_sample_clean_{scheme}.csv"
riot_df = pd.read_csv(
    riot_results_path,
    usecols=usecols,
    index_col=0,
    engine="pyarrow",
).pipe(base64_decode_series, ["additional_validation_flags"])
validation_flags_comparison(mmseqs_df, riot_df)

Unnamed: 0,False_mmseqs,False_riot,False_mmseqs-riot,True_mmseqs,True_riot,True_mmseqs-riot
regions_in_aligned_sequence,4094.0,,,387524.0,391618.0,-4094.0
regions_aa_in_aligned_sequence_aa,3990.0,,,387628.0,391618.0,-3990.0
translated_regions_in_aligned_sequence_aa,6523.0,,,385095.0,391618.0,-6523.0
correct_vj_in_frame,,,,391618.0,391618.0,0.0
cdr3_in_junction,,,,386905.0,391542.0,-4637.0
locus_as_in_v_gene,,,,391618.0,391618.0,0.0
v_gene_alignment,,,,391618.0,391618.0,0.0
j_gene_alignment,,,,391618.0,391618.0,0.0
c_gene_alignment,,,,261806.0,359001.0,-97195.0
no_negative_offsets_inside_v_alignment,,39.0,,391618.0,391579.0,39.0


# Kabat

In [5]:
# Compare mmseqs and riot validation flags for the Kabat scheme.
# NOTE(review): RIOT_RESULTS_DIR is not defined in any cell visible here and
# this cell's execution count predates the cells above — confirm it still
# runs after Restart Kernel -> Run All.
scheme = "kabat"
riot_results_path = RIOT_RESULTS_DIR / f"ngs_sample_clean_{scheme}.csv"
riot_df = pd.read_csv(
    riot_results_path,
    usecols=usecols,
    index_col=0,
    engine="pyarrow",
).pipe(base64_decode_series, ["additional_validation_flags"])
validation_flags_comparison(mmseqs_df, riot_df)

Unnamed: 0,False_mmseqs,False_riot,False_mmseqs-riot,True_mmseqs,True_riot,True_mmseqs-riot
regions_in_aligned_sequence,4094.0,,,387524.0,391618.0,-4094.0
regions_aa_in_aligned_sequence_aa,3990.0,,,387628.0,391618.0,-3990.0
translated_regions_in_aligned_sequence_aa,6523.0,,,385095.0,391618.0,-6523.0
correct_vj_in_frame,,,,391618.0,391618.0,0.0
cdr3_in_junction,,,,386905.0,391265.0,-4360.0
locus_as_in_v_gene,,,,391618.0,391618.0,0.0
v_gene_alignment,,,,391618.0,391618.0,0.0
j_gene_alignment,,,,391618.0,391618.0,0.0
c_gene_alignment,,,,261806.0,359001.0,-97195.0
no_negative_offsets_inside_v_alignment,,36.0,,391618.0,391582.0,36.0


# Chothia

In [6]:
# Compare mmseqs and riot validation flags for the Chothia scheme.
# NOTE(review): RIOT_RESULTS_DIR is not defined in any cell visible here and
# this cell's execution count predates the cells above — confirm it still
# runs after Restart Kernel -> Run All.
scheme = "chothia"
riot_results_path = RIOT_RESULTS_DIR / f"ngs_sample_clean_{scheme}.csv"
riot_df = pd.read_csv(
    riot_results_path,
    usecols=usecols,
    index_col=0,
    engine="pyarrow",
).pipe(base64_decode_series, ["additional_validation_flags"])
validation_flags_comparison(mmseqs_df, riot_df)

Unnamed: 0,False_mmseqs,False_riot,False_mmseqs-riot,True_mmseqs,True_riot,True_mmseqs-riot
regions_in_aligned_sequence,4094.0,,,387524.0,391618.0,-4094.0
regions_aa_in_aligned_sequence_aa,3990.0,,,387628.0,391618.0,-3990.0
translated_regions_in_aligned_sequence_aa,6523.0,,,385095.0,391618.0,-6523.0
correct_vj_in_frame,,,,391618.0,391618.0,0.0
cdr3_in_junction,,,,386905.0,391213.0,-4308.0
locus_as_in_v_gene,,,,391618.0,391618.0,0.0
v_gene_alignment,,,,391618.0,391618.0,0.0
j_gene_alignment,,,,391618.0,391618.0,0.0
c_gene_alignment,,,,261806.0,359001.0,-97195.0
no_negative_offsets_inside_v_alignment,,52.0,,391618.0,391566.0,52.0


# Martin

In [7]:
# Compare mmseqs and riot validation flags for the Martin scheme.
# NOTE(review): RIOT_RESULTS_DIR is not defined in any cell visible here and
# this cell's execution count predates the cells above — confirm it still
# runs after Restart Kernel -> Run All.
scheme = "martin"
riot_results_path = RIOT_RESULTS_DIR / f"ngs_sample_clean_{scheme}.csv"
riot_df = pd.read_csv(
    riot_results_path,
    usecols=usecols,
    index_col=0,
    engine="pyarrow",
).pipe(base64_decode_series, ["additional_validation_flags"])
validation_flags_comparison(mmseqs_df, riot_df)

Unnamed: 0,False_mmseqs,False_riot,False_mmseqs-riot,True_mmseqs,True_riot,True_mmseqs-riot
regions_in_aligned_sequence,4094.0,,,387524.0,391618.0,-4094.0
regions_aa_in_aligned_sequence_aa,3990.0,,,387628.0,391618.0,-3990.0
translated_regions_in_aligned_sequence_aa,6523.0,,,385095.0,391618.0,-6523.0
correct_vj_in_frame,,,,391618.0,391618.0,0.0
cdr3_in_junction,,,,386905.0,391213.0,-4308.0
locus_as_in_v_gene,,,,391618.0,391618.0,0.0
v_gene_alignment,,,,391618.0,391618.0,0.0
j_gene_alignment,,,,391618.0,391618.0,0.0
c_gene_alignment,,,,261806.0,359001.0,-97195.0
no_negative_offsets_inside_v_alignment,,52.0,,391618.0,391566.0,52.0
