In [None]:
# import ogrdb json file 
# convert to fasta
# translate to amino acids
# align to existing genes database
  

# riot poc


In [105]:
# OGRDB germline-set download endpoints for Mus musculus.
OGRDB_MOUSE_BASE = "https://ogrdb.airr-community.org/download_germline_set/Mus%20musculus"

# Light-chain J genes are shared between strains, so the "all strains" sets are used.
igkj_all = f"{OGRDB_MOUSE_BASE}/IGKJ%20(all%20strains)/published/ungapped"
iglj_all = f"{OGRDB_MOUSE_BASE}/IGLJ%20(all%20strains)/published/ungapped"

# C57BL/6(J)
# The IGH set mixes V, D and J alleles in a single file, so the AIRR (JSON) export
# is fetched instead of FASTA: later cells split the alleles on its "sequence_type" field.
igh_airr = f"{OGRDB_MOUSE_BASE}/C57BL%25252f6/C57BL%25252f6%20IGH/published/airr"
igkv = f"{OGRDB_MOUSE_BASE}/C57BL%25252f6J/C57BL%25252f6J%20IGKV/published/ungapped"
iglv = f"{OGRDB_MOUSE_BASE}/C57BL%25252f6J/C57BL%25252f6J%20IGLV/published/ungapped"

# c_genes =""


In [106]:
import requests
import os

# Directory that receives the raw OGRDB downloads; later cells read their inputs from it.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
data_path = "/home/pawel/workspace/riot_na/notebooks/data_processing/data/raw/"

# Create the directory (including parents) if it does not exist yet.
os.makedirs(data_path, exist_ok=True)

# download files
def download_file(url, path, timeout=60):
    """Download ``url`` and store the response body at ``path``.

    Args:
        url: Address to fetch with an HTTP GET request.
        path: Destination file path; an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up, passed
            straight to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Without this check an error page would be written to disk and
            later parsed as if it were FASTA/JSON data.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly before touching the destination file.
    response.raise_for_status()
    with open(path, "wb") as output_file:
        output_file.write(response.content)

# Fetch every OGRDB file into data_path, in the same order as before:
# the shared light-chain J sets first, then the C57BL/6 IGH / IGKV / IGLV sets.
for source_url, file_name in (
    (igkj_all, "igkj.fasta"),
    (iglj_all, "iglj.fasta"),
    (igh_airr, "igh_airr.json"),
    (igkv, "igkv.fasta"),
    (iglv, "iglv.fasta"),
):
    download_file(source_url, data_path + file_name)

In [15]:
# first we want to split the sequences 
# gene_db/v_genes/organism.fasta
# gene_db/d_genes/organism/igh.fasta
# gene_db/j_genes/organism/igh.fasta
# gene_db/j_genes/organism/igk.fasta
# gene_db/j_genes/organism/igl.fasta

# first we copy V genes
# v gene header has the following format: >{allele_name}    {locus} {reading_frame} {species}
# we need to specify organism as custom
# reading frame is always 0 in this case
# >IGHV4-28*07	IGH	0	HOMO_SAPIENS

# IGHV alleles
# read file to dict (one big nested json)
import json
from Bio import SeqIO

# Load the IGH AIRR export (one nested JSON document) downloaded earlier.
with open(data_path + "igh_airr.json", "r") as airr_file:
    igh_data = json.load(airr_file)

database_path = "/home/pawel/workspace/riot_na/notebooks/data_processing/data/gene_db/"
v_genes_path = os.path.join(database_path, "v_genes")
os.makedirs(v_genes_path, exist_ok=True)

# All nucleotide V alleles (heavy, kappa, lambda) go into a single custom.fasta.
# Header layout: >{allele_name}\t{locus}\t{reading_frame}\t{species}; the reading
# frame is written as 0 and the species as CUSTOM for every record.
with open(os.path.join(v_genes_path, "custom.fasta"), "w") as v_genes_file:
    # IGHV: keep only the entries whose sequence_type is "V" in the AIRR germline set.
    for allele_data in igh_data["GermlineSet"][0]["allele_descriptions"]:
        name = allele_data["label"]
        sequence = allele_data["coding_sequence"]
        if allele_data["sequence_type"] != "V":
            continue
        v_genes_file.write(f">{name}\tIGH\t0\tCUSTOM\n{sequence}\n")

    # IGKV and IGLV: the downloaded FASTA files are rewritten with the new header.
    for locus_label, fasta_name in (("IGK", "igkv.fasta"), ("IGL", "iglv.fasta")):
        for record in SeqIO.parse(data_path + fasta_name, "fasta"):
            v_genes_file.write(f">{record.id}\t{locus_label}\t0\tCUSTOM\n{record.seq}\n")




In [57]:
from Bio.Seq import Seq
import os
import json
from Bio import SeqIO

# Re-establish the raw-data location and reload the IGH AIRR export so the
# following cells can run without re-executing the download/V-gene cells.
data_path = "/home/pawel/workspace/riot_na/notebooks/data_processing/data/raw/"

with open(data_path + "igh_airr.json", "r") as airr_file:
    igh_data = json.load(airr_file)

In [82]:



def translate(query_sequence: str, coding_frame: int) -> str:
    """Translate a nucleotide sequence to amino acids in the given frame.

    Args:
        query_sequence: Nucleotide sequence to translate.
        coding_frame: Number of leading nucleotides to skip (0, 1 or 2).

    Returns:
        The translation of the in-frame, whole-codon part of the sequence,
        as produced by Biopython's ``Seq.translate(gap=".")``.

    Raises:
        AssertionError: If ``coding_frame`` is not 0, 1 or 2.
    """
    assert coding_frame in [0, 1, 2]

    in_frame = query_sequence[coding_frame:]
    # Drop a trailing incomplete codon to avoid BioPython's "Partial codon" warning.
    whole_codons_len = len(in_frame) - len(in_frame) % 3
    return str(Seq(in_frame[:whole_codons_len]).translate(gap="."))



In [10]:


aa_database_path = "/home/pawel/workspace/riot_na/notebooks/data_processing/data/gene_db/aa_genes"
aa_v_genes_path = os.path.join(aa_database_path, "v_genes")
os.makedirs(aa_v_genes_path, exist_ok=True)

# Amino-acid V alleles for all three loci go into a single custom.fasta.
# Header layout: >{allele_name}\t{locus}\t{species}; every sequence is translated in frame 0.
with open(os.path.join(aa_v_genes_path, "custom.fasta"), "w") as v_genes_file:
    # IGHV: only entries with sequence_type "V" are translated. D and J alleles
    # are skipped before translation, so they cost nothing and cannot raise here.
    for allele_data in igh_data["GermlineSet"][0]["allele_descriptions"]:
        if allele_data["sequence_type"] != "V":
            continue
        name = allele_data["label"]
        translated_sequence = translate(allele_data["coding_sequence"], 0)
        v_genes_file.write(f">{name}\tIGH\tCUSTOM\n{translated_sequence}\n")

    # IGKV and IGLV: translate each record of the downloaded FASTA files.
    for locus_label, fasta_name in (("IGK", "igkv.fasta"), ("IGL", "iglv.fasta")):
        for record in SeqIO.parse(data_path + fasta_name, "fasta"):
            translated_sequence = translate(str(record.seq), 0)
            v_genes_file.write(f">{record.id}\t{locus_label}\tCUSTOM\n{translated_sequence}\n")




In [1]:
# infer scheme mappings for v genes
# lets start with alignment of aa sequence to the existing database

from riot_na.data.model import Organism
from riot_na.config import GENE_DB_DIR

from riot_na.alignment.gene_aligner import create_aa_v_gene_aligner


# Reference species whose existing germline databases the custom genes are aligned against.
allowed_species = [Organism.HOMO_SAPIENS, Organism.MUS_MUSCULUS]

# Deduplicated amino-acid gene database located under the riot_na GENE_DB_DIR.
aa_genes_dir = GENE_DB_DIR / "gene_db" / "aa_genes_deduplicated"
# Amino-acid V-gene aligner built over the species listed above.
v_aligner = create_aa_v_gene_aligner(allowed_species=allowed_species, aa_genes_dir=aa_genes_dir)


In [2]:
# Proof of concept: align one translated custom V gene against the existing database
# and print the resulting alignment entry.
sample_v_gene_aa = "QVTLKESGPGILQPSQTLSLTCSFSGFSLSTSNMGIGWIRQPSGKGLEWLAHIWWNDDKYYNPSLKSRLTISKDTSNNQVFLKITSVDTADTATYYCAQI"
v_aln = v_aligner.align(sample_v_gene_aa)
print(v_aln)

AlignmentEntryAA(target_id='IGHV-4TP6', alignment_score=207.94997184789176, seq_identity=1.0, e_value=4.334877459455786e-154, q_start=0, q_end=100, t_start=0, t_end=100, cigar='100M', species=<Organism.MUS_MUSCULUS: 'mouse'>, locus=<Locus.IGH: 'igh'>, q_seq='QVTLKESGPGILQPSQTLSLTCSFSGFSLSTSNMGIGWIRQPSGKGLEWLAHIWWNDDKYYNPSLKSRLTISKDTSNNQVFLKITSVDTADTATYYCAQI', t_seq='QVTLKESGPGILQPSQTLSLTCSFSGFSLSTSNMGIGWIRQPSGKGLEWLAHIWWNDDKYYNPSLKSRLTISKDTSNNQVFLKITSVDTADTATYYCAQI')


In [3]:
from riot_na.data.scheme_mapping_facade import SchemeMappingFacade
from riot_na.data.model import Scheme

# Look up the IMGT scheme mapping of the gene the sample aligned to
# (keyed by the alignment's species and target id).
v_scheme_mapping_facade = SchemeMappingFacade(Scheme.IMGT, allowed_species, GENE_DB_DIR)
v_target_scheme_mapping = v_scheme_mapping_facade.get_mapping(v_aln.species, v_aln.target_id)
# Last expression of the cell: shown as the cell output.
v_target_scheme_mapping

'MMMMMMMMMDMMMMMMMMMMMMMMMMMMMMMDDMMMMMMMMMMMMMMMMMMMMMMMMMMDDDMMMMMMMMMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM'

In [34]:
# merge alignments
from riot_na.alignment.alignment_utils import unfold_cigar
from riot_na.schemes.scheme_alignment import force_n_terminus_matches, force_c_terminus_matches
from riot_na.schemes.collapse_alignment import collapse_alignment_str
from riot_na.schemes.scheme_alignment import force_n_terminus_del_ins

from riot_na.schemes.scheme_alignment import _merge_cigars


# Combine the query->gene alignment with the gene->scheme mapping to obtain a
# query->scheme alignment string for the sample sequence.
# NOTE(review): the exact semantics of the riot_na helpers below are not visible here.
fixed_v_aln = force_n_terminus_matches(v_aln)

# Expand the CIGAR into a per-position string, then post-process it with the riot_na helpers.
v_query_gene_alignment_str = unfold_cigar(fixed_v_aln.cigar)
v_query_gene_alignment_str = collapse_alignment_str(v_query_gene_alignment_str)
v_query_gene_alignment_str = force_n_terminus_del_ins(fixed_v_aln, v_query_gene_alignment_str)

v_query_scheme_alignment_str = _merge_cigars(v_query_gene_alignment_str, v_target_scheme_mapping)
# Bare expression in the middle of a cell: has no effect and is not displayed.
v_query_scheme_alignment_str

# verify the alignment spans the whole query sequence, otherwise raise an exception
# (every query residue must be accounted for by an "M" or an "I" position)
assert v_query_scheme_alignment_str.count("M") + v_query_scheme_alignment_str.count("I") == len(sample_v_gene_aa), "Inferred alignment does not span the whole query sequence"


In [51]:

def infer_v_gene_scheme_mapping(aa_sequence, scheme: Scheme):
    """Infer a numbering-scheme mapping for a custom amino-acid V gene.

    The sequence is aligned against the existing human and mouse amino-acid
    V genes. The alignment covering the longest stretch of the query is
    selected (ties broken by the lowest e-value), and its alignment string is
    merged with the scheme mapping of the matched reference gene.

    Args:
        aa_sequence: Amino-acid sequence of the custom V gene.
        scheme: Numbering scheme whose mapping should be inferred.

    Returns:
        The query-to-scheme alignment string, or ``None`` when the sequence
        could not be aligned to any reference gene.
    """
    allowed_species = [Organism.HOMO_SAPIENS, Organism.MUS_MUSCULUS]

    aa_genes_dir = GENE_DB_DIR / "gene_db" / "aa_genes_deduplicated"
    v_aligner = create_aa_v_gene_aligner(allowed_species=allowed_species, aa_genes_dir=aa_genes_dir)

    # The private prefilter/align steps are used so that *all* candidate
    # alignments are available, not just the aligner's own best pick.
    prefiltering_result = v_aligner._prefilter(aa_sequence)
    alignments = v_aligner._align_sequences(aa_sequence, prefiltering_result)

    if not alignments:
        print(f"Could not align sequence: {aa_sequence}")
        print("Could not find any alignments")
        return None

    # Select by covered query length (DESC), then by e-value (ASC).
    v_aln = None
    longest_aln = None

    for aln in alignments:
        fixed_aln = force_n_terminus_matches(aln)
        fixed_aln = force_c_terminus_matches(aa_sequence, aln.t_seq, fixed_aln)

        aln_len = fixed_aln.q_end - fixed_aln.q_start
        if longest_aln is None or aln_len > longest_aln:
            longest_aln = aln_len
            v_aln = aln
        elif aln_len == longest_aln and aln.e_value < v_aln.e_value:
            # Equal coverage: prefer the statistically stronger hit. The
            # previous condition repeated the branch above and never ran.
            v_aln = aln

    if longest_aln < len(aa_sequence):
        # Only a warning: the mapping is still inferred from the best partial alignment.
        # NOTE(review): confirm whether partially covered genes should be skipped instead.
        print(f"Could not align sequence: {aa_sequence}")
        print("Could not find alignment covering query sequence")

    v_scheme_mapping_facade = SchemeMappingFacade(scheme, allowed_species, GENE_DB_DIR)
    v_target_scheme_mapping = v_scheme_mapping_facade.get_mapping(v_aln.species, v_aln.target_id)

    fixed_aln = force_n_terminus_matches(v_aln)
    fixed_aln = force_c_terminus_matches(aa_sequence, v_aln.t_seq, fixed_aln)

    v_query_gene_alignment_str = unfold_cigar(fixed_aln.cigar)
    v_query_gene_alignment_str = collapse_alignment_str(v_query_gene_alignment_str)
    v_query_gene_alignment_str = force_n_terminus_del_ins(fixed_aln, v_query_gene_alignment_str)
    v_query_scheme_alignment_str = _merge_cigars(v_query_gene_alignment_str, v_target_scheme_mapping)

    # The full-coverage check (count of "M" + "I" == len(aa_sequence)) is
    # intentionally not enforced here; see the warning above.
    return v_query_scheme_alignment_str


In [54]:
import os 

from Bio import SeqIO


aa_database_path = "/home/pawel/workspace/riot_na/notebooks/data_processing/data/gene_db/aa_genes"
aa_v_genes_path = os.path.join(aa_database_path, "v_genes")

v_gene_aa_file = os.path.join(aa_v_genes_path, "custom.fasta")

# For every numbering scheme, (re)create scheme_mapping.csv and fill it with the
# inferred mapping of each custom amino-acid V gene.
for scheme in Scheme:

    print(f"Processing scheme: {scheme.value}")
    scheme_mapping_path = f"/home/pawel/workspace/riot_na/notebooks/data_processing/data/scheme_mappings/custom/{scheme.value}/"
    os.makedirs(scheme_mapping_path, exist_ok=True)

    # Keep the path and the open handle under different names so the path
    # variable is not clobbered by a (closed) file object after the loop.
    scheme_mapping_file = os.path.join(scheme_mapping_path, "scheme_mapping.csv")

    with open(scheme_mapping_file, "w") as scheme_mapping_out:
        # csv header
        scheme_mapping_out.write("gene_id,scheme_cigar\n")

        for record in SeqIO.parse(v_gene_aa_file, "fasta"):
            aa_sequence = str(record.seq)
            v_query_scheme_alignment_str = infer_v_gene_scheme_mapping(aa_sequence, scheme)

            # Genes that could not be aligned are skipped (the function already reported them).
            if v_query_scheme_alignment_str is not None:
                scheme_mapping_out.write(f"{record.id},{v_query_scheme_alignment_str}\n")


Processing scheme: kabat
Could not align sequence: EVQLQQSGPELVKPGDSVKISCKASGYSFTGYFMNWVMXEPWKEP*VDWTY*SLQW*YFLQPEVQGQGHIDCRQIL*HSPHGAPEPDI*GLCSLLLCK
Could not find any alignments
Could not align sequence: QLVLTQSSSASFSLGASAKLTCTLSSQHSTYTIEWYQQQPLKPPKYVMELKKDGSHSTGDGIPDRFSGSSSGADRYLSISNIQPEDEAIYICGVGDTIKEQFV*
Could not find alignment covering query sequence
Processing scheme: chothia
Could not align sequence: EVQLQQSGPELVKPGDSVKISCKASGYSFTGYFMNWVMXEPWKEP*VDWTY*SLQW*YFLQPEVQGQGHIDCRQIL*HSPHGAPEPDI*GLCSLLLCK
Could not find any alignments
Could not align sequence: QLVLTQSSSASFSLGASAKLTCTLSSQHSTYTIEWYQQQPLKPPKYVMELKKDGSHSTGDGIPDRFSGSSSGADRYLSISNIQPEDEAIYICGVGDTIKEQFV*
Could not find alignment covering query sequence
Processing scheme: imgt
Could not align sequence: EVQLQQSGPELVKPGDSVKISCKASGYSFTGYFMNWVMXEPWKEP*VDWTY*SLQW*YFLQPEVQGQGHIDCRQIL*HSPHGAPEPDI*GLCSLLLCK
Could not find any alignments
Could not align sequence: QLVLTQSSSASFSLGASAKLTCTLSSQHSTYTIEWYQQQPLKPPKYVMELKKDGSHSTGDGIPDRFSGSSSGA

In [47]:
# Ad-hoc check of a single problematic sequence (contains a stop codon "*").
# Alternative input kept for reference:
# infer_v_gene_scheme_mapping("*VQLQQSGPELVKPGASVKMSCKASGYTFTDYYMHWVKQKPGKGLEWIGEIYPGSGNTYYNEKFKGKATLTADTSSSTAYMQLSSLTSEDSAVYFCAR", Scheme.IMGT)
infer_v_gene_scheme_mapping("QLVLTQSSSASFSLGASAKLTCTLSSQHSTYTIEWYQQQPLKPPKYVMELKKDGSHSTGDGIPDRFSGSSSGADRYLSISNIQPEDEAIYICGVGDTIKEQFV*", Scheme.IMGT)



Could not align sequence: QLVLTQSSSASFSLGASAKLTCTLSSQHSTYTIEWYQQQPLKPPKYVMELKKDGSHSTGDGIPDRFSGSSSGADRYLSISNIQPEDEAIYICGVGDTIKEQFV*
Could not find alignment covering query sequence


In [58]:
# D genes should be just rewritten
database_path = "/home/pawel/workspace/riot_na/notebooks/data_processing/data/gene_db/"
d_genes_path = os.path.join(database_path, "d_genes", "custom")
os.makedirs(d_genes_path, exist_ok=True)

# D alleles only exist for the heavy chain; copy them from the AIRR germline set.
# Header layout for nt D alleles: >{allele_name}\t{species}
d_fasta_path = os.path.join(d_genes_path, "igh.fasta")
with open(d_fasta_path, "w") as d_genes_file:
    for allele_data in igh_data["GermlineSet"][0]["allele_descriptions"]:
        name = allele_data["label"]
        sequence = allele_data["coding_sequence"]
        if allele_data["sequence_type"] != "D":
            continue
        d_genes_file.write(f">{name}\tCUSTOM\n{sequence}\n")




In [103]:
# J genes

from riot_na.data.model import Locus
from riot_na.alignment.gene_aligner import create_aligner
from riot_na.data.model import GermlineGene
from riot_na.alignment.alignment_utils import infer_reading_frame


# Reload the IGH AIRR export so this cell does not depend on earlier cells having run.
with open(data_path + "igh_airr.json", "r") as airr_file:
    igh_data = json.load(airr_file)

database_path = "/home/pawel/workspace/riot_na/notebooks/data_processing/data/gene_db/"
j_genes_path = os.path.join(database_path, "j_genes")
os.makedirs(j_genes_path, exist_ok=True)

j_genes_aa_path = os.path.join(database_path, "aa_genes", "j_genes", "custom")
os.makedirs(j_genes_aa_path, exist_ok=True)

# Reference IGHJ aligners; the custom J alleles are aligned against known
# human and mouse J genes of the same locus to infer their reading frame.
ighj_human_aligner = create_aligner(Organism.HOMO_SAPIENS, germline_gene=GermlineGene.J, locus=Locus.IGH)
ighj_mouse_aligner = create_aligner(Organism.MUS_MUSCULUS, germline_gene=GermlineGene.J, locus=Locus.IGH)

nt_j_fasta_path = os.path.join(j_genes_path, "custom.fasta")
aa_j_fasta_path = os.path.join(j_genes_aa_path, "igh.fasta")

# The nt file is opened in "w" mode here; the IGK/IGL cells append to it afterwards.
with open(nt_j_fasta_path, "w") as j_genes_file, open(aa_j_fasta_path, "w") as j_genes_aa_file:
    for allele_data in igh_data["GermlineSet"][0]["allele_descriptions"]:
        name = allele_data["label"]
        sequence = allele_data["coding_sequence"]
        sequence_type = allele_data["sequence_type"]

        if sequence_type != "J":
            continue

        # Collect candidate alignments from the human aligner first, then the mouse one.
        all_alignments = []
        for reference_aligner in (ighj_human_aligner, ighj_mouse_aligner):
            prefiltering_result = reference_aligner._prefilter(sequence, False)
            candidate_alignments = reference_aligner._align_sequences(sequence, prefiltering_result)
            if candidate_alignments:
                all_alignments.extend(candidate_alignments)

        if not all_alignments:
            print(f"Could not align sequence: {sequence}")
            print("Could not find any alignments")
            continue

        # Best alignment = first element in the alignments' own sort order.
        best_alignment = sorted(all_alignments)[0]

        reading_frame = infer_reading_frame(best_alignment.t_start, best_alignment.reading_frame)
        j_genes_file.write(f">{name}\tIGH\t{reading_frame}\tCUSTOM\n{sequence}\n")

        translated_sequence = translate(sequence, reading_frame)
        j_genes_aa_file.write(f">{name}\tIGH\tCUSTOM\n{translated_sequence}\n")



In [102]:

# IGKJ: infer the reading frame of every kappa J allele by aligning it against the
# known human and mouse IGKJ genes, then append it to the shared nt J-gene file
# and write its translation to the per-locus aa file.

igkj_human_aligner = create_aligner(Organism.HOMO_SAPIENS, germline_gene=GermlineGene.J, locus=Locus.IGK)
igkj_mouse_aligner = create_aligner(Organism.MUS_MUSCULUS, germline_gene=GermlineGene.J, locus=Locus.IGK)

igkj_records = SeqIO.parse(data_path + "igkj.fasta", "fasta")

nt_j_fasta_path = os.path.join(j_genes_path, "custom.fasta")
aa_j_fasta_path = os.path.join(j_genes_aa_path, "igk.fasta")

# "a": the nt file was created by the IGHJ cell and is extended here.
with open(nt_j_fasta_path, "a") as j_genes_file, open(aa_j_fasta_path, "w") as j_genes_aa_file:
    for record in igkj_records:
        name = record.id
        sequence = str(record.seq)

        # Collect candidate alignments from the human aligner first, then the mouse one.
        all_alignments = []
        for reference_aligner in (igkj_human_aligner, igkj_mouse_aligner):
            prefiltering_result = reference_aligner._prefilter(sequence, False)
            candidate_alignments = reference_aligner._align_sequences(sequence, prefiltering_result)
            if candidate_alignments:
                all_alignments.extend(candidate_alignments)

        if not all_alignments:
            print(f"Could not align sequence: {sequence}")
            print("Could not find any alignments")
            continue

        # Best alignment = first element in the alignments' own sort order.
        best_alignment = sorted(all_alignments)[0]

        reading_frame = infer_reading_frame(best_alignment.t_start, best_alignment.reading_frame)
        j_genes_file.write(f">{name}\tIGK\t{reading_frame}\tCUSTOM\n{sequence}\n")

        translated_sequence = translate(sequence, reading_frame)
        j_genes_aa_file.write(f">{name}\tIGK\tCUSTOM\n{translated_sequence}\n")





In [107]:

# IGLJ: same procedure as for the other J loci — infer the reading frame from the
# best alignment against known human/mouse IGLJ genes, append the nt record to the
# shared J-gene file and write the translation to the per-locus aa file.
iglj_human_aligner = create_aligner(Organism.HOMO_SAPIENS, germline_gene=GermlineGene.J, locus=Locus.IGL)
iglj_mouse_aligner = create_aligner(Organism.MUS_MUSCULUS, germline_gene=GermlineGene.J, locus=Locus.IGL)

iglj_records = SeqIO.parse(data_path + "iglj.fasta", "fasta")

nt_j_fasta_path = os.path.join(j_genes_path, "custom.fasta")
aa_j_fasta_path = os.path.join(j_genes_aa_path, "igl.fasta")

# "a": the nt file was created by the IGHJ cell and is extended here.
with open(nt_j_fasta_path, "a") as j_genes_file, open(aa_j_fasta_path, "w") as j_genes_aa_file:
    for record in iglj_records:
        name = record.id
        sequence = str(record.seq)

        # Collect candidate alignments from the human aligner first, then the mouse one.
        all_alignments = []
        for reference_aligner in (iglj_human_aligner, iglj_mouse_aligner):
            prefiltering_result = reference_aligner._prefilter(sequence, False)
            candidate_alignments = reference_aligner._align_sequences(sequence, prefiltering_result)
            if candidate_alignments:
                all_alignments.extend(candidate_alignments)

        if not all_alignments:
            print(f"Could not align sequence: {sequence}")
            print("Could not find any alignments")
            continue

        # Best alignment = first element in the alignments' own sort order.
        best_alignment = sorted(all_alignments)[0]

        reading_frame = infer_reading_frame(best_alignment.t_start, best_alignment.reading_frame)
        j_genes_file.write(f">{name}\tIGL\t{reading_frame}\tCUSTOM\n{sequence}\n")

        translated_sequence = translate(sequence, reading_frame)
        j_genes_aa_file.write(f">{name}\tIGL\tCUSTOM\n{translated_sequence}\n")



In [92]:

from riot_na.alignment.aa_gene_alignments import create_aa_j_gene_aligner
from riot_na.alignment.gene_aligner import GeneAlignerAA
from riot_na.data.model import AlignmentEntryAA


def infer_j_gene_scheme_mapping(aa_sequence, locus: Locus, scheme: Scheme):
    """Infer a numbering-scheme mapping for a custom amino-acid J gene.

    The sequence is aligned against the existing human and mouse amino-acid
    J genes of ``locus``. The alignment covering the longest stretch of the
    query is selected (ties broken by the lowest e-value), and its alignment
    string is merged with the scheme mapping of the matched reference gene.

    Args:
        aa_sequence: Amino-acid sequence of the custom J gene.
        locus: Locus whose reference J genes are used for the alignment.
        scheme: Numbering scheme whose mapping should be inferred.

    Returns:
        The query-to-scheme alignment string, or ``None`` when the sequence
        could not be aligned to any reference gene.
    """
    allowed_species = [Organism.HOMO_SAPIENS, Organism.MUS_MUSCULUS]
    aa_genes_dir = GENE_DB_DIR / "gene_db" / "aa_genes_deduplicated"

    # J aligners are per organism; pool the candidate alignments of all of them.
    all_alignments = []
    for organism in allowed_species:
        j_aligner = create_aa_j_gene_aligner(organism=organism, locus=locus, aa_genes_dir=aa_genes_dir)
        prefiltering_result = j_aligner._prefilter(aa_sequence)
        alignments = j_aligner._align_sequences(aa_sequence, prefiltering_result)

        if alignments:
            all_alignments.extend(alignments)

    if not all_alignments:
        print(f"Could not align sequence: {aa_sequence}")
        print("Could not find any alignments")
        return None

    # Select by covered query length (DESC), then by e-value (ASC).
    j_aln = None
    longest_aln = None

    for aln in all_alignments:
        fixed_aln = force_n_terminus_matches(aln)
        fixed_aln = force_c_terminus_matches(aa_sequence, aln.t_seq, fixed_aln)

        aln_len = fixed_aln.q_end - fixed_aln.q_start
        if longest_aln is None or aln_len > longest_aln:
            longest_aln = aln_len
            j_aln = aln
        elif aln_len == longest_aln and aln.e_value < j_aln.e_value:
            # Equal coverage: prefer the statistically stronger hit. The
            # previous condition repeated the branch above and never ran.
            j_aln = aln

    if longest_aln < len(aa_sequence):
        # Only a warning: the mapping is still inferred from the best partial alignment.
        # NOTE(review): confirm whether partially covered genes should be skipped instead.
        print(f"Could not align sequence: {aa_sequence}")
        print("Could not find alignment covering query sequence")

    j_scheme_mapping_facade = SchemeMappingFacade(scheme, allowed_species, GENE_DB_DIR)
    j_target_scheme_mapping = j_scheme_mapping_facade.get_mapping(j_aln.species, j_aln.target_id)

    fixed_aln = force_n_terminus_matches(j_aln)
    fixed_aln = force_c_terminus_matches(aa_sequence, j_aln.t_seq, fixed_aln)

    j_query_gene_alignment_str = unfold_cigar(fixed_aln.cigar)
    j_query_gene_alignment_str = collapse_alignment_str(j_query_gene_alignment_str)
    j_query_gene_alignment_str = force_n_terminus_del_ins(fixed_aln, j_query_gene_alignment_str)
    j_query_scheme_alignment_str = _merge_cigars(j_query_gene_alignment_str, j_target_scheme_mapping)

    return j_query_scheme_alignment_str


In [108]:
# Ad-hoc check: infer the IMGT mapping for a single lambda J amino-acid sequence.
infer_j_gene_scheme_mapping("RFFFLKWPIVCR", Locus.IGL, Scheme.IMGT)

'MMMMMMMMMMMM'

In [114]:
# Display the directory holding the per-locus amino-acid J-gene FASTA files.
j_genes_aa_path

'/home/pawel/workspace/riot_na/notebooks/data_processing/data/gene_db/aa_genes/j_genes/custom'

In [115]:
# prepare scheme mappings for j genes
# For every scheme/locus pair, infer the mapping of each custom amino-acid J gene
# and append it to the scheme's scheme_mapping.csv.
# NOTE: the file is opened in append mode, so re-running this cell without
# regenerating the V-gene mappings first duplicates the J-gene rows.

for scheme in Scheme:
    for locus in Locus:

        print(f"Processing scheme: {scheme.value} {locus.value}")
        scheme_mapping_path = f"/home/pawel/workspace/riot_na/notebooks/data_processing/data/scheme_mappings/custom/{scheme.value}/"
        os.makedirs(scheme_mapping_path, exist_ok=True)

        # Keep the path and the open handle under different names so the path
        # variable is not clobbered by a (closed) file object after the loop.
        scheme_mapping_file = os.path.join(scheme_mapping_path, "scheme_mapping.csv")

        # scheme mappings are shared with the V genes - do not create a new file
        # and do not write the csv header again
        with open(scheme_mapping_file, "a") as scheme_mapping_out:

            j_gene_aa_records = SeqIO.parse(os.path.join(j_genes_aa_path, f"{locus.value}.fasta"), "fasta")

            for record in j_gene_aa_records:
                aa_sequence = str(record.seq)
                j_query_scheme_alignment_str = infer_j_gene_scheme_mapping(aa_sequence, locus, scheme)

                if j_query_scheme_alignment_str is not None:
                    scheme_mapping_out.write(f"{record.id},{j_query_scheme_alignment_str}\n")

                else:
                    print(f"Could not infer scheme mapping for gene: {record.id}")
        

Processing scheme: kabat igh
Processing scheme: kabat igl
Processing scheme: kabat igk
Processing scheme: chothia igh
Processing scheme: chothia igl
Processing scheme: chothia igk
Processing scheme: imgt igh
Processing scheme: imgt igl
Processing scheme: imgt igk
Processing scheme: martin igh
Processing scheme: martin igl
Processing scheme: martin igk


In [118]:
# copy c genes for nt alignments from riot
c_genes_path = "/home/pawel/workspace/riot_na/notebooks/data_processing/data/gene_db/c_genes/"

os.makedirs(c_genes_path, exist_ok=True)

# Use the raw-content host: the github.com ".../blob/..." URLs return the HTML
# page that renders the file, not the FASTA file itself.
RIOT_C_GENES_RAW_URL = "https://raw.githubusercontent.com/NaturalAntibody/riot_na/master/riot_na/databases/gene_db/c_genes/human/"

for c_gene_file_name in ("igh.fasta", "igk.fasta", "igl.fasta"):
    download_file(RIOT_C_GENES_RAW_URL + c_gene_file_name, c_genes_path + c_gene_file_name)


In [12]:
from riot_na.config import GENE_DB_DIR
from Bio import SeqIO
import pandas as pd
from pathlib import Path
import os


def df_to_fasta(df: pd.DataFrame, output_path: Path):
    """Write the rows of ``df`` to ``output_path`` in FASTA format.

    Each row contributes two lines: ``>`` followed by its ``description``
    value, then its ``sequence`` value. An existing file is overwritten.

    Args:
        df: Frame providing ``description`` and ``sequence`` columns.
        output_path: Destination FASTA file.
    """
    with output_path.open("w") as fasta_out:
        fasta_out.writelines(
            f">{entry.description}\n{entry.sequence}\n" for entry in df.itertuples(index=False)
        )


def deduplicate_genes(input_path) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load a FASTA file and drop records with duplicate sequences.

    Record ids are split on ``*`` into a gene id (first part) and an allele
    (second part). Records are sorted by gene id and allele, and only the first
    record of every distinct sequence is kept.

    Args:
        input_path: FASTA file to read.

    Returns:
        A pair ``(deduplicated_df, first_allele_df)``: the deduplicated
        records, and the first remaining record of each gene indexed by
        ``gene_id``.
    """
    fasta_records = (
        {"allele_id": record.id, "description": record.description, "sequence": str(record.seq)}
        for record in SeqIO.parse(input_path, "fasta")
    )
    df = pd.DataFrame.from_records(fasta_records)

    # "<gene_id>*<allele>" -> separate columns.
    id_parts = df["allele_id"].str.split("*")
    df["allele"] = id_parts.str[1]
    df["gene_id"] = id_parts.str[0]

    deduplicated_df = df.sort_values(["gene_id", "allele"]).drop_duplicates(subset=["sequence"])
    first_allele_df = deduplicated_df.groupby("gene_id").first()
    return deduplicated_df, first_allele_df


AA_GENES_DIR = "/home/pawel/workspace/riot_na/notebooks/data_processing/data/gene_db/aa_genes"
OUTPUT_GENES_DEDUP_DIR = "/home/pawel/workspace/riot_na/notebooks/data_processing/data/gene_db/aa_genes_deduplicated"

aa_genes_root = Path(AA_GENES_DIR)
dedup_root = Path(OUTPUT_GENES_DEDUP_DIR)

# V genes: one combined custom.fasta.
input_path = aa_genes_root / "v_genes" / "custom.fasta"
deduplicated_df, first_allele_df = deduplicate_genes(input_path)
output_dir = dedup_root / "v_genes"
output_dir.mkdir(parents=True, exist_ok=True)
df_to_fasta(deduplicated_df, output_dir / "custom.fasta")

# J genes: one file per locus; each is deduplicated on its own and written
# under the same file name (stem + ".fasta") in the output directory.
for input_path in (aa_genes_root / "j_genes" / "custom").iterdir():
    deduplicated_df, first_allele_df = deduplicate_genes(input_path)
    output_dir = dedup_root / "j_genes" / "custom"
    output_dir.mkdir(parents=True, exist_ok=True)
    df_to_fasta(deduplicated_df, output_dir / (input_path.stem + ".fasta"))


In [14]:
# api use: /home/pawel/workspace/riot_na/notebooks/data_processing/data/gene_db/

from dataclasses import asdict
from riot_na.schemes.scheme_alignment import SchemeAligner
from riot_na.alignment.aa_gene_alignments import create_vj_aligner_aa
from riot_na.api.riot_numbering import RiotNumberingAA
from pathlib import Path

from riot_na.data.model import Organism, Scheme
import json


# Root directory of the custom database assembled by the cells above.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
custom_db_dir = Path("/home/pawel/workspace/riot_na/notebooks/data_processing/data/")

# Scheme aligner restricted to the custom organism.
scheme_alnr = SchemeAligner(db_dir=custom_db_dir, allowed_species=[Organism.CUSTOM])

# Amino-acid V/J aligner over the custom database, combined with the scheme aligner
# into the amino-acid numbering entry point.
vdj_aa_alnr = create_vj_aligner_aa(allowed_species=[Organism.CUSTOM], db_dir=custom_db_dir)
aa_numbering = RiotNumberingAA(vdj_aa_alnr, scheme_alnr)

# Sample heavy-chain amino-acid query used as a smoke test.
AA_QUERY = "EVQVVSGGGVVQPGRSLRLSCTASGFTFSNFAMGWVRQAPGKGLEWVAFISSDGSNKNYGDSVKGRFTISRDNSKNTVFLQMNSLRVEDTALYYYCAKDVGGAFDLWGQGTYMVTVSP"

# Number the query with the IMGT scheme and pretty-print the result as JSON.
aa_sample_result = aa_numbering.run_on_sequence("header", AA_QUERY, Scheme.IMGT)
print(json.dumps(asdict(aa_sample_result), indent=4))


{
    "sequence_header": "header",
    "sequence_aa": "EVQVVSGGGVVQPGRSLRLSCTASGFTFSNFAMGWVRQAPGKGLEWVAFISSDGSNKNYGDSVKGRFTISRDNSKNTVFLQMNSLRVEDTALYYYCAKDVGGAFDLWGQGTYMVTVSP",
    "numbering_scheme": "imgt",
    "locus": "igh",
    "stop_codon": false,
    "productive": true,
    "complete_vdj": true,
    "v_call": "IGHV0-D4D7*00",
    "j_call": "IGHJ0-TXGH*00",
    "germline_alignment_aa": "EVQLVESGGGLVKPGGSLKLSCAASGFTFSDYGMHWVRQAPEKGLEWVAYISSGSSTIYYADTVKGRFTISRDNAKNTLFLQMTSLRSEDTAMYYCARNNNNNNAMDYWGQGTSVTVS",
    "sequence_alignment_aa": "EVQVVSGGGVVQPGRSLRLSCTASGFTFSNFAMGWVRQAPGKGLEWVAFISSDGSNKNYGDSVKGRFTISRDNSKNTVFLQMNSLRVEDTALYYYCAKDVGGAFDLWGQGTYMVTVSP",
    "v_alignment_start_aa": 1,
    "v_alignment_end_aa": 98,
    "j_alignment_start_aa": 103,
    "j_alignment_end_aa": 117,
    "v_sequence_alignment_aa": "EVQVVSGGGVVQPGRSLRLSCTASGFTFSNFAMGWVRQAPGKGLEWVAFISSDGSNKNYGDSVKGRFTISRDNSKNTVFLQMNSLRVEDTALYYYCAK",
    "v_germline_alignment_aa": "EVQLVESGGGLVKPGGSLKLSCAASGFTFSDYGMHWVRQAPEK

In [None]:


# translate nt sequences to amino acids and split to the following structure
# gene_db/aa_genes/v_genes/organism.fasta
# gene_db/aa_genes/j_genes/organism/igh.fasta
# gene_db/aa_genes/j_genes/organism/igk.fasta
# gene_db/aa_genes/j_genes/organism/igl.fasta



In [None]:
# for aa alignments we need to deduplicate the sequences (different alleles can translate to the same aa sequence, in which case we cannot tell which gene is the correct one - so we deduplicate the sequences to avoid choking the prefiltering)
# gene_db/aa_genes_deduplicated/v_genes/organism.fasta
# gene_db/aa_genes_deduplicated/j_genes/organism/igh.fasta
# gene_db/aa_genes_deduplicated/j_genes/organism/igk.fasta
# gene_db/aa_genes_deduplicated/j_genes/organism/igl.fasta


In [None]:
# nt v alleles: >IGHV4-28*07	IGH	0	HOMO_SAPIENS
# nt d alleles: >IGHD2-2*02	HOMO_SAPIENS
# nt j alleles: >IGHJ3*02	IGH	1	HOMO_SAPIENS

# aa v alleles: >IGHV1-45*03	IGH	HOMO_SAPIENS
# aa j alleles: >IGHJ6*03	IGH	HOMO_SAPIENS
