In [1]:
from src.core_variant_translate import CVCTranslator
from src.api.seqrepo_api import SeqRepoAPI
cn = SeqRepoAPI("https://services.genomicmedlab.org/seqrepo")
dp = cn.dp
varTrans = CVCTranslator()



In [None]:
spdi = 'NM_001267550.2:80230:G:A'

print(varTrans.spdi_to_cvc(spdi))

In [3]:
hgvs = "NC_000001.11:g.943043C>T"
l = ["NC_000007.14:g.55181230_55181231insGGCT","NC_000019.10:g.44908822C>T","NC_000007.14:g.55181220del","NC_000023.11:g.32386323delinsGA","NC_000013.11:g.32936732=","NC_000013.11:g.19993838_19993839dup"]

for hgvs in l:
    print(hgvs)
    print(varTrans.hgvs_to_cvc(hgvs))

NC_000007.14:g.55181230_55181231insGGCT
CoreVariantClass(0-based interbase,DNA,,GGCT,55181230,55181231,None,None,None,None,None,NC_000007.14,{})
NC_000019.10:g.44908822C>T
CoreVariantClass(0-based interbase,DNA,C,T,44908821,44908822,None,None,None,None,None,NC_000019.10,{})
NC_000007.14:g.55181220del
CoreVariantClass(0-based interbase,DNA,T,,55181219,55181220,None,None,None,None,None,NC_000007.14,{})
NC_000023.11:g.32386323delinsGA
CoreVariantClass(0-based interbase,DNA,T,GA,32386322,32386323,None,None,None,None,None,NC_000023.11,{})
NC_000013.11:g.32936732=
CoreVariantClass(0-based interbase,DNA,C,C,32936731,32936732,None,None,None,None,None,NC_000013.11,{})
NC_000013.11:g.19993838_19993839dup
CoreVariantClass(0-based interbase,DNA,GT,GTGT,19993837,19993839,None,None,None,None,None,NC_000013.11,{})


In [2]:
import pandas as pd

import hgvs.parser
import hgvs.dataproviders.uta
# Validating variants: composed of two classes: hgvs.validator.IntrinsicValidator and hgvs.validator.ExtrinsicValidator
# Intrinsic validation evaluates a given variant for internal consistency, such as requiring that insertions specify adjacent positions. 
# Extrinsic validation evaluates a variant using external data, such as ensuring that the reference nucleotide in the variant matches that implied by the reference sequence and position.
import hgvs.validator
# Normalization is always 3’ with respect to the reference sequence. 
import hgvs.normalizer
from hgvs.exceptions import HGVSError


hp = hgvs.parser.Parser()
hdp = hgvs.dataproviders.uta.connect()
hn = hgvs.normalizer.Normalizer(hdp)
vr = hgvs.validator.Validator(hdp=hdp)

# hgvs_expr1 = "NC_000001.11:g.943043C>T"#"NM_000097.7:c.814A>C"

# def val_hgvs_expression(expression):
#     parsed_variant = hp.parse_hgvs_variant(hgvs_expr1)
#     if vr.validate(parsed_variant):
#         return parsed_variant 
    
# val_hgvs_expression(hgvs_expr1)

# hgvs_expr1 = "NC_000013.11:g.32936732="#"NM_000097.7:c.814A>C"
# parsed_variant = hp.parse_hgvs_variant(hgvs_expr1)
# if not vr.validate(parsed_variant):
#     raise(ValueError("Invalid HGVS expression: %s" % hgvs_expr1)) 
# # parsed_variant.posedit.edit.ref
# parsed_variant.posedit.edit.alt

In [None]:
l = ["NC_000007.14:g.55181230_55181231insGGCT","NC_000019.10:g.44908822C>T","NC_000007.14:g.55181220del","NC_000023.11:g.32386323delinsGA","NC_000013.11:g.32936732=","NC_000013.11:g.19993838_19993839dup"]
for i in l:
    parsed_variant = hp.parse_hgvs_variant(i)
    print(parsed_variant.posedit.edit.type)

In [None]:
insertion = "NC_000007.14:g.55181230_55181231insGGCT"
insertion_parse = hp.parse_hgvs_variant(insertion)

sequence = dp.get_sequence(insertion_parse.ac,insertion_parse.posedit.pos.start.base,insertion_parse.posedit.pos.end.base)
print(sequence)
print('Insertion Example')
print(f"ALTERNATIVE: {insertion_parse.posedit.edit.alt}, REFERENCE: {insertion_parse.posedit.edit.ref}")


# When the type is insertion: 
    # hgvs.posedit.edit.alt is the alternative sequence inserted at the given position
    # hgvs.posedit.edit.ref gives back a value of NONE, Because we are just referring to just a position.
    # so we need to be able to handle this case.

# hgvs insertion convertion to CVC format
if parsed_variant.posedit.edit.type == "ins":
    # start and end are not change with insertion
    start_pos = parsed_variant.posedit.pos.start.base
    end_pos = parsed_variant.posedit.pos.end.base

    alt_seq = parsed_variant.posedit.edit.alt
    ref_seq = parsed_variant.posedit.edit.ref or ""



In [None]:
# def testoutput(expression):

#     parsed_variant = hp.parse_hgvs_variant(insertion)
#     if parsed_variant.posedit.edit.type == "ins":
#         start_pos = parsed_variant.posedit.pos.start.base
#         end_pos = parsed_variant.posedit.pos.end.base

#         alt_seq = parsed_variant.posedit.edit.alt
#         ref_seq = parsed_variant.posedit.edit.ref or ""
#     return ref_seq

# x = testoutput(insertion)
# if x == "":
#     print('yes')

In [None]:
if insertion_parse.posedit.edit.ref == None:
    print('this is a none value')

In [None]:
sub = "NC_000019.10:g.44908822C>T"
sub_parse = hp.parse_hgvs_variant(sub)
print('Substitution Example')
print(f"ALTERNATIVE: {sub_parse.posedit.edit.alt}, REFERENCE: {sub_parse.posedit.edit.ref}")

if parsed_variant.posedit.edit.type == "sub":
    # The starting position is changed to account for 0-interbase indexing
    start_pos = parsed_variant.posedit.pos.start.base - 1 

    end_pos = parsed_variant.posedit.pos.end.base

    alt_seq = parsed_variant.posedit.edit.alt
    ref_seq = parsed_variant.posedit.edit.ref

In [None]:
dell = "NC_000007.14:g.55181220del"
dell_parse = hp.parse_hgvs_variant(dell)
sequence_del = dp.get_sequence(dell_parse.ac,dell_parse.posedit.pos.start.base-1,dell_parse.posedit.pos.end.base)
print(sequence_del)
print('Deletion Example')
print(f"ALTERNATIVE: {dell_parse.posedit.edit.alt}, REFERENCE: {dell_parse.posedit.edit.ref}")

# reference sequence is whats being deleted. 
# the atlernative sequence is nothing

if parsed_variant.posedit.edit.type == "del":
    # The starting position is changed to account for 0-interbase indexing
    start_pos = parsed_variant.posedit.pos.start.base - 1 
    end_pos = parsed_variant.posedit.pos.end.base

    ref_seq = dp.get_sequence(dell_parse.ac,dell_parse.posedit.pos.start.base-1,dell_parse.posedit.pos.end.base)
    alt_seq = parsed_variant.posedit.edit.alt or ""


In [None]:
delins = "NC_000023.11:g.32386323delinsGA"
delins_parse = hp.parse_hgvs_variant(delins)
sequence_delins_parse = dp.get_sequence(delins_parse.ac,delins_parse.posedit.pos.start.base-1,delins_parse.posedit.pos.end.base)
print(sequence_delins_parse)
print('DeletionInsertion Example')
print(f"ALTERNATIVE: {delins_parse.posedit.edit.alt}, REFERENCE: {delins_parse.posedit.edit.ref}")

if parsed_variant.posedit.edit.type == "delins":
    # The starting position is changed to account for 0-interbase indexing
    start_pos = parsed_variant.posedit.pos.start.base - 1 
    end_pos = parsed_variant.posedit.pos.end.base

    ref_seq = dp.get_sequence(dell_parse.ac,start_pos,end_pos)
    alt_seq = parsed_variant.posedit.edit.alt

In [None]:
identity = "NC_000013.11:g.32936732="
identity_parse = hp.parse_hgvs_variant(identity)
sequence_identity = dp.get_sequence(identity_parse.ac,identity_parse.posedit.pos.start.base-1,identity_parse.posedit.pos.end.base)

print(sequence_identity)
print('Identity Example')
print(f"ALTERNATIVE: {identity_parse.posedit.edit.alt}, REFERENCE: {identity_parse.posedit.edit.ref}")


# if parsed_variant.posedit.edit.type == "identity":
#     # The starting position is changed to account for 0-interbase indexing
#     start_pos = parsed_variant.posedit.pos.start.base - 1 
#     end_pos = parsed_variant.posedit.pos.end.base

#     ref_seq = parsed_variant.posedit.edit.alt
#     alt_seq = parsed_variant.posedit.edit.alt

In [None]:
duplication = "NC_000013.11:g.19993838_19993839dup"
duplication_parse = hp.parse_hgvs_variant(duplication)
sequence_duplication = dp.get_sequence(duplication_parse.ac,duplication_parse.posedit.pos.start.base,duplication_parse.posedit.pos.end.base)

print(sequence_duplication)
print('duplication Example')
# print(f"ALTERNATIVE: {duplication_parse.posedit.edit.alt}, REFERENCE: {duplication_parse.posedit.edit.ref}")


ref = dp.get_sequence(duplication_parse.ac,duplication_parse.posedit.pos.start.base-1,duplication_parse.posedit.pos.end.base)
print(ref)

state = ref + ref
print(state)

if parsed_variant.posedit.edit.type == "dup":
    # The starting position is changed to account for 0-interbase indexing
    start_pos = parsed_variant.posedit.pos.start.base - 1 
    end_pos = parsed_variant.posedit.pos.end.base

    ref_seq = dp.get_sequence(duplication_parse.ac,start_pos,end_pos)
    alt_seq = ref_seq + ref_seq

In [None]:
# HGVS insertion conversion to CVC format
if parsed_variant.posedit.edit.type == "ins":
    # start and end are not change with insertion
    start_pos = parsed_variant.posedit.pos.start.base
    end_pos = parsed_variant.posedit.pos.end.base

    alt_seq = parsed_variant.posedit.edit.alt
    ref_seq = parsed_variant.posedit.edit.ref or ""
    
# HGVS substitution conversion to CVC format
if parsed_variant.posedit.edit.type == "sub":
    # The starting position is changed to account for 0-interbase indexing
    start_pos = parsed_variant.posedit.pos.start.base - 1 

    end_pos = parsed_variant.posedit.pos.end.base

    alt_seq = parsed_variant.posedit.edit.alt
    ref_seq = parsed_variant.posedit.edit.ref
    
# HGVS deletion conversion to CVC format
if parsed_variant.posedit.edit.type == "del":
    # The starting position is changed to account for 0-interbase indexing
    start_pos = parsed_variant.posedit.pos.start.base - 1 
    end_pos = parsed_variant.posedit.pos.end.base

    ref_seq = dp.get_sequence(dell_parse.ac,dell_parse.posedit.pos.start.base-1,dell_parse.posedit.pos.end.base)
    alt_seq = parsed_variant.posedit.edit.alt or ""

# HGVS deletion insertion conversion to CVC format
if parsed_variant.posedit.edit.type == "delins":
    # The starting position is changed to account for 0-interbase indexing
    start_pos = parsed_variant.posedit.pos.start.base - 1 
    end_pos = parsed_variant.posedit.pos.end.base

    ref_seq = dp.get_sequence(dell_parse.ac,start_pos,end_pos)
    alt_seq = parsed_variant.posedit.edit.alt

# HGVS identity conversion to CVC format
if parsed_variant.posedit.edit.type == "identity":
    # The starting position is changed to account for 0-interbase indexing
    start_pos = parsed_variant.posedit.pos.start.base - 1 
    end_pos = parsed_variant.posedit.pos.end.base

    ref_seq = parsed_variant.posedit.edit.alt
    alt_seq = parsed_variant.posedit.edit.alt
    
# HGVS duplication conversion to CVC format
if parsed_variant.posedit.edit.type == "dup":
    # The starting position is changed to account for 0-interbase indexing
    start_pos = parsed_variant.posedit.pos.start.base - 1 
    end_pos = parsed_variant.posedit.pos.end.base

    ref_seq = dp.get_sequence(duplication_parse.ac,start_pos,end_pos)
    alt_seq = ref_seq + ref_seq

In [4]:
# "search_term": "NC_000007.14:g.55181230_55181231insGGCT",

# {
#     "_id": "ga4gh:VA.JKGCs07cFu2wlDydCAe2ea06jMFXyK56",
#     "type": "Allele",
#     "location": {
#     "_id": "ga4gh:VSL.SdvAZCNKh5kf6ClsiOOmw_88fbkFPTqG",
#     "type": "SequenceLocation",
#     "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
#     "interval": {
#         "type": "SequenceInterval",
#         "start": {
#         "type": "Number",
#         "value": 55181230
#         },
#         "end": {
#         "type": "Number",
#         "value": 55181230
#         }
#     }
#     },
#     "state": {
#     "type": "LiteralSequenceExpression",
#     "sequence": "GGCT"
#     }
# }

# "search_term": "NC_000019.10:g.44908822C>T"


# {
#     "_id": "ga4gh:VA.CxiA_hvYbkD8Vqwjhx5AYuyul4mtlkpD",
#     "type": "Allele",
#     "location": {
#     "_id": "ga4gh:VSL.QrRSuBj-VScAGV_gEdxNgsnh41jYH1Kg",
#     "type": "SequenceLocation",
#     "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
#     "interval": {
#         "type": "SequenceInterval",
#         "start": {
#         "type": "Number",
#         "value": 44908821
#         },
#         "end": {
#         "type": "Number",
#         "value": 44908822
#         }
#     }
#     },
#     "state": {
#     "type": "LiteralSequenceExpression",
#     "sequence": "T"
#     }
# }

# "search_term": "NC_000007.14:g.55181220del"

# {
#     "_id": "ga4gh:VA.h6WuolTwZJYZh86qP2a8YVA1WXpHuY_X",
#     "type": "Allele",
#     "location": {
#     "_id": "ga4gh:VSL.kB_ok6Eka0225QwwbOKtvcYZBz7Z0mSR",
#     "type": "SequenceLocation",
#     "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
#     "interval": {
#         "type": "SequenceInterval",
#         "start": {
#         "type": "Number",
#         "value": 55181219
#         },
#         "end": {
#         "type": "Number",
#         "value": 55181220
#         }
#     }
#     },
#     "state": {
#     "type": "LiteralSequenceExpression",
#     "sequence": ""
#     }
# }

# "search_term": "NC_000023.11:g.32386323delinsGA",

# {
#     "_id": "ga4gh:VA.HH3RHjZymrie-09X8aR2SMf1ULMlee6u",
#     "type": "Allele",
#     "location": {
#     "_id": "ga4gh:VSL.tS45HvJapFexhxmbHe6SBn7dGuC46sni",
#     "type": "SequenceLocation",
#     "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP",
#     "interval": {
#         "type": "SequenceInterval",
#         "start": {
#         "type": "Number",
#         "value": 32386322
#         },
#         "end": {
#         "type": "Number",
#         "value": 32386323
#         }
#     }
#     },
#     "state": {
#     "type": "LiteralSequenceExpression",
#     "sequence": "GA"
#     }
# }

# "search_term": "NC_000013.11:g.32936732="
# {
#     "_id": "ga4gh:VA.DkZLLMnwoH6zIncSRh2c05nzCNLdTqHl",
#     "type": "Allele",
#     "location": {
#     "_id": "ga4gh:VSL.iSZclbNW8T95cXDuNvLMvm6xJd2g4pTn",
#     "type": "SequenceLocation",
#     "sequence_id": "ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
#     "interval": {
#         "type": "SequenceInterval",
#         "start": {
#         "type": "Number",
#         "value": 32936731
#         },
#         "end": {
#         "type": "Number",
#         "value": 32936732
#         }
#     }
#     },
#     "state": {
#     "type": "LiteralSequenceExpression",
#     "sequence": "C"
#     }
# }


"search_term": "NC_000013.11:g.19993838_19993839dup",

{
    "_id": "ga4gh:VA.S3eUS2hlp6q4pSv4u2CbN0OPMusMUnHZ",
    "type": "Allele",
    "location": {
    "_id": "ga4gh:VSL.nW80UuWsc9bgMncP24FLII6qx8aouNki",
    "type": "SequenceLocation",
    "sequence_id": "ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
    "interval": {
        "type": "SequenceInterval",
        "start": {
        "type": "Number",
        "value": 19993837
        },
        "end": {
        "type": "Number",
        "value": 19993839
        }
    }
    },
    "state": {
    "type": "LiteralSequenceExpression",
    "sequence": "GTGT"
    }
}


SyntaxError: illegal target for annotation (3203764587.py, line 1)

In [13]:
# ref_sequence_id = translated_sequence_ids[0].split(":")[1]
reference_allele = str(
    dp.get_sequence(
        'NC_000013.11',
        19993837,
        19993839,
    )
)
reference_allele
# TODO: look into this: https://github.com/ga4gh/vrs-python/blob/593508c6e8229336ca1f53a06f69966020cd68f7/src/ga4gh/vrs/extras/translator.py#L412
# alternative_allele = (str(expression.state.sequence) if expression.state.sequence else None)


'GT'