This package, hgvs, is an easy-to-use Python library for parsing, representing, formatting, and mapping variants between genome, transcript, and protein sequences.

In [1]:
import pandas as pd

import hgvs.parser
import hgvs.dataproviders.uta
# Variant validation is composed of two classes: hgvs.validator.IntrinsicValidator and hgvs.validator.ExtrinsicValidator.
# Intrinsic validation evaluates a given variant for internal consistency, such as requiring that insertions specify adjacent positions. 
# Extrinsic validation evaluates a variant using external data, such as ensuring that the reference nucleotide in the variant matches that implied by the reference sequence and position.
import hgvs.validator
# Normalization is always 3’ with respect to the reference sequence. 
import hgvs.normalizer
from hgvs.exceptions import HGVSError


# Shared helper objects reused by the cells below.
# hp: parser whose parse_hgvs_variant() is called on HGVS expression strings.
hp = hgvs.parser.Parser()
# hdp: data provider handle returned by uta.connect(); presumably a connection to a UTA database — TODO confirm the target.
hdp = hgvs.dataproviders.uta.connect()
# hn: normalizer built on the data provider; not referenced by any other cell visible in this notebook.
hn = hgvs.normalizer.Normalizer(hdp)
# vr: validator built on the same data provider; its validate() is called on parsed variants.
vr = hgvs.validator.Validator(hdp=hdp)

In [2]:
def validate_hgvs_variants(expression):
    """Parse and validate a single HGVS expression.

    Uses the notebook-level parser ``hp`` and validator ``vr``.

    Returns:
        The unchanged ``expression`` when parsing and validation raise no
        HGVSError; otherwise the caught HGVSError instance itself (it is
        returned, not raised).

    NOTE: a later cell redefines ``validate_hgvs_variants`` with a list-based
    signature, which replaces this definition once that cell has run.
    """
    try:
        vr.validate(hp.parse_hgvs_variant(expression))
    except HGVSError as err:
        return err
    return expression


# Demo call with a malformed expression; the cell displays whatever is returned.
validate_hgvs_variants('NC_000001.10:g.>A')

In [171]:
# Spreadsheet holding the original and hand-edited HGVS expressions.
excel_file = '../data/LabHgvsExpression.xlsx'

# The context manager closes the workbook once the sheet has been read.
with pd.ExcelFile(excel_file) as hgvs_path:
    input_data = pd.read_excel(hgvs_path)

# Trim surrounding whitespace in both expression columns. The misspelled
# column names are the keys used throughout this notebook, so they are kept.
for column in ('oringal_hgvs_expression_example', 'edited_hgvs_expression_expample'):
    input_data[column] = input_data[column].str.strip()

# `data` is the frame the remaining cells work with.
data = pd.DataFrame(input_data)



# NOTE: Boolean values were obtained from: https://mutalyzer.nl/
# Description: The Normalizer takes a variant description as input and checks whether it is correct.


In [172]:
data

Unnamed: 0,oringal_hgvs_expression_example,edited_hgvs_expression_expample,mutalyzer_original_hgvs_checker,mutalyzer_edited_hgvs_checker
0,NM_001267550.2:c.80006G>A (p.Ser26669Asn),NM_001267550.2:c.80006G>A,False,True
1,c.80006G>A (p.Ser26669Asn),c.80006G>A,False,False
2,variant in the TTN(NM_001267550.2) gene,TTN(NM_001267550.2),False,False
3,NM_001267550.2(TTN):c.80006G>A (p.Ser26669Asn),NM_001267550.2(TTN):c.80006G>A,False,True
4,NM_001267550.2:c.80006G>A p.Ser26669Asn,NM_001267550.2:c.80006G>A,False,True
5,NM_001267550.2 p.Ser26669Asn,NM_001267550.2:p.Ser26669Asn,False,True
6,NM_000059.4:c.5737T>C (p.Cys1913Arg),NM_000059.4:c.5737T>C,False,True
7,c.5737T>C (p.C1913R) in BRCA2,c.5737T>C,False,False
8,BRCA2:c.5737T>C p.C1913R,BRCA2:p.C1913R,False,False
9,BRCA2 c.5737T>C,BRCA2:c.5737T>C,False,False


In [173]:
# Pull the two expression columns out of `data`. The misspelled column names
# ('oringal', 'expample') are the actual keys of the frame displayed above, so
# they must stay exactly as written.
original_expression = data['oringal_hgvs_expression_example']
edited_expression = data['edited_hgvs_expression_expample']

In [174]:
def validate_hgvs_variants(hgvs_list):
    """Parse and validate each HGVS expression, recording a flag and a message per input.

    Uses the notebook-level parser ``hp`` and validator ``vr``.

    Args:
        hgvs_list: iterable of HGVS expression strings (the notebook passes
            DataFrame columns).

    Returns:
        A tuple ``(boolean_checker, error_messages)`` of two lists in input order:
        ``boolean_checker[i]`` is True when expression ``i`` parsed and validated
        without an HGVSError, False otherwise; ``error_messages[i]`` is
        ``'Passed'`` on success or the text of the HGVSError on failure.
    """
    # Both result lists must exist before the loop: the function returns
    # `boolean_checker`, so leaving it undefined raises NameError on a fresh kernel.
    boolean_checker = []
    error_messages = []

    # The loop variable is deliberately not called `hgvs`, which would shadow
    # the hgvs package inside this function.
    for expression in hgvs_list:
        try:
            vr.validate(hp.parse_hgvs_variant(expression))
        except HGVSError as e:
            boolean_checker.append(False)
            # Store the message text so the resulting column holds plain
            # strings rather than exception objects.
            error_messages.append(str(e))
        else:
            boolean_checker.append(True)
            error_messages.append('Passed')

    return boolean_checker, error_messages

In [175]:
# Validate the original and the hand-edited expressions with biocommons hgvs.
# Each call yields a (boolean flags, messages) pair.
original_boolean_checker, original_error_messages = validate_hgvs_variants(
    original_expression
)
edited_boolean_checker, edited_error_messages = validate_hgvs_variants(
    edited_expression
)

# Only the message lists become columns; the boolean flags are unpacked above
# but are not added to the results table.
hgvs_results = {}
hgvs_results['biocommons_hgvs_original_hgvs_error_messages'] = original_error_messages
hgvs_results['biocommons_hgvs_edited_hgvs_error_messages'] = edited_error_messages

biocommons_hgvs_test = pd.DataFrame(hgvs_results)

In [176]:
biocommons_hgvs_test

Unnamed: 0,biocommons_hgvs_original_hgvs_error_messages,biocommons_hgvs_edited_hgvs_error_messages
0,NM_001267550.2:c.80006G>A (p.Ser26669Asn): cha...,Passed
1,c.80006G>A (p.Ser26669Asn): char 1: expected a...,c.80006G>A: char 1: expected a letter or digit
2,variant in the TTN(NM_001267550.2) gene: char ...,TTN(NM_001267550.2): char 6: expected one of '...
3,NM_001267550.2(TTN):c.80006G>A (p.Ser26669Asn)...,Passed
4,NM_001267550.2:c.80006G>A p.Ser26669Asn: char ...,Passed
5,NM_001267550.2 p.Ser26669Asn: char 14: expecte...,Accession (NM_001267550.2) is not compatible w...
6,NM_000059.4:c.5737T>C (p.Cys1913Arg): char 21:...,No transcript definition for (tx_ac=NM_000059.4)
7,c.5737T>C (p.C1913R) in BRCA2: char 1: expecte...,c.5737T>C: char 1: expected a letter or digit
8,BRCA2:c.5737T>C p.C1913R: char 15: expected EOF,Accession (BRCA2) is not known to be compatibl...
9,"BRCA2 c.5737T>C: char 5: expected one of '(', ...",Accession (BRCA2) is not known to be compatibl...


In [177]:
# Put the biocommons messages next to the input and mutalyzer columns,
# keeping the existing row index.
frames = [data, biocommons_hgvs_test]
result = pd.concat(frames, axis='columns')
result

Unnamed: 0,oringal_hgvs_expression_example,edited_hgvs_expression_expample,mutalyzer_original_hgvs_checker,mutalyzer_edited_hgvs_checker,biocommons_hgvs_original_hgvs_error_messages,biocommons_hgvs_edited_hgvs_error_messages
0,NM_001267550.2:c.80006G>A (p.Ser26669Asn),NM_001267550.2:c.80006G>A,False,True,NM_001267550.2:c.80006G>A (p.Ser26669Asn): cha...,Passed
1,c.80006G>A (p.Ser26669Asn),c.80006G>A,False,False,c.80006G>A (p.Ser26669Asn): char 1: expected a...,c.80006G>A: char 1: expected a letter or digit
2,variant in the TTN(NM_001267550.2) gene,TTN(NM_001267550.2),False,False,variant in the TTN(NM_001267550.2) gene: char ...,TTN(NM_001267550.2): char 6: expected one of '...
3,NM_001267550.2(TTN):c.80006G>A (p.Ser26669Asn),NM_001267550.2(TTN):c.80006G>A,False,True,NM_001267550.2(TTN):c.80006G>A (p.Ser26669Asn)...,Passed
4,NM_001267550.2:c.80006G>A p.Ser26669Asn,NM_001267550.2:c.80006G>A,False,True,NM_001267550.2:c.80006G>A p.Ser26669Asn: char ...,Passed
5,NM_001267550.2 p.Ser26669Asn,NM_001267550.2:p.Ser26669Asn,False,True,NM_001267550.2 p.Ser26669Asn: char 14: expecte...,Accession (NM_001267550.2) is not compatible w...
6,NM_000059.4:c.5737T>C (p.Cys1913Arg),NM_000059.4:c.5737T>C,False,True,NM_000059.4:c.5737T>C (p.Cys1913Arg): char 21:...,No transcript definition for (tx_ac=NM_000059.4)
7,c.5737T>C (p.C1913R) in BRCA2,c.5737T>C,False,False,c.5737T>C (p.C1913R) in BRCA2: char 1: expecte...,c.5737T>C: char 1: expected a letter or digit
8,BRCA2:c.5737T>C p.C1913R,BRCA2:p.C1913R,False,False,BRCA2:c.5737T>C p.C1913R: char 15: expected EOF,Accession (BRCA2) is not known to be compatibl...
9,BRCA2 c.5737T>C,BRCA2:c.5737T>C,False,False,"BRCA2 c.5737T>C: char 5: expected one of '(', ...",Accession (BRCA2) is not known to be compatibl...


In [178]:
# Write the combined comparison table to disk; index=False leaves the row index out of the CSV.
result.to_csv('../data/results.csv', index=False)

In [169]:
import hgvs.validator
import hgvs.exceptions

# Spot check of a single transcript variant: print the HGVS error, if any.
# A fresh validator is bound to `vr`, built on the existing data provider `hdp`.
vr = hgvs.validator.Validator(hdp=hdp)
try:
    # Parsing stays inside the try so parse errors are reported as well.
    single_variant = hp.parse_hgvs_variant('NM_000371.4:c.220G>C')
    vr.validate(single_variant)
except hgvs.exceptions.HGVSError as err:
    print(err)


No transcript definition for (tx_ac=NM_000371.4)


In [None]:
# Intrinsic (internal-consistency) validation of a few genomic expressions.
# NOTE(review): the second expression has '.' instead of ':' before 'g' —
# presumably a deliberately malformed example; confirm.
hgvs_list = ['NC_000017.11:g.43091687delC', 'NC_000007.13.g.21726874G>A']

parser = hgvs.parser.Parser()
validator = hgvs.validator.IntrinsicValidator()

# The loop variable is not named `hgvs`: at notebook top level that would
# rebind the global `hgvs` package name to a string.
for expression in hgvs_list:
    # try/except sits inside the loop so one bad expression is reported
    # without skipping the expressions that follow it.
    try:
        variant = parser.parse_hgvs_variant(expression)
        validator.validate(variant)
    except HGVSError as e:
        print(e)

import hgvs.validator
import hgvs.exceptions

# Spot check of one expression written with a gene symbol in parentheses:
# print the HGVS error, if any.
# A fresh validator is bound to `vr`, built on the existing data provider `hdp`.
vr = hgvs.validator.Validator(hdp=hdp)
try:
    # Parsing stays inside the try so parse errors are reported as well.
    ttn_variant = hp.parse_hgvs_variant('NM_001267550.2(TTN):c.80006G>A')
    vr.validate(ttn_variant)
except hgvs.exceptions.HGVSError as err:
    print(err)


# These imports repeat the ones in the first cell; they are kept so this
# snippet still runs if pasted on its own.
from hgvs.exceptions import HGVSError
import hgvs.parser
import hgvs.validator

# NOTE(review): the second expression has '.' instead of ':' before 'g' —
# presumably a deliberately malformed example; confirm.
hgvs_list = ('NC_000017.11:g.43091687delC', 'NC_000007.13.g.21726874G>A')

# Create a validator instance
vr = hgvs.validator.Validator(hdp=hdp)

# Create a parser instance
hp = hgvs.parser.Parser()

# The loop variable is not named `hgvs`: at notebook top level that would
# rebind the global `hgvs` package name to a string and break later cells.
for expression in hgvs_list:
    # try/except sits inside the loop so every expression gets a verdict,
    # instead of the first failure ending the whole run.
    try:
        vr.validate(hp.parse_hgvs_variant(expression))
    except HGVSError as e:
        print(e)
    else:
        print('expression passed: {}'.format(expression))
