This package, hgvs, is an easy-to-use Python library for parsing, representing, formatting, and mapping variants between genome, transcript, and protein sequences.

In [1]:
import pandas as pd

import hgvs.parser
import hgvs.dataproviders.uta
# Validating variants: composed of two classes: hgvs.validator.IntrinsicValidator and hgvs.validator.ExtrinsicValidator
# Intrinsic validation evaluates a given variant for internal consistency, such as requiring that insertions specify adjacent positions. 
# Extrinsic validation evaluates a variant using external data, such as ensuring that the reference nucleotide in the variant matches that implied by the reference sequence and position.
import hgvs.validator
# Normalization is always 3’ with respect to the reference sequence. 
import hgvs.normalizer
from hgvs.exceptions import HGVSError


hp = hgvs.parser.Parser()
hdp = hgvs.dataproviders.uta.connect()
hn = hgvs.normalizer.Normalizer(hdp)
vr = hgvs.validator.Validator(hdp=hdp)

In [8]:
def validate_hgvs_variants(expression):
    try:
        parsed_variant = hp.parse_hgvs_variant(expression)
        vr.validate(parsed_variant)
        return expression
    except HGVSError as e:
        return e
validate_hgvs_variants('NM_004343.4:c.1154_1155ins5')

hgvs.exceptions.HGVSParseError('NM_004343.4:c.1154_1155ins5: char 26: Syntax error')

In [None]:
excel_file = '../data/LabHgvsExpression.xlsx'

with pd.ExcelFile(excel_file) as hgvs_path:
    input_data = pd.read_excel(hgvs_path)

input_data['oringal_hgvs_expression_example'] = input_data['oringal_hgvs_expression_example'].str.strip()
input_data['edited_hgvs_expression_expample'] = input_data['edited_hgvs_expression_expample'].str.strip()

data = pd.DataFrame(input_data)



#NOTE: Boolean Values where obtained from: https://mutalyzer.nl/
# Description: The Normalizer takes a variant description as input and checks whether it is correct.


In [None]:
data

In [None]:
original_expression = data['oringal_hgvs_expression_example']
edited_expression = data['edited_hgvs_expression_expample']

In [None]:
def validate_hgvs_variants(hgvs_list):
    # boolean_checker = []
    error_messages = []

    for hgvs in hgvs_list:
        try:
            parsed_variant = hp.parse_hgvs_variant(hgvs)
            vr.validate(parsed_variant)
            # boolean_checker.append()
            error_messages.append('Passed')
        except HGVSError as e:
            # boolean_checker.append(False)
            error_messages.append(e) 
    
    return boolean_checker, error_messages

In [None]:
original_boolean_checker, original_error_messages = validate_hgvs_variants(original_expression)
edited_boolean_checker, edited_error_messages = validate_hgvs_variants(edited_expression)

hgvs_results = {
    # 'biocommons_hgvs_original_hgvs_checker':original_boolean_checker,
    'biocommons_hgvs_original_hgvs_error_messages':original_error_messages,
    # 'biocommons_hgvs_edited_hgvs_checker':edited_boolean_checker,
    'biocommons_hgvs_edited_hgvs_error_messages':edited_error_messages
}

biocommons_hgvs_test = pd.DataFrame(hgvs_results)

In [None]:
biocommons_hgvs_test

In [None]:
result = pd.concat([data, biocommons_hgvs_test], axis=1, ignore_index=False)
result

In [None]:
result.to_csv('../data/results.csv', index=False)

In [5]:
import pandas as pd

import hgvs.parser
import hgvs.dataproviders.uta
import hgvs.validator
from hgvs.exceptions import HGVSError

hp = hgvs.parser.Parser()
hdp = hgvs.dataproviders.uta.connect()
vr = hgvs.validator.Validator(hdp=hdp)

# TODO:open different type of files. If the file format doesn't work then throw an error.
excel_file = '../data/LabHgvsExpression.xlsx'
with pd.ExcelFile(excel_file) as hgvs_path:
    input_data = pd.read_excel(hgvs_path)


def validate_hgvs_variants(hgvs_list):
    error_messages = []

    for hgvs in hgvs_list:
        try:
            parsed_variant = hp.parse_hgvs_variant(hgvs)
            vr.validate(parsed_variant)
            error_messages.append(True)
        except HGVSError as e:
            error_messages.append(e) 
    
    return error_messages

#TODO:user inputes the column name of there hgvs expression
input_data['oringal_hgvs_expression_example'] = input_data['oringal_hgvs_expression_example'].str.strip()

data = pd.DataFrame(input_data)
hgvs_expression = data['edited_hgvs_expression_expample']

original_error_messages = validate_hgvs_variants(hgvs_expression)

#TODO: create output file.
hgvs_results = {'HGVS':data['oringal_hgvs_expression_example'],
                'biocommons_validator':original_error_messages}
biocommons_hgvs_test = pd.DataFrame(hgvs_results)
biocommons_hgvs_test

Unnamed: 0,HGVS,biocommons_validator
0,NM_001267550.2:c.80006G>A (p.Ser26669Asn),True
1,c.80006G>A (p.Ser26669Asn),c.80006G>A: char 1: expected a letter or digit
2,variant in the TTN(NM_001267550.2) gene,TTN(NM_001267550.2): char 6: expected one of '...
3,NM_001267550.2(TTN):c.80006G>A (p.Ser26669Asn),NM_001267550.2(TTN):c.80006G>A : char 30: expe...
4,NM_001267550.2:c.80006G>A p.Ser26669Asn,True
5,NM_001267550.2 p.Ser26669Asn,Accession (NM_001267550.2) is not compatible w...
6,NM_000059.4:c.5737T>C (p.Cys1913Arg),No transcript definition for (tx_ac=NM_000059.4)
7,c.5737T>C (p.C1913R) in BRCA2,c.5737T>C : char 1: expected a letter or digit
8,BRCA2:c.5737T>C p.C1913R,BRCA2:p.C1913R : char 15: expected EOF
9,BRCA2 c.5737T>C,Accession (BRCA2) is not known to be compatibl...


In [None]:
import hgvs.validator
import hgvs.exceptions
vr = hgvs.validator.Validator(hdp=hdp)
try:
    vr.validate( hp.parse_hgvs_variant('NM_000371.4:c.220G>C') )
except hgvs.exceptions.HGVSError as e:
    print(e)


In [None]:
hgvs_list = ['NC_000017.11:g.43091687delC', 'NC_000007.13.g.21726874G>A']

try:
    parser = hgvs.parser.Parser()
    validator = hgvs.validator.IntrinsicValidator()
    
    for hgvs in hgvs_list:
        variant = parser.parse_hgvs_variant(hgvs)
        validator.validate(variant)
except Exception as e:
    print(e)

import hgvs.validator
import hgvs.exceptions
vr = hgvs.validator.Validator(hdp=hdp)
try:
    vr.validate( hp.parse_hgvs_variant('NM_001267550.2(TTN):c.80006G>A') )
except hgvs.exceptions.HGVSError as e:
    print(e)


from hgvs.exceptions import HGVSError
import hgvs.parser
import hgvs.validator

hgvs_list = ('NC_000017.11:g.43091687delC', 'NC_000007.13.g.21726874G>A')

# Create a validator instance
vr = hgvs.validator.Validator(hdp=hdp)

# Create a parser instance
hp = hgvs.parser.Parser()

try:
    for hgvs in hgvs_list:
        vr.validate(hp.parse_hgvs_variant(hgvs))
        print('expression passed: {}'.format(hgvs))
except HGVSError as e:
    print(e)


In [14]:
import pandas as pd
import hgvs.parser
import hgvs.dataproviders.uta
import hgvs.validator
from hgvs.exceptions import HGVSError

# import sys
# sys.path.append('..')

hp = hgvs.parser.Parser()
hdp = hgvs.dataproviders.uta.connect()
vr = hgvs.validator.Validator(hdp=hdp)

excel_file = '/Users/M278428/Documents/rf_lab_projects/DraftCoreDataModel/data/finaltestdata.xlsx'

In [15]:
input_data = pd.read_excel(excel_file)

In [16]:
input_data

Unnamed: 0,hgvs_expression,human_curated,Failure Reason
0,"NM_001267550.2:c.80006G>A, p.(Ser26669Asn)",pass,
1,"NM_000059.4:c.5737T>C, p.(Cys1913Arg)",pass,
2,"NM_000371.4:c.220G>C, p.(Glu74Lys)",pass,
3,"NM_000371.4:c.424G>A, p.(Val142Ile)",pass,
4,"NM_000371.4:c.148G>A, p.(Val50Met)",pass,
...,...,...,...
96,NP_001120680.1:p.Asp1314Metfs*49;[Pro1530_Gln1...,fail,"no refseq, incorrect format for mosaic"
97,CALR c.1154insTTGTC (mosaic),fail,"no refseq, incorrect allelic formatting"
98,EZH2: c.2196A>G (homozygous),fail,mismatch between refseq and variant type (mito...
99,NC_012920.1:c.1555A>G,fail,incorrect punctuation (should use brackets for...


In [17]:
input_data['hgvs_expression'] = input_data['hgvs_expression'].str.strip()

In [20]:
hgvsExamples = input_data['hgvs_expression']
hgvsExamples

0             NM_001267550.2:c.80006G>A, p.(Ser26669Asn)
1                  NM_000059.4:c.5737T>C, p.(Cys1913Arg)
2                     NM_000371.4:c.220G>C, p.(Glu74Lys)
3                    NM_000371.4:c.424G>A, p.(Val142Ile)
4                     NM_000371.4:c.148G>A, p.(Val50Met)
                             ...                        
96     NP_001120680.1:p.Asp1314Metfs*49;[Pro1530_Gln1...
97                          CALR c.1154insTTGTC (mosaic)
98                          EZH2: c.2196A>G (homozygous)
99                                 NC_012920.1:c.1555A>G
100                         NC_000023:g.147912051CGG(33)
Name: hgvs_expression, Length: 101, dtype: object

In [22]:
def validate_hgvs_variants(hgvs_list):
    error_messages = []

    for hgvs in hgvs_list:
        try:
            parsed_variant = hp.parse_hgvs_variant(hgvs)
            vr.validate(parsed_variant)
            error_messages.append(True)
        except HGVSError as e:
            error_messages.append(e) 
    
    return error_messages

In [24]:
results = validate_hgvs_variants(hgvsExamples)
results

[hgvs.exceptions.HGVSParseError('NM_001267550.2:c.80006G>A, p.(Ser26669Asn): char 25: expected EOF'),
 hgvs.exceptions.HGVSParseError('NM_000059.4:c.5737T>C, p.(Cys1913Arg): char 21: expected EOF'),
 hgvs.exceptions.HGVSParseError('NM_000371.4:c.220G>C, p.(Glu74Lys): char 20: expected EOF'),
 hgvs.exceptions.HGVSParseError('NM_000371.4:c.424G>A, p.(Val142Ile): char 20: expected EOF'),
 hgvs.exceptions.HGVSParseError('NM_000371.4:c.148G>A, p.(Val50Met): char 20: expected EOF'),
 hgvs.exceptions.HGVSParseError('NM_000097.7:c.814A>C, p.(Asn272His): char 20: expected EOF'),
 hgvs.exceptions.HGVSParseError('NM_007194.4:c.470T>C, p.(Ile157Thr): char 20: expected EOF'),
 hgvs.exceptions.HGVSParseError('NM_001122764.3:c.767C>G, p.(Pro256Arg): char 23: expected EOF'),
 hgvs.exceptions.HGVSParseError('NM_004333.6:c.1799T>A, p.(Val600Glu): char 21: expected EOF'),
 hgvs.exceptions.HGVSParseError('NM_000077.5:c.335G>C, p.(Arg112Pro): char 20: expected EOF'),
 hgvs.exceptions.HGVSParseError("NM_000

In [27]:
hgvs_results = {
    'HGVS': hgvsExamples,
    'Validator': results
}
result = pd.DataFrame(hgvs_results) 
result

Unnamed: 0,HGVS,Validator
0,"NM_001267550.2:c.80006G>A, p.(Ser26669Asn)","NM_001267550.2:c.80006G>A, p.(Ser26669Asn): ch..."
1,"NM_000059.4:c.5737T>C, p.(Cys1913Arg)","NM_000059.4:c.5737T>C, p.(Cys1913Arg): char 21..."
2,"NM_000371.4:c.220G>C, p.(Glu74Lys)","NM_000371.4:c.220G>C, p.(Glu74Lys): char 20: e..."
3,"NM_000371.4:c.424G>A, p.(Val142Ile)","NM_000371.4:c.424G>A, p.(Val142Ile): char 20: ..."
4,"NM_000371.4:c.148G>A, p.(Val50Met)","NM_000371.4:c.148G>A, p.(Val50Met): char 20: e..."
...,...,...
96,NP_001120680.1:p.Asp1314Metfs*49;[Pro1530_Gln1...,NP_001120680.1:p.Asp1314Metfs*49;[Pro1530_Gln1...
97,CALR c.1154insTTGTC (mosaic),CALR c.1154insTTGTC (mosaic): char 4: expected...
98,EZH2: c.2196A>G (homozygous),EZH2: c.2196A>G (homozygous): char 5: expected...
99,NC_012920.1:c.1555A>G,Accession (NC_012920.1) is not compatible with...
