In [1]:
# import sys
# sys.path.append('..')

In [2]:
# Import SPDI class 
from src.spdi.spdi_class import SPDI

# SPDI translation method
from src.spdi.spdi_utils import SPDITranslate
spdiTranslate = SPDITranslate()



#### Creating an SPDI Expression
* spdi_class.py creates SPDI objects, running validation steps that check all four attributes (sequence, position, deletion, insertion).

In [3]:
# Example data used below to demonstrate some of the functionality of spdi_class.py.
# The same kind of data written as SPDI strings, kept for reference:
# spdiList = ['NM_001267550.2:80230:G:A','NM_001256850.1:1365:G:A','NC_012920.1:1554:A:G','NM_000097.7:920:A:C']
# spdiList = ['NC_000007.14:55181230::GGCT', 'NC_000019.10:44908821:C:T', 'NC_000007.14:55181219:T:', 'NC_000023.11:32386322:T:GA','NC_000013.11:32936731:C:C', 'NC_000013.11:19993837:GT:GTGT']

spdiList = [
    {'sequence': 'NC_000007.14', 'position': '55181230', 'deletion': '', 'insertion': 'GGCT'},
    {'sequence': 'NC_000019.10', 'position': '44908821', 'deletion': 'C', 'insertion': 'T'},
    {'sequence': 'NC_000007.14', 'position': '55181219', 'deletion': 'T', 'insertion': ''},
    {'sequence': 'NC_000023.11', 'position': '32386322', 'deletion': 'T', 'insertion': 'GA'},
    {'sequence': 'NC_000013.11', 'position': '32936731', 'deletion': 'C', 'insertion': 'C'},
    {'sequence': 'NC_000013.11', 'position': '19993837', 'deletion': 'GT', 'insertion': 'GTGT'},
]

# One SPDI object per record; each record's keys are passed as keyword arguments.
spdiObj = [SPDI(**record) for record in spdiList]

#### Methods in spdi_class.py that convert an SPDI object to a string or a dictionary.

In [4]:
# Render every SPDI object as text with the SPDI class method to_string().
# Layout of the resulting string: sequence:position:deletion:insertion
spdiExamples = [spdi_object.to_string() for spdi_object in spdiObj]
print(spdiExamples)

['NC_000007.14:55181230::GGCT', 'NC_000019.10:44908821:C:T', 'NC_000007.14:55181219:T:', 'NC_000023.11:32386322:T:GA', 'NC_000013.11:32936731:C:C', 'NC_000013.11:19993837:GT:GTGT']


In [5]:
# Taking each SPDI object and converting it to a dictionary with the SPDI class method: to_dict()
# One dictionary is printed per object, in the same order as spdiObj.
for spdi in spdiObj:
    print(spdi.to_dict())

{'sequence': 'NC_000007.14', 'position': '55181230', 'deletion': '', 'insertion': 'GGCT'}
{'sequence': 'NC_000019.10', 'position': '44908821', 'deletion': 'C', 'insertion': 'T'}
{'sequence': 'NC_000007.14', 'position': '55181219', 'deletion': 'T', 'insertion': ''}
{'sequence': 'NC_000023.11', 'position': '32386322', 'deletion': 'T', 'insertion': 'GA'}
{'sequence': 'NC_000013.11', 'position': '32936731', 'deletion': 'C', 'insertion': 'C'}
{'sequence': 'NC_000013.11', 'position': '19993837', 'deletion': 'GT', 'insertion': 'GTGT'}


#### SPDI Translation
* spdi_utils.py is designed to translate SPDI expressions to HGVS expressions and VRS expressions.
* This module is able to perform these translations by utilizing outside resources such as the vrs-python translator module and the NCBI Variation Services API.

In [6]:
# Taking each SPDI string and converting it to a right-shifted HGVS expression.
# Only the expression is passed here, so the optional arguments shown in the next cell are left at their defaults.
for spdi in spdiExamples:
    print(spdiTranslate.from_spdi_to_rightshift_hgvs(spdi))

NC_000007.14:g.55181230_55181231insGGCT
NC_000019.10:g.44908822C>T
NC_000007.14:g.55181220del
NC_000023.11:g.32386323delinsGA
NC_000013.11:g.32936732=
NC_000013.11:g.19993838_19993839dup


In [7]:
# Performing the same operation as above, but this time showing the extra features that are available.
# The validate argument is used to validate the SPDI expression using the NCBI API.
# TODO: not sure if we should implement this, because it is not a very good validator compared to the one I have created.
# TODO: Discuss with Bob

# The output_format argument specifies the output format of the HGVS expression.
# The default output format is 'string', which returns a string.
# The other option is 'parse', which uses the hgvs package from biocommons to return an hgvs parse object.
spdi_expression_1 = SPDI(sequence = 'NC_000007.14', position = '55181230', deletion = '', insertion = 'GGCT').to_string()

hgvsExpression = spdiTranslate.from_spdi_to_rightshift_hgvs(spdi_expression_1,validate = True,output_format='parse')

# hgvsExpression = spdiTranslate.from_spdi_to_rightshift_hgvs(spdi_expression_1,validate = False,output_format='string')

hgvsExpression

SequenceVariant(ac=NC_000007.14, type=g, posedit=55181230_55181231insGGCT, gene=None)

In [8]:
# Converting the SPDI string into a VRS object.

# With validation and output_format arguments.
# The output_format argument specifies the output format of the VRS expression.
# The default output format is 'obj', which returns a VRS object.
# NOTE(review): the alternatives were previously described as "string and dictionary", but the calls in this
# notebook pass 'dict' and 'json' — confirm the accepted values against spdi_utils.py.

vrsExpression = spdiTranslate.from_spdi_to_vrs(spdi_expression_1,validate=True,output_format='obj')

# vrsExpression = spdiTranslate.from_spdi_to_vrs(spdi_expression_1,validate=True,output_format='dict')

# vrsExpression= spdiTranslate.from_spdi_to_vrs(spdi_expression_1,validate=True,output_format='json')

vrsExpression

<Allele _id=<Literal<str> ga4gh:VA.JKGCs07cFu2wlDydCAe2ea06jMFXyK56> location=<SequenceLocation _id=<Literal<str> ga4gh:VSL.SdvAZCNKh5kf6ClsiOOmw_88fbkFPTqG> interval=<SequenceInterval end=<Number type=<Literal<str> Number> value=<Literal<int> 55181230>> start=<Number type=<Literal<str> Number> value=<Literal<int> 55181230>> type=<Literal<str> SequenceInterval>> sequence_id=<Literal<str> ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul> type=<Literal<str> SequenceLocation>> state=<LiteralSequenceExpression sequence=<Literal<str> GGCT> type=<Literal<str> LiteralSequenceExpression>> type=<Literal<str> Allele>>

In [9]:
from src.core_variant_translate import CVCTranslator
cvcTranslator = CVCTranslator()

In [10]:
# Translating each SPDI string with CVCTranslator.spdi_to_cvc and printing the result, one per line.
for spdi in spdiExamples: 
    print(cvcTranslator.spdi_to_cvc(spdi))

CoreVariantClass(0-based interbase,DNA,,GGCT,55181230,55181230,None,None,None,None,None,NC_000007.14,{})
CoreVariantClass(0-based interbase,DNA,C,T,44908821,44908822,None,None,None,None,None,NC_000019.10,{})
CoreVariantClass(0-based interbase,DNA,T,,55181219,55181220,None,None,None,None,None,NC_000007.14,{})
CoreVariantClass(0-based interbase,DNA,T,GA,32386322,32386323,None,None,None,None,None,NC_000023.11,{})
CoreVariantClass(0-based interbase,DNA,C,C,32936731,32936732,None,None,None,None,None,NC_000013.11,{})
CoreVariantClass(0-based interbase,DNA,GT,GTGT,19993837,19993839,None,None,None,None,None,NC_000013.11,{})


#### Additional SPDI expressions that you don't typically see.

In [11]:
# SPDI examples, written as sequence:position:deletion:insertion strings.
# Both contain W, which is an IUPAC nucleotide code rather than one of A/C/G/T.
ex1 = 'NC_000017.11:83129587:TTGWCACATGATTG:TTG'
ex2 = 'NC_000003.12:16894810:W:'

In [12]:
# One thing to point out: the validation steps were adjusted in order to follow IUPAC nucleotide codes.
# Citation:  https://www.bioinformatics.org/sms/iupac.html

# Creating a SPDI object
# NOTE(review): the ex1 string reads deletion='TTGWCACATGATTG', insertion='TTG', but the object below is
# built with those two values the other way round — confirm which direction is intended.
ex1SPDI = SPDI(sequence='NC_000017.11',position='83129587',deletion='TTG',insertion='TTGWCACATGATTG')
ex1SPDI

# String form of the object; this is what the translation cells below consume.
spdi_example_1 = ex1SPDI.to_string()

In [13]:
# Translating the example 1 SPDI string to a right-shifted HGVS expression.
hgvsTranslate_example1 = spdiTranslate.from_spdi_to_rightshift_hgvs(expression=spdi_example_1)
hgvsTranslate_example1

'NC_000017.11:g.83129591_83129601dup'

In [14]:
# Translating the example 1 SPDI string to VRS, requested as a dictionary via output_format='dict'.
vrsTranslate_example1 = spdiTranslate.from_spdi_to_vrs(expression=spdi_example_1,output_format='dict')
vrsTranslate_example1

{'_id': 'ga4gh:VA.3sJk3nkezo47eeRB90p8e2JoDKGN5GQ-',
 'type': 'Allele',
 'location': {'_id': 'ga4gh:VSL.8eHx9-4dIsZIUD6Od9WU-a06qNJKi8ra',
  'type': 'SequenceLocation',
  'sequence_id': 'ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7',
  'interval': {'type': 'SequenceInterval',
   'start': {'type': 'Number', 'value': 83129587},
   'end': {'type': 'Number', 'value': 83129601}}},
 'state': {'type': 'LiteralSequenceExpression',
  'sequence': 'TTGWCACATGATTGWCACATGATTG'}}

In [15]:
# Error occurs in the validation, but these are truly SPDI expressions.
# NOTE(review): the earlier ex2 string 'NC_000003.12:16894810:W:' has deletion='W' and an empty insertion,
# whereas the object below has an empty deletion and insertion='W' — confirm which direction is intended.
ex2SPDI = SPDI(sequence='NC_000003.12',position='16894810',deletion='',insertion='W')

# String form of the object; this is what the translation cells below consume.
spdi_example_2 = ex2SPDI.to_string()

In [16]:
# Translating the example 2 SPDI string to a right-shifted HGVS expression.
hgvsTranslate_example2 = spdiTranslate.from_spdi_to_rightshift_hgvs(expression=spdi_example_2)
hgvsTranslate_example2

'NC_000003.12:g.16894811dup'

In [17]:
# Translating the example 2 SPDI string to VRS, requested as a dictionary via output_format='dict'.
vrsTranslate_example2 = spdiTranslate.from_spdi_to_vrs(expression=spdi_example_2,output_format='dict')
vrsTranslate_example2

{'_id': 'ga4gh:VA.qzSUI3aF8bcV0T-IgQF9pgSXmb6YrYv7',
 'type': 'Allele',
 'location': {'_id': 'ga4gh:VSL.-gPFR8xJpy5aWeQS6BzfMQOUe2LsJeb-',
  'type': 'SequenceLocation',
  'sequence_id': 'ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX',
  'interval': {'type': 'SequenceInterval',
   'start': {'type': 'Number', 'value': 16894810},
   'end': {'type': 'Number', 'value': 16894811}}},
 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'WW'}}

In [18]:
from src.spdi.spdi_normalize import VocaNormalizeSpdi
vnormspdi = VocaNormalizeSpdi()

# spdi_voca_normalize is called below with two kinds of input: an SPDI object and an SPDI string.
# Each input gets its own name so that no variable changes type within the cell and the
# ex1/ex2 examples defined earlier in the notebook are not overwritten.

# 1) SPDI object input
spdi_object_example = SPDI(sequence='NC_000007.14', position='55181230', deletion='', insertion='GGCT')
print(f'SPDI object Example: {vnormspdi.spdi_voca_normalize(spdi_object_example)}')

# 2) SPDI string input
spdi_string_example = 'NC_000003.12:195783878:GGAAGTGT:GGAAGTGTTGGTGACATGAAGAGGGGTGGCATGACCTGTGGATACTGAGGAAGTGT' #'NC_000007.14:55181230::GGCT'
print(f'String Example: {vnormspdi.spdi_voca_normalize(spdi_string_example)}')


SPDI object Example: NC_000007.14:55181230::GGCT
String Example: NC_000003.12:195783878:GGAAGTGT:GGAAGTGTTGGTGACATGAAGAGGGGTGGCATGACCTGTGGATACTGAGGAAGTGT


In [19]:
from src.spdi.spdi_class import SPDI
from src.api.seqrepo_api import SeqRepoAPI
# SeqRepoAPI is pointed at a hosted SeqRepo endpoint; its dp attribute is what
# voca_normalize below hands to SequenceProxy to read sequence data.
cn = SeqRepoAPI("https://services.genomicmedlab.org/seqrepo")
dp = cn.dp    
from bioutils.normalize import normalize, NormalizationMode
from ga4gh.vrs.dataproxy import SequenceProxy


def voca_normalize(expression: "SPDI") -> "SPDI":
    """Normalize an SPDI object in place with bioutils' EXPAND mode and return it.

    Args:
        expression: Object exposing ``sequence``, ``position``, ``deletion`` and
            ``insertion`` attributes (an ``SPDI`` instance in this notebook).
            ``deletion`` may hold either the deleted bases or a string of digits
            giving the number of deleted bases.

    Returns:
        The same object that was passed in. Its ``position``, ``deletion`` and
        ``insertion`` attributes are overwritten with the normalized values.

    Raises:
        ValueError: If ``position`` cannot be converted to an integer.
    """
    start_pos = int(expression.position)

    # Only the length of the deletion is needed to locate the end of the interval:
    # a digit string is taken as the length itself, anything else is measured.
    try:
        deleted_length = int(expression.deletion)
    except ValueError:
        deleted_length = len(expression.deletion)
    end_pos = start_pos + deleted_length

    # Sequence view backed by the notebook-level data proxy ``dp``.
    sequence = SequenceProxy(dp, expression.sequence)

    interval, new_allele = normalize(
        sequence,
        interval=(start_pos, end_pos),
        # No explicit reference allele is supplied; presumably bioutils derives it
        # from ``sequence`` — confirm against the bioutils documentation.
        alleles=(None, expression.insertion),
        bounds=(0, len(sequence)),
        mode=NormalizationMode.EXPAND,
    )

    # Positions are strings everywhere else in this notebook (constructor input and
    # to_dict() output), so store the normalized start as a string as well.
    expression.position = str(interval[0])
    expression.deletion = new_allele[0]
    expression.insertion = new_allele[1]

    return expression


# Alternative inputs that can be swapped in for the active example below:
# expression = SPDI(sequence = 'NC_000007.14', position = '55181230', deletion = '', insertion = 'GGCT')
# expression = SPDI(sequence ='NC_000007.14',position= '55181219',deletion='T',insertion='')
# expression = SPDI(sequence='NC_000013.11',position='19993837',deletion = 'GT',insertion='GTGT')     #{'sequence': 'NC_000013.11', 'position': '19993837', 'deletion': 'GT', 'insertion': 'GTGT'}
# expression = SPDI(sequence='NC_000023.11',position = '32386322',deletion = 'T', insertion='GA')#    {'sequence': 'NC_000023.11', 'position': '32386322', 'deletion': 'T', 'insertion': 'GA'},
# expression = SPDI('NM_004006.2','5933','AAAAAAAA','AAAAAAA')
# Active example, equivalent to the SPDI string:
# 'NC_000003.12:195783878:GGAAGTGT:GGAAGTGTTGGTGACATGAAGAGGGGTGGCATGACCTGTGGATACTGAGGAAGTGT'
expression = SPDI('NC_000003.12','195783878','GGAAGTGT','GGAAGTGTTGGTGACATGAAGAGGGGTGGCATGACCTGTGGATACTGAGGAAGTGT')
# voca_normalize modifies and returns the object it is given, so `test` and `expression` are the same object.
test = voca_normalize(expression)
test.to_string()

'NC_000003.12:195783878:GGAAGTGT:GGAAGTGTTGGTGACATGAAGAGGGGTGGCATGACCTGTGGATACTGAGGAAGTGT'