In [1]:
# Environment setup -- run this cell first when working in a GitHub Codespace.
import sys
import warnings

# Suppress every warning category so the demo output below stays uncluttered.
warnings.filterwarnings('ignore')

# Put the parent directory on the module search path so the `src` and
# `database` packages imported in the next cell can be resolved.
sys.path.append('..')

In [2]:
import json
from src.vrs.vrs_utils import VrsTranslate
from database.db_operation import DbOperation
from src.variant_to_cvc_translate import CVCTranslatorTo


In [3]:
# Location of the gold standard SQLite file, relative to this notebook.
GOLD_STANDARD_DB_PATH = '../database/gsdb_v2.db'

# Translator used below for the VRS -> SPDI and VRS -> HGVS examples.
vrs_translate = VrsTranslate()

# Translator used below for the VRS -> CVC example (class CVCTranslatorTo).
cvc_translator_to = CVCTranslatorTo()

# Open the gold standard database.
db = DbOperation(GOLD_STANDARD_DB_PATH)

# Load the combined database contents into a dataframe
# (presumably a pandas DataFrame, going by the method name -- confirm in db_operation).
df = db.get_combined_in_df()

### vrs_utils Module Overview

The `vrs_utils` module facilitates the translation of VRS expressions to SPDI and HGVS formats. It utilizes external APIs for translation and validation, along with the `vrs-python` translator module.

#### Features
- **Translation to HGVS**: Translates VRS expressions to HGVS using the `vrs-python` translator module.
- **Translation to SPDI**: Translates VRS expressions to SPDI using the `vrs-python` translator module.

#### Dependencies
- **External APIs**:
  - Biocommons SeqRepo API
- **Python Packages**:
  - vrs-python


### Capturing Example Data From Gold Standard Database

In [4]:
# Pull the 'vrs' values out of the dataframe ...
all_vrs_values = db.extract_values(df, 'vrs')

# ... and keep only the first three as example data for the cells below.
vrs_data = all_vrs_values[:3]

### Performing translations from VRS to SPDI, HGVS, and CVC

In [5]:
# For each example allele: show the VRS dictionary as indented JSON,
# then the SPDI string produced from it.
# NOTE(review): validate=True is forwarded to from_vrs_to_spdi; presumably it
# enables validation of the translation -- confirm in vrs_utils.
for allele in vrs_data:
    vrs_block = f'VRS Expression:\n{json.dumps(allele, indent=2)}'
    print(vrs_block)
    spdi_line = f'Translated to SPDI: {vrs_translate.from_vrs_to_spdi(allele,validate=True)}\n'
    print(spdi_line)

VRS Expression:
{
  "_id": "ga4gh:VA.BmF3zr2l6XLpLaK8GInM6Q3Emc3JyPD3",
  "type": "Allele",
  "location": {
    "_id": "ga4gh:VSL.i6Of9s2jVDuJ4vwU6sCeG-jT7ygmlfx6",
    "type": "SequenceLocation",
    "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
    "interval": {
      "type": "SequenceInterval",
      "start": {
        "type": "Number",
        "value": 1014263
      },
      "end": {
        "type": "Number",
        "value": 1014265
      }
    }
  },
  "state": {
    "type": "LiteralSequenceExpression",
    "sequence": "C"
  }
}
Translated to SPDI: NC_000001.11:1014263:CC:C

VRS Expression:
{
  "_id": "ga4gh:VA.5CfKpT5tErBj7PNtqdub7VOliwHEQLvs",
  "type": "Allele",
  "location": {
    "_id": "ga4gh:VSL.z-j8mH9v1lJf-MsQosxg_8gtRA1zKhuE",
    "type": "SequenceLocation",
    "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
    "interval": {
      "type": "SequenceInterval",
      "start": {
        "type": "Number",
        "value": 15445654
      },
      "

In [6]:
# For each example allele: show the VRS dictionary as indented JSON,
# then the HGVS expression string produced from it.
for allele in vrs_data:
    vrs_block = f'VRS Expression:\n{json.dumps(allele, indent=2)}'
    print(vrs_block)
    hgvs_line = f'Translated to HGVS: {vrs_translate.from_vrs_to_hgvs(allele)}\n'
    print(hgvs_line)

VRS Expression:
{
  "_id": "ga4gh:VA.BmF3zr2l6XLpLaK8GInM6Q3Emc3JyPD3",
  "type": "Allele",
  "location": {
    "_id": "ga4gh:VSL.i6Of9s2jVDuJ4vwU6sCeG-jT7ygmlfx6",
    "type": "SequenceLocation",
    "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
    "interval": {
      "type": "SequenceInterval",
      "start": {
        "type": "Number",
        "value": 1014263
      },
      "end": {
        "type": "Number",
        "value": 1014265
      }
    }
  },
  "state": {
    "type": "LiteralSequenceExpression",
    "sequence": "C"
  }
}
Translated to HGVS: NC_000001.11:g.1014265del

VRS Expression:
{
  "_id": "ga4gh:VA.5CfKpT5tErBj7PNtqdub7VOliwHEQLvs",
  "type": "Allele",
  "location": {
    "_id": "ga4gh:VSL.z-j8mH9v1lJf-MsQosxg_8gtRA1zKhuE",
    "type": "SequenceLocation",
    "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
    "interval": {
      "type": "SequenceInterval",
      "start": {
        "type": "Number",
        "value": 15445654
      },
      "

In [7]:
from ga4gh.vrs import models

# Translation from VRS to CVC is done by the variant_to_cvc_translate.py module
# (the CVCTranslatorTo instance created earlier in this notebook).
# Each VRS dictionary is first turned into a models.Allele object, because that
# is what gets passed to vrs_to_cvc here.
for allele in vrs_data:
    vrs_block = f'VRS Expression:\n{json.dumps(allele, indent=2)}'
    print(vrs_block)
    cvc_block = f'Translated to CVC:\n{cvc_translator_to.vrs_to_cvc(models.Allele(**allele))}\n'
    print(cvc_block)


VRS Expression:
{
  "_id": "ga4gh:VA.BmF3zr2l6XLpLaK8GInM6Q3Emc3JyPD3",
  "type": "Allele",
  "location": {
    "_id": "ga4gh:VSL.i6Of9s2jVDuJ4vwU6sCeG-jT7ygmlfx6",
    "type": "SequenceLocation",
    "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
    "interval": {
      "type": "SequenceInterval",
      "start": {
        "type": "Number",
        "value": 1014263
      },
      "end": {
        "type": "Number",
        "value": 1014265
      }
    }
  },
  "state": {
    "type": "LiteralSequenceExpression",
    "sequence": "C"
  }
}
Translated to CVC:
CoreVariantClass(0-based interbase,DNA,CC,C,1014263,1014265,None,None,None,None,None,NC_000001.11,{})

VRS Expression:
{
  "_id": "ga4gh:VA.5CfKpT5tErBj7PNtqdub7VOliwHEQLvs",
  "type": "Allele",
  "location": {
    "_id": "ga4gh:VSL.z-j8mH9v1lJf-MsQosxg_8gtRA1zKhuE",
    "type": "SequenceLocation",
    "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
    "interval": {
      "type": "SequenceInterval",
      "sta