In [1]:
# Run this cell first (needed in GitHub Codespaces, where the notebook is
# executed from its own directory): it puts the parent directory on the
# module search path so the `src.*` imports in later cells resolve.
import sys
import warnings

# Guard the append so re-running this cell does not stack duplicate
# '..' entries on sys.path.
if '..' not in sys.path:
    sys.path.append('..')

# Silence all warnings to keep the example output readable.
warnings.filterwarnings('ignore')

In [9]:
import json

from src.vrs.vrs_utils import VrsTranslate
from ga4gh.vrs import models

# Single translator instance shared by the SPDI and HGVS translation examples below.
vrs_translate = VrsTranslate()

### vrs_utils Module Overview

The `vrs_utils` module translates VRS expressions into SPDI and HGVS formats. It relies on the vrs-python translator module and on external APIs for translation and validation.

#### Features
- **Translation to HGVS**: Translates VRS expressions to HGVS using the vrs-python translator module.
- **Translation to SPDI**: Translates VRS expressions to SPDI using the vrs-python translator module.

#### Dependencies
- **External APIs**:
  - Biocommons SeqRepo API
- **Python Packages**:
  - vrs-python


#### Example Data

In [3]:
from src.database.db_operation import DbOperation

# Connect to the gold standard database
db = DbOperation('../src/database/gsdb.db')

# Load the combined database contents into a pandas DataFrame
df = db.get_combined_in_df()

# Extract the 'vrs' values from the DataFrame, keeping only the first three as examples
vrs_data = db.extract_values(df,'vrs')[:3]

# Build a VRS Allele model from each extracted record (the record is unpacked as keyword arguments)
vrs_objects = [models.Allele(**data) for data in vrs_data]
# Display the resulting Allele objects as the cell output
vrs_objects

[<Allele _id=<Literal<str> ga4gh:VA.BmF3zr2l6XLpLaK8GInM6Q3Emc3JyPD3> location=<SequenceLocation _id=<Literal<str> ga4gh:VSL.i6Of9s2jVDuJ4vwU6sCeG-jT7ygmlfx6> interval=<SequenceInterval end=<Number type=<Literal<str> Number> value=<Literal<int> 1014265>> start=<Number type=<Literal<str> Number> value=<Literal<int> 1014263>> type=<Literal<str> SequenceInterval>> sequence_id=<Literal<str> ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO> type=<Literal<str> SequenceLocation>> state=<LiteralSequenceExpression sequence=<Literal<str> C> type=<Literal<str> LiteralSequenceExpression>> type=<Literal<str> Allele>>,
 <Allele _id=<Literal<str> ga4gh:VA.5CfKpT5tErBj7PNtqdub7VOliwHEQLvs> location=<SequenceLocation _id=<Literal<str> ga4gh:VSL.z-j8mH9v1lJf-MsQosxg_8gtRA1zKhuE> interval=<SequenceInterval end=<Number type=<Literal<str> Number> value=<Literal<int> 15445660>> start=<Number type=<Literal<str> Number> value=<Literal<int> 15445654>> type=<Literal<str> SequenceInterval>> sequence_id=<Literal<str> ga

In [4]:
# For every example Allele: show it as pretty-printed VRS JSON, then show
# the SPDI string produced by the translator.
for allele in vrs_objects:
    vrs_json = json.dumps(allele.as_dict(), indent=2)
    print(f'VRS Expression:\n{vrs_json}')

    spdi_expression = vrs_translate.from_vrs_to_spdi(allele)
    print(f'Translated to SPDI: {spdi_expression}\n')

VRS Expression:
{
  "_id": "ga4gh:VA.BmF3zr2l6XLpLaK8GInM6Q3Emc3JyPD3",
  "type": "Allele",
  "location": {
    "_id": "ga4gh:VSL.i6Of9s2jVDuJ4vwU6sCeG-jT7ygmlfx6",
    "type": "SequenceLocation",
    "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
    "interval": {
      "type": "SequenceInterval",
      "start": {
        "type": "Number",
        "value": 1014263
      },
      "end": {
        "type": "Number",
        "value": 1014265
      }
    }
  },
  "state": {
    "type": "LiteralSequenceExpression",
    "sequence": "C"
  }
}
Translated to SPDI: NC_000001.11:1014263:2:C

VRS Expression:
{
  "_id": "ga4gh:VA.5CfKpT5tErBj7PNtqdub7VOliwHEQLvs",
  "type": "Allele",
  "location": {
    "_id": "ga4gh:VSL.z-j8mH9v1lJf-MsQosxg_8gtRA1zKhuE",
    "type": "SequenceLocation",
    "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
    "interval": {
      "type": "SequenceInterval",
      "start": {
        "type": "Number",
        "value": 15445654
      },
      "e

In [5]:
# For every example Allele object: show it as pretty-printed VRS JSON, then
# show the HGVS expression produced by the translator.
for allele in vrs_objects:
    vrs_json = json.dumps(allele.as_dict(), indent=2)
    print(f'VRS Expression:\n{vrs_json}')

    hgvs_expression = vrs_translate.from_vrs_to_hgvs(allele)
    print(f'Translated to HGVS: {hgvs_expression}\n')

VRS Expression:
{
  "_id": "ga4gh:VA.BmF3zr2l6XLpLaK8GInM6Q3Emc3JyPD3",
  "type": "Allele",
  "location": {
    "_id": "ga4gh:VSL.i6Of9s2jVDuJ4vwU6sCeG-jT7ygmlfx6",
    "type": "SequenceLocation",
    "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
    "interval": {
      "type": "SequenceInterval",
      "start": {
        "type": "Number",
        "value": 1014263
      },
      "end": {
        "type": "Number",
        "value": 1014265
      }
    }
  },
  "state": {
    "type": "LiteralSequenceExpression",
    "sequence": "C"
  }
}
Translated to HGVS: NC_000001.11:g.1014265del

VRS Expression:
{
  "_id": "ga4gh:VA.5CfKpT5tErBj7PNtqdub7VOliwHEQLvs",
  "type": "Allele",
  "location": {
    "_id": "ga4gh:VSL.z-j8mH9v1lJf-MsQosxg_8gtRA1zKhuE",
    "type": "SequenceLocation",
    "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
    "interval": {
      "type": "SequenceInterval",
      "start": {
        "type": "Number",
        "value": 15445654
      },
      "

### variant_to_cvc_translate Module Overview

The `variant_to_cvc_translate` module offers functionality for translating variations from HGVS, SPDI, or VRS formats into a standardized representation known as `CoreVariantClass`.

#### Features
- **SPDI to CoreVariantClass Translation**: Translates SPDI expressions into CoreVariantClass objects.

- **HGVS to CoreVariantClass Translation**: Translates HGVS expressions into CoreVariantClass objects.

- **VRS to CoreVariantClass Translation**: Translates VRS expressions into CoreVariantClass objects.

#### Dependencies
- **External APIs**:
  - Biocommons SeqRepo API
  - NCBI Variation Services API

- **Python Packages**:
  - bioutils.normalize
  - hgvs

In [6]:
from src.variant_to_cvc_translate import CVCTranslator
# Translator instance used by the VRS -> CoreVariantClass example in the next cell.
cvc_translator = CVCTranslator()

In [7]:
# For every example Allele: show it as pretty-printed VRS JSON, then show
# the CoreVariantClass object produced by the CVC translator.
for allele in vrs_objects:
    vrs_json = json.dumps(allele.as_dict(), indent=2)
    print(f'VRS Expression:\n{vrs_json}')

    cvc_variant = cvc_translator.vrs_to_cvc(allele)
    print(f'Translated to CVC:\n{cvc_variant}\n')

VRS Expression:
{
  "_id": "ga4gh:VA.BmF3zr2l6XLpLaK8GInM6Q3Emc3JyPD3",
  "type": "Allele",
  "location": {
    "_id": "ga4gh:VSL.i6Of9s2jVDuJ4vwU6sCeG-jT7ygmlfx6",
    "type": "SequenceLocation",
    "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
    "interval": {
      "type": "SequenceInterval",
      "start": {
        "type": "Number",
        "value": 1014263
      },
      "end": {
        "type": "Number",
        "value": 1014265
      }
    }
  },
  "state": {
    "type": "LiteralSequenceExpression",
    "sequence": "C"
  }
}
Translated to CVC:
CoreVariantClass(0-based interbase,DNA,CC,C,1014263,1014265,None,None,None,None,None,NC_000001.11,{})

VRS Expression:
{
  "_id": "ga4gh:VA.5CfKpT5tErBj7PNtqdub7VOliwHEQLvs",
  "type": "Allele",
  "location": {
    "_id": "ga4gh:VSL.z-j8mH9v1lJf-MsQosxg_8gtRA1zKhuE",
    "type": "SequenceLocation",
    "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
    "interval": {
      "type": "SequenceInterval",
      "sta