In [1]:
# Run this cell first (written for GitHub Codespaces).
# It puts the parent directory on the import path so that the `database` and
# `src` packages imported by later cells can be resolved.
import sys
import warnings

# Guard the append so that re-running this cell does not stack duplicate
# '..' entries on sys.path.
if '..' not in sys.path:
    sys.path.append('..')
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')


In [2]:
from database.db_operation import DbOperation

### Goldstandard Database Notebook

This notebook will demonstrate the usage of the Goldstandard database (GSDB). GSDB is composed of 3 tables with the following schema:

**Expression:**
- id
- variation_id 
- profile_id
- description
- value

**Profile:**
- id
- name
- version
- description

**Variation:**
- id
- xref
- description

A fourth, virtual table (a view) called CombineData combines the three tables. It includes Profile.name, Profile.version, Variation.xref, and Expression.value. This view is represented as a pandas DataFrame and is used for the various translations shown below.

**Note:** This database is in the alpha stage, so its schema and data may change.



#### Example Database

In [3]:
# Open the gold standard database file.
gsdb_path = '../database/gsdb_v2.db'
db = DbOperation(gsdb_path)

# Load the combined view of the database as a pandas DataFrame.
df = db.get_combined_in_df()

In [4]:
# Preview the first five rows of the combined DataFrame.
df.head(n=5)

Unnamed: 0,name,version,description,xref,value
0,cvc,0.1,Deletion,https://www.ncbi.nlm.nih.gov/clinvar/variation...,"{'origCoordSystem': '0-based interbase', 'seqT..."
1,spdi,0.1,Deletion,https://www.ncbi.nlm.nih.gov/clinvar/variation...,NC_000001.11:1014263:CC:C
2,hgvs,0.1,Deletion,https://www.ncbi.nlm.nih.gov/clinvar/variation...,NC_000001.11:g.1014265del
3,vrs,1.3,Deletion,https://www.ncbi.nlm.nih.gov/clinvar/variation...,{'_id': 'ga4gh:VA.BmF3zr2l6XLpLaK8GInM6Q3Emc3J...
4,cvc,0.1,Deletion,https://www.ncbi.nlm.nih.gov/clinvar/variation...,"{'origCoordSystem': '0-based interbase', 'seqT..."


### cvc_to_variant_translate Module Overview

The `cvc_to_variant_translate` module offers functionality for translating variations from `CoreVariantClass` into HGVS, SPDI, or VRS formats.

#### Features

- **CoreVariantClass to HGVS Translation**: Translates CVC expressions into HGVS.

- **CoreVariantClass to SPDI Translation**: Translates CVC expressions into SPDI.

- **CoreVariantClass to VRS Translation**: Translates CVC expressions into VRS.

#### Dependencies
- **External APIs**:
  - Biocommons SeqRepo API
  - NCBI Variation Services API

- **Python Packages**:
  - bioutils.normalize
  - hgvs

In [5]:
from src.core_variant import CoreVariantClass

# Import the module that performs the translations from CVC to HGVS, SPDI, and VRS.
from src.cvc_to_variant_translate import ToTranslate
# Translator instance used by the cells below through its cvc_to_hgvs,
# cvc_to_spdi, and cvc_to_vrs methods.
cvc_to = ToTranslate()

In [6]:
# Below we demonstrate the methods that translate a CoreVariantClass (CVC)
# into HGVS, SPDI, and VRS expressions.
# This cell takes one CVC example from the DataFrame and translates it into
# each of the three expressions.
example = db.extract_values(df,'cvc')[:1]


for data in example:
    print(f'CVC Expression: {data}\n')
    # Build the CoreVariantClass once and reuse it for all three translations
    # instead of constructing an identical object for every call.
    variant = CoreVariantClass(**data)
    print(f'Translated to HGVS: {cvc_to.cvc_to_hgvs(variant)}\n')
    print(f'Translated to SPDI: {cvc_to.cvc_to_spdi(variant)}\n')
    print(f'Translated to VRS: {cvc_to.cvc_to_vrs(variant).as_dict()}\n')

CVC Expression: {'origCoordSystem': '0-based interbase', 'seqType': 'DNA', 'refAllele': 'CC', 'altAllele': 'C', 'start': 1014263, 'end': 1014265, 'allelicState': None, 'geneSymbol': None, 'hgncId': None, 'chrom': None, 'genomeBuild': None, 'sequenceId': 'NC_000001.11'}

Translated to HGVS: NC_000001.11:g.1014265del

Translated to SPDI: NC_000001.11:1014263:CC:C

Translated to VRS: {'_id': 'ga4gh:VA.BmF3zr2l6XLpLaK8GInM6Q3Emc3JyPD3', 'type': 'Allele', 'location': {'_id': 'ga4gh:VSL.i6Of9s2jVDuJ4vwU6sCeG-jT7ygmlfx6', 'type': 'SequenceLocation', 'sequence_id': 'ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO', 'interval': {'type': 'SequenceInterval', 'start': {'type': 'Number', 'value': 1014263}, 'end': {'type': 'Number', 'value': 1014265}}}, 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'C'}}



#### More Example Data

In [7]:
# Pull the CVC values out of the DataFrame and keep the first three as examples.
all_cvc_values = db.extract_values(df, 'cvc')
cvc_data = all_cvc_values[:3]

In [8]:
# Translate each CVC example into an HGVS expression.
for data in cvc_data:
    print(f'CVC Expression: {data}')
    # Build the variant after echoing the input, then translate it.
    variant = CoreVariantClass(**data)
    hgvs_expression = cvc_to.cvc_to_hgvs(variant)
    print(f'Translated to HGVS: {hgvs_expression}\n')


CVC Expression: {'origCoordSystem': '0-based interbase', 'seqType': 'DNA', 'refAllele': 'CC', 'altAllele': 'C', 'start': 1014263, 'end': 1014265, 'allelicState': None, 'geneSymbol': None, 'hgncId': None, 'chrom': None, 'genomeBuild': None, 'sequenceId': 'NC_000001.11'}
Translated to HGVS: NC_000001.11:g.1014265del

CVC Expression: {'origCoordSystem': '0-based interbase', 'seqType': 'DNA', 'refAllele': 'GCATCG', 'altAllele': 'G', 'start': 15445654, 'end': 15445660, 'allelicState': None, 'geneSymbol': None, 'hgncId': None, 'chrom': None, 'genomeBuild': None, 'sequenceId': 'NC_000001.11'}
Translated to HGVS: NC_000001.11:g.15445656_15445660del

CVC Expression: {'origCoordSystem': '0-based interbase', 'seqType': 'DNA', 'refAllele': 'AG', 'altAllele': '', 'start': 1510945, 'end': 1510947, 'allelicState': None, 'geneSymbol': None, 'hgncId': None, 'chrom': None, 'genomeBuild': None, 'sequenceId': 'NC_000016.10'}
Translated to HGVS: NC_000016.10:g.1510946_1510947del



In [9]:
# Translate each CVC example into a SPDI expression.
for data in cvc_data:
    print(f'CVC Expression: {data}')
    # Build the variant after echoing the input, then translate it.
    variant = CoreVariantClass(**data)
    spdi_expression = cvc_to.cvc_to_spdi(variant)
    print(f'Translated to SPDI: {spdi_expression}\n')


CVC Expression: {'origCoordSystem': '0-based interbase', 'seqType': 'DNA', 'refAllele': 'CC', 'altAllele': 'C', 'start': 1014263, 'end': 1014265, 'allelicState': None, 'geneSymbol': None, 'hgncId': None, 'chrom': None, 'genomeBuild': None, 'sequenceId': 'NC_000001.11'}
Translated to SPDI: NC_000001.11:1014263:CC:C

CVC Expression: {'origCoordSystem': '0-based interbase', 'seqType': 'DNA', 'refAllele': 'GCATCG', 'altAllele': 'G', 'start': 15445654, 'end': 15445660, 'allelicState': None, 'geneSymbol': None, 'hgncId': None, 'chrom': None, 'genomeBuild': None, 'sequenceId': 'NC_000001.11'}
Translated to SPDI: NC_000001.11:15445654:GCATCG:G

CVC Expression: {'origCoordSystem': '0-based interbase', 'seqType': 'DNA', 'refAllele': 'AG', 'altAllele': '', 'start': 1510945, 'end': 1510947, 'allelicState': None, 'geneSymbol': None, 'hgncId': None, 'chrom': None, 'genomeBuild': None, 'sequenceId': 'NC_000016.10'}
Translated to SPDI: NC_000016.10:1510945:AG:



In [10]:
# Translate each CVC example into a VRS expression, shown as a dictionary.
for data in cvc_data:
    print(f'CVC Expression: {data}')
    # Build the variant after echoing the input, then translate it.
    variant = CoreVariantClass(**data)
    vrs_as_dict = cvc_to.cvc_to_vrs(variant).as_dict()
    print(f'Translated to VRS: {vrs_as_dict}\n')


CVC Expression: {'origCoordSystem': '0-based interbase', 'seqType': 'DNA', 'refAllele': 'CC', 'altAllele': 'C', 'start': 1014263, 'end': 1014265, 'allelicState': None, 'geneSymbol': None, 'hgncId': None, 'chrom': None, 'genomeBuild': None, 'sequenceId': 'NC_000001.11'}
Translated to VRS: {'_id': 'ga4gh:VA.BmF3zr2l6XLpLaK8GInM6Q3Emc3JyPD3', 'type': 'Allele', 'location': {'_id': 'ga4gh:VSL.i6Of9s2jVDuJ4vwU6sCeG-jT7ygmlfx6', 'type': 'SequenceLocation', 'sequence_id': 'ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO', 'interval': {'type': 'SequenceInterval', 'start': {'type': 'Number', 'value': 1014263}, 'end': {'type': 'Number', 'value': 1014265}}}, 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'C'}}

CVC Expression: {'origCoordSystem': '0-based interbase', 'seqType': 'DNA', 'refAllele': 'GCATCG', 'altAllele': 'G', 'start': 15445654, 'end': 15445660, 'allelicState': None, 'geneSymbol': None, 'hgncId': None, 'chrom': None, 'genomeBuild': None, 'sequenceId': 'NC_000001.11'}
Translate