In [None]:
from pathlib import Path
import pandas as pd
import json
from ga4gh.vrs.models import Allele

from ga4gh.vrs.models import Allele
from normalizers.allele_normalizer import AlleleNormalizer    
from translators.vrs_to_fhir import VrsToFhirAlleleTranslator 


# Shared helper instances created once for the cells below.
# NOTE(review): `normalize` is not referenced in the cells visible here — confirm it is still needed.
normalize = AlleleNormalizer()
# Translator instance that translate_alleles_to_fhir calls via translate_allele_to_fhir().
vrs_translator = VrsToFhirAlleleTranslator()


### Demonstration Overview: ClinVar to FHIR Allele Profiles

This notebook shows how ClinVar variation data (in categorical form) can be translated into **FHIR Allele Profiles**.  

#### Key features of this notebook:
- **Filter the data**: Pull records whose `"members"` include an `"Allele"`, then sample 20,000 of them to work with.  
- **Build objects**: Convert those records into VRS `Allele` objects.  
- **Explore**: Summarize the types of variation states in the sample.  
- **Translate**: Use the full translator to convert VRS Alleles into FHIR Allele Profiles.  
- **Save results**: Store both successful translations and any errors for review.  

> *Note: If you would like access to the output files, please contact the code owners.*  

In [None]:
# Source file: clinvar_gks_variation_2025_09_28_v2_4_3.jsonl.gz
# Step 1: Extracted JSON records where "members" contains type "Allele"
# Step 2: Exported 20,000 examples to allele_objects_20k_v2.jsonl

# Load the exported sample; lines=True parses the file as JSON Lines (one record per line).
df = pd.read_json("data/allele_objects_20k_v2.jsonl", lines=True)
# Display the number of rows that were loaded.
len(df)

20000

In [5]:
# Build a VRS Allele from the first "members" entry of every record
def extract_alleles(records):
    """Convert records into VRS ``Allele`` objects, collecting failures separately.

    For each record, the first element of its ``"members"`` list is unpacked
    as keyword arguments into ``Allele``. Any exception raised along the way
    (missing/empty ``"members"``, construction failure, ...) is captured
    instead of propagated.

    Args:
        records: Iterable of dict-like records exposing ``.get``.

    Returns:
        A ``(alleles, error_alleles)`` tuple. ``alleles`` holds the objects
        that were built; ``error_alleles`` holds one
        ``{"record": <record>, "error": <message>}`` dict per failed record.
    """
    built, failed = [], []

    for record in records:
        try:
            # Only the first member is used; indexing an empty list raises
            # and routes the record to the failure list.
            first_member = record.get("members", [])[0]
            allele = Allele(**first_member)
        except Exception as exc:
            failed.append({"record": record, "error": str(exc)})
        else:
            built.append(allele)

    return built, failed

# Turn each DataFrame row into a plain dict so extract_alleles can read its "members" field.
records = df.to_dict(orient="records")
alleles, error_alleles = extract_alleles(records)

# Number of records converted successfully; failed records are kept in error_alleles.
len(alleles)


20000

In [None]:
# Exploring one of the vrs alleles
alleles[0].model_dump(exclude_none=True)

# The fields shown here go beyond a minimal VRS Allele object (other fields
# are being translated as well), so we use the full translator rather than
# the minimal one.

{'id': 'ga4gh:VA.RYY2yzWCjihuu2hmqFWu8qg7aUB3aPwR',
 'type': 'Allele',
 'name': 'NC_000002.12:g.121530895G>C',
 'digest': 'RYY2yzWCjihuu2hmqFWu8qg7aUB3aPwR',
 'expressions': [{'syntax': 'spdi', 'value': 'NC_000002.12:121530894:G:C'},
  {'syntax': 'hgvs.g', 'value': 'NC_000002.12:g.121530895G>C'},
  {'syntax': 'gnomad', 'value': '2-121530895-G-C'}],
 'location': {'id': 'ga4gh:SL.3sVPpjle__QNFe8yGfLEja0R42VIZIp8',
  'type': 'SequenceLocation',
  'digest': '3sVPpjle__QNFe8yGfLEja0R42VIZIp8',
  'sequenceReference': {'type': 'SequenceReference',
   'name': 'NC_000002.12',
   'extensions': [{'name': 'assembly', 'value': 'GRCh38'}],
   'refgetAccession': 'SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g',
   'residueAlphabet': 'na',
   'moleculeType': 'genomic'},
  'start': 121530894,
  'end': 121530895},
 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'C'}}

In [14]:
# Looking at the different variations that we have in the 20K samples

def count_variation_states(alleles):
    """Tally alleles by the ``type`` of their ``state``.

    Each allele is dumped with ``model_dump(exclude_none=True)`` and the value
    at ``state.type`` is matched by substring against the two known
    expression names. Anything that matches neither, or whose dump is not a
    dict, is counted under ``"Other"``.

    Args:
        alleles: Iterable of objects exposing ``model_dump(exclude_none=...)``.

    Returns:
        Dict with the keys ``"ReferenceLengthExpression"``,
        ``"LiteralSequenceExpression"`` and ``"Other"`` mapped to counts.
    """
    tally = {
        "ReferenceLengthExpression": 0,
        "LiteralSequenceExpression": 0,
        "Other": 0,
    }
    # Checked in this order: the literal label is tested before the reference one.
    known_labels = ("LiteralSequenceExpression", "ReferenceLengthExpression")

    for allele in alleles:
        dumped = allele.model_dump(exclude_none=True)
        bucket = "Other"
        if isinstance(dumped, dict):
            state_type = dumped.get("state", {}).get("type", "")
            for label in known_labels:
                if label in state_type:
                    bucket = label
                    break
        tally[bucket] += 1

    return tally

# Tally the state types across the converted alleles and print the resulting dict.
counts = count_variation_states(alleles)
print(counts)


{'ReferenceLengthExpression': 1404, 'LiteralSequenceExpression': 18596, 'Other': 0}


In [None]:
# Translating vrs alleles into FHIR allele profiles and saving results
def _dump_for_report(vo):
    """Return ``vo.model_dump(exclude_none=True)``, or ``repr(vo)`` if dumping fails."""
    try:
        return vo.model_dump(exclude_none=True)
    except Exception:
        # The object that broke translation may not be dumpable either; keep
        # an error row for it instead of aborting the whole batch.
        return repr(vo)


def _write_jsonl(path, rows):
    """Write ``rows`` to ``path`` as UTF-8 JSON Lines, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        for row in rows:
            # default=str stringifies values json cannot encode natively.
            f.write(json.dumps(row, ensure_ascii=False, default=str) + "\n")


def translate_alleles_to_fhir(alleles, success_path, error_path, translator=None):
    """Translate VRS alleles into FHIR allele profiles and save the results.

    Every allele is passed to ``translator.translate_allele_to_fhir``. A
    failure on one allele is recorded and the loop continues with the next.

    Args:
        alleles: Iterable of allele objects exposing
            ``model_dump(exclude_none=...)``.
        success_path: ``str`` or ``Path`` of the JSONL file that receives the
            successful translations. Always (re)written.
        error_path: ``str`` or ``Path`` of the JSONL file that receives the
            failures. Written only when at least one failure occurred; when
            there are none, a leftover file from a previous run is removed so
            the file never describes a different run.
        translator: Object exposing ``translate_allele_to_fhir(allele)``.
            Defaults to the module-level ``vrs_translator``.

    Returns:
        ``(fhir_trans, errors)``: two lists of dicts. Success rows hold
        ``idx``, ``vrs_allele`` and ``fhir_allele``; error rows hold ``idx``,
        ``vrs_allele`` and ``error`` (the exception message).
    """
    if translator is None:
        translator = vrs_translator
    # Accept plain strings as well as Path objects.
    success_path = Path(success_path)
    error_path = Path(error_path)

    fhir_trans = []
    errors = []

    for i, vo in enumerate(alleles):
        try:
            ap = translator.translate_allele_to_fhir(vo)
            row = {
                "idx": i,
                "vrs_allele": vo.model_dump(exclude_none=True),
                "fhir_allele": ap.model_dump(exclude_none=True)
            }
        except Exception as e:
            errors.append({
                "idx": i,
                "vrs_allele": _dump_for_report(vo),
                "error": str(e)
            })
        else:
            fhir_trans.append(row)

    _write_jsonl(success_path, fhir_trans)

    if errors:
        _write_jsonl(error_path, errors)
    else:
        # Keep the invariant "error file exists only if this run had errors".
        error_path.unlink(missing_ok=True)

    print(f"Translated: {len(fhir_trans)}")
    print(f"Errors: {len(errors)}")

    return fhir_trans, errors

# Output locations: one JSONL file for successful translations, one for failures.
success_path = Path("data/vrs2fhir_success_v2.jsonl")
error_path   = Path("data/vrs2fhir_errors_v2.jsonl")

# Translate every allele; the function writes the files and prints the totals.
fhir_trans, errors = translate_alleles_to_fhir(alleles, success_path, error_path)


Translated: 20000
Errors: 0


## Conclusion

In this demonstration, we translated ClinVar variation data (in categorical form) into **FHIR Allele Profiles** using the full VRS-to-FHIR translator.  

- **Input file**: `data/allele_objects_20k_v2.jsonl`  
- **Output file (successes)**: `data/vrs2fhir_success_v2.jsonl`  
- **Output file (errors)**: `data/vrs2fhir_errors_v2.jsonl` (written only when at least one translation fails)  

Below is a small sample (5 rows) from the output file for quick exploration.

In [17]:
import pandas as pd

# Load just 5 samples from the success file.
# nrows stops parsing after the first five lines, so the full output file is
# not read into memory just to preview it.
df_out = pd.read_json("data/vrs2fhir_success_v2.jsonl", lines=True, nrows=5)

In [24]:
# Column guide for the preview displayed below:
# idx: Row index for reference
# vrs_allele: The input, a cleaned VRS Allele object derived from the ClinVar categorical variant dataset
# fhir_allele: The output, the corresponding FHIR Allele Profile generated by the translator
df_out

Unnamed: 0,idx,vrs_allele,fhir_allele
0,0,{'id': 'ga4gh:VA.RYY2yzWCjihuu2hmqFWu8qg7aUB3a...,"{'resourceType': 'MolecularDefinition', 'conta..."
1,1,{'id': 'ga4gh:VA.g5c07moGbDIbB4aPClnfFh7h5QZdR...,"{'resourceType': 'MolecularDefinition', 'conta..."
2,2,{'id': 'ga4gh:VA.zPiIjXFLhRCSTL-W4sQla74xmA4o9...,"{'resourceType': 'MolecularDefinition', 'conta..."
3,3,{'id': 'ga4gh:VA.ZZN-G4SQCBxqt9B8m89W-xq33B-9I...,"{'resourceType': 'MolecularDefinition', 'conta..."
4,4,{'id': 'ga4gh:VA.hCuqIa4JtoD9ILwVQtK_WOpccO3T3...,"{'resourceType': 'MolecularDefinition', 'conta..."


In [23]:
# Take the first previewed row and pretty-print its input and output side by side.
example = df_out.iloc[0]

# The VRS Allele that was fed to the translator.
print("=== Example VRS Allele ===")
print(json.dumps(example["vrs_allele"], indent=2))

# The FHIR Allele Profile produced for that same row.
print("\n=== Example FHIR Allele ===")
print(json.dumps(example["fhir_allele"], indent=2))

=== Example VRS Allele ===
{
  "id": "ga4gh:VA.RYY2yzWCjihuu2hmqFWu8qg7aUB3aPwR",
  "type": "Allele",
  "name": "NC_000002.12:g.121530895G>C",
  "digest": "RYY2yzWCjihuu2hmqFWu8qg7aUB3aPwR",
  "expressions": [
    {
      "syntax": "spdi",
      "value": "NC_000002.12:121530894:G:C"
    },
    {
      "syntax": "hgvs.g",
      "value": "NC_000002.12:g.121530895G>C"
    },
    {
      "syntax": "gnomad",
      "value": "2-121530895-G-C"
    }
  ],
  "location": {
    "id": "ga4gh:SL.3sVPpjle__QNFe8yGfLEja0R42VIZIp8",
    "type": "SequenceLocation",
    "digest": "3sVPpjle__QNFe8yGfLEja0R42VIZIp8",
    "sequenceReference": {
      "type": "SequenceReference",
      "name": "NC_000002.12",
      "extensions": [
        {
          "name": "assembly",
          "value": "GRCh38"
        }
      ],
      "refgetAccession": "SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g",
      "residueAlphabet": "na",
      "moleculeType": "genomic"
    },
    "start": 121530894,
    "end": 121530895
  },
  "state": 