In [1]:

import json
import pandas as pd
from pathlib import Path
from typing import Optional
import sys
import os


from tools.clinical_db_fetcher.clients.clinvar_fetcher import (
    search_clinvar_by_hgvs_g,
    fetch_clinvar_details_by_id,
)


In [2]:
gene_symbol = 'FBN1'
model_name = 'all-mpnet-base-v2'
annotation_method = 'vep'

In [3]:
existing_variants_path = os.path.join(
    "data_user", "user_query", "results", 
    gene_symbol, f"{model_name}_{annotation_method}", "existing_variants.json"
)

with open(existing_variants_path, 'r') as f:
    existing_variant_ids = set(json.load(f).get('variants', []))


metadata_path = os.path.join(
    "data_user", "training_embedding_results", 
    "metadata", gene_symbol, "metadata.json"
)

with open(metadata_path, 'r') as f:
    metadata = json.load(f)
metadata_variants = metadata.get('variants')


In [4]:
df_metadata = pd.DataFrame(metadata_variants)

In [8]:
# Filter and select in one step (most efficient)
# df_result = df_metadata.loc[
#     df_metadata['variant_id'].isin(existing_variant_ids),
#     ['variant_id', 'hgvs_genomic_38', 'pathogenicity_original', ]
# ]

print("Method 4: Vectorized filtering (most efficient)")
df_existing_variants = df_metadata[df_metadata['variant_id'].isin(existing_variant_ids)]
df_existing_variants.drop(columns=['variant_id', 'variant_hash', 'vcf_string'], inplace=True)

print(df_existing_variants)
print()

# # If you want to preserve the order of existing_variant_ids:
# df_result_ordered = df_metadata.set_index('variant_id').loc[
#     existing_variant_ids,
#     ['pathogenicity_original', 'hgvs_genomic_38']
# ].reset_index()

# print("Method 4b: Preserving input order")
# print(df_result_ordered)


Method 4: Vectorized filtering (most efficient)
                 hgvs_genomic_38             hgvs_coding  hgvs_protein  \
33    NC_000015.10:g.48411280G>A   NM_000138.5:c.8326C>T  p.Arg2776Ter   
36    NC_000015.10:g.48411338C>T   NM_000138.5:c.8268G>A  p.Trp2756Ter   
54    NC_000015.10:g.48412715G>A   NM_000138.5:c.8080C>T  p.Arg2694Ter   
61    NC_000015.10:g.48415549G>A   NM_000138.5:c.8038C>T  p.Arg2680Cys   
70    NC_000015.10:g.48415585C>A   NM_000138.5:c.8002G>T  p.Gly2668Cys   
...                          ...                     ...           ...   
1442  NC_000015.10:g.48596360C>G    NM_000138.5:c.461G>C   p.Cys154Ser   
1462  NC_000015.10:g.48600213C>T    NM_000138.5:c.368G>A   p.Cys123Tyr   
1463  NC_000015.10:g.48600217G>A    NM_000138.5:c.364C>T   p.Arg122Cys   
1489  NC_000015.10:g.48613009C>T  NM_000138.5:c.247+1G>A                 
1498  NC_000015.10:g.48613073G>A    NM_000138.5:c.184C>T    p.Arg62Cys   

     chromosome  position ref_allele alt_allele gene_symbol  \


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_existing_variants.drop(columns=['variant_id', 'variant_hash', 'vcf_string'], inplace=True)


In [9]:
pred_result_df_path = f"data_user/user_query/processed/{gene_symbol}/{annotation_method}/prediction_results_k{5}_combined.csv"

pred_result_df = pd.read_csv(pred_result_df_path)
pred_result_df.drop(columns=['variant_id'], inplace=True)

In [11]:
pred_result_df = pd.concat([df_existing_variants, pred_result_df], axis=0, join='outer', ignore_index=True)
pred_result_df

Unnamed: 0,hgvs_genomic_38,hgvs_coding,hgvs_protein,chromosome,position,ref_allele,alt_allele,gene_symbol,most_severe_consequence,pathogenicity_original,confidence_score_all-mpnet-base-v2,pred_result_all-mpnet-base-v2,confidence_score_google-embedding,pred_result_google-embedding,confidence_score_MedEmbed-large-v0.1,pred_result_MedEmbed-large-v0.1,top_similar_variants,clinvar_id,clinvar_url,germline_classification
0,NC_000015.10:g.48411280G>A,NM_000138.5:c.8326C>T,p.Arg2776Ter,15,48411280,G,A,FBN1,stop_gained,pathogenic,,,,,,,,,,
1,NC_000015.10:g.48411338C>T,NM_000138.5:c.8268G>A,p.Trp2756Ter,15,48411338,C,T,FBN1,stop_gained,pathogenic,,,,,,,,,,
2,NC_000015.10:g.48412715G>A,NM_000138.5:c.8080C>T,p.Arg2694Ter,15,48412715,G,A,FBN1,stop_gained,pathogenic,,,,,,,,,,
3,NC_000015.10:g.48415549G>A,NM_000138.5:c.8038C>T,p.Arg2680Cys,15,48415549,G,A,FBN1,missense_variant,pathogenic_or_likely,,,,,,,,,,
4,NC_000015.10:g.48415585C>A,NM_000138.5:c.8002G>T,p.Gly2668Cys,15,48415585,C,A,FBN1,missense_variant,likely_pathogenic,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651,NC_000015.10:g.48412639_48412640del,NM_000138.5:c.8155_8156del,p.Lys2719AspfsTer18,15,48412638,CTT,C,FBN1,frameshift_variant,unknown,0.8,likely_pathogenic,0.8,likely_pathogenic,1.0,potentially_benign,NC_000015.10:g.48411228T>C (likely_pathogenic)...,40243.0,https://www.ncbi.nlm.nih.gov/clinvar/variation...,Pathogenic
652,NC_000015.10:g.48412573A>G,NM_000138.5:c.8222T>C,p.Ile2741Thr,15,48412573,A,G,FBN1,missense_variant,unknown,1.0,potentially_pathogenic,0.8,likely_pathogenic,1.0,potentially_pathogenic,NC_000015.10:g.48415573A>C (likely_pathogenic)...,2933163.0,https://www.ncbi.nlm.nih.gov/clinvar/variation...,Conflicting classifications of pathogenicity
653,NC_000015.10:g.48412568C>A,NM_000138.5:c.8226+1G>T,,15,48412568,C,A,FBN1,splice_donor_variant,unknown,1.0,potentially_pathogenic,1.0,potentially_pathogenic,1.0,potentially_pathogenic,NC_000015.10:g.48412564C>T (pathogenic_or_like...,40245.0,https://www.ncbi.nlm.nih.gov/clinvar/variation...,Pathogenic
654,NC_000015.10:g.48411369_48411370del,NM_000138.5:c.8238_8239del,p.Glu2746AspfsTer13,15,48411368,CTC,C,FBN1,frameshift_variant,unknown,0.8,likely_pathogenic,0.6,likely_pathogenic,1.0,potentially_benign,"NC_000015.10:g.48411383G>A (benign_or_likely),...",,,


In [12]:
pred_result_save_path = f"data_user/user_query/processed/{gene_symbol}/{annotation_method}/prediction_results_k{5}_combined_with_existing_variants.csv"
pred_result_df.to_csv(pred_result_save_path, index=False)

In [27]:
pred_result_path = "data_user/user_query/results/FBN1/all-mpnet-base-v2_vep/prediction_results.json"
pred_result = json.load(open(pred_result_path, 'r'))
pred_result

{'gene_symbol': 'FBN1',
 'successful': {'results_count': 348,
  'results': [{'variant_id': '15-48644711-T-C',
    'metadata': {'vcf_string': '15\t48644711\t.\tT\tC\t.\t.\t.',
     'chromosome': '15',
     'position': '48644711',
     'ref_allele': 'T',
     'alt_allele': 'C',
     'gene_symbol': 'FBN1',
     'hgvs_coding': 'NM_000138.5:c.59A>G',
     'hgvs_genomic_38': 'NC_000015.10:g.48644711T>C',
     'hgvs_protein': 'p.Tyr20Cys',
     'most_severe_consequence': 'missense_variant'},
    'annotation_raw': '{"strand": 1, "allele_string": "T/C", "assembly_name": "GRCh38", "vcf_string": "15-48644711-T-C", "input": "NC_000015.10:g.48644711T>C", "seq_region_name": "15", "minimised": 1, "colocated_variants": [{"start": 48644711, "seq_region_name": "15", "strand": 1, "allele_string": "HGMD_MUTATION", "end": 48644711, "phenotype_or_disease": 1, "id": "CM054704"}, {"frequencies": {"C": {"gnomadg_ami": 0, "eas": 0, "gnomadg_asj": 0, "gnomadg_remaining": 0, "gnomade_remaining": 0.0001987, "gnoma

In [33]:
pred_result_all = pred_result['successful'].get('results', [])

In [41]:
k = 5
for result in pred_result_all:
    pred_result = result['prediction_result'][str(k)]['pred_result']
    confidence_score = result['prediction_result'][str(k)]['confidence_score']
    top_k_result = result['nearest_training_variants'][:k]
    top_k_variant_ids = [nv['variant_id'] for nv in top_k_result]
    break


In [43]:
confidence_score

1.0

In [42]:
top_k_variant_ids

['15-48610775-C-T',
 '15-48534131-A-C',
 '15-48600174-C-T',
 '15-48600147-C-T',
 '15-48526207-C-T']

In [None]:
pred_result

In [26]:
def parse_prediction_results(json_file_path: str, 
                             df_all_training_variants: pd.DataFrame,
                             k: Optional[int] = None,
                             model_name: Optional[str] = None) -> pd.DataFrame:
    """
    Parse prediction_results.json and generate a DataFrame with specified columns.
    
    Args:
        json_file_path: Path to the prediction_results.json file
        df_all_training_variants: DataFrame containing all training variants with 
                                  variant_id and hgvs_genomic_38 columns
        k: Optional k value to select specific prediction result and top k neighbors.
           If provided, uses prediction_result[str(k)] for pred_result and confidence_score,
           and selects top k neighbors from nearest_training_variants.
           If None, uses the first available k value or falls back to old structure.
        model_name: Optional model name to include in column names (e.g., 'all-mpnet-base-v2').
                    If provided, confidence_score column will be named 'confidence_score {model_name}'.
        
    Returns:
        DataFrame with columns: vcf_string, chromosome, position, ref_allele, 
        alt_allele, gene_symbol, hgvs_coding, hgvs_genomic_38, protein_change, 
        most_severe_consequence, confidence_score, pred_result, top_similar_variants
    """
    # Read JSON file
    with open(json_file_path, 'r') as f:
        data = json.load(f)
    
    # Handle new structure with "successful" wrapper
    if 'successful' in data:
        results = data['successful'].get('results', [])
    else:
        # Fall back to old structure
        results = data.get('results', [])
    
    if not results:
        print(f"Warning: No results found in {json_file_path}")
        return pd.DataFrame()
    
    # Create a mapping from variant_id to hgvs_genomic_38 from training variants
    # Check if variant_id column exists in df_all_training_variants
    if 'variant_id' not in df_all_training_variants.columns:
        raise ValueError("df_all_training_variants must contain 'variant_id' column")
    if 'hgvs_genomic_38' not in df_all_training_variants.columns:
        raise ValueError("df_all_training_variants must contain 'hgvs_genomic_38' column")
    
    # Create variant_id to hgvs_genomic_38 mapping
    variant_id_to_hgvs = dict(zip(
        df_all_training_variants['variant_id'], 
        df_all_training_variants['hgvs_genomic_38']
    ))
    
    # Parse each result
    parsed_results = []
    for result in results:
        metadata = result.get('metadata', {})
        
        # Extract nearest_training_variants
        nearest_variants = result.get('nearest_training_variants', [])
        
        # If k is specified, select top k neighbors
        if k is not None:
            nearest_variants = nearest_variants[:k]
        
        # Extract variant_ids from nearest_training_variants
        variant_ids = [nv.get('variant_id') for nv in nearest_variants if 'variant_id' in nv]
        
        # Map variant_ids to hgvs_genomic_38 annotations
        top_similar_variants = [
            variant_id_to_hgvs.get(vid) 
            for vid in variant_ids 
            if vid in variant_id_to_hgvs and pd.notna(variant_id_to_hgvs.get(vid))
        ]
        
        # Get prediction result and confidence score
        # Handle new structure with prediction_result dictionary
        prediction_result = result.get('prediction_result', {})
        if prediction_result and isinstance(prediction_result, dict):
            # If k is specified, use the corresponding k value
            if k is not None:
                k_str = str(k)
                if k_str in prediction_result:
                    pred_result = prediction_result[k_str].get('pred_result', '')
                    confidence_score = prediction_result[k_str].get('confidence_score', '')
                else:
                    print(f"Warning: k={k} not found in prediction_result, using first available")
                    # Use first available k value
                    first_k = list(prediction_result.keys())[0] if prediction_result else None
                    if first_k:
                        pred_result = prediction_result[first_k].get('pred_result', '')
                        confidence_score = prediction_result[first_k].get('confidence_score', '')
                    else:
                        pred_result = ''
                        confidence_score = ''
            else:
                # Use first available k value if k not specified
                first_k = list(prediction_result.keys())[0] if prediction_result else None
                if first_k:
                    pred_result = prediction_result[first_k].get('pred_result', '')
                    confidence_score = prediction_result[first_k].get('confidence_score', '')
                else:
                    pred_result = ''
                    confidence_score = ''
        else:
            # Fall back to old structure (direct fields)
            pred_result = result.get('pred_result', '')
            confidence_score = result.get('confidence_score', '')
        
        # Build the row data
        # Determine column names based on model_name
        confidence_col = f'confidence_score {model_name}' if model_name else 'confidence_score'
        pred_result_col = f'pred_result {model_name}' if model_name else 'pred_result'
        similar_variants_col = f'top_similar_variants {model_name}' if model_name else 'top_similar_variants'
        
        row = {
            # 'vcf_string': metadata.get('vcf_string', ''),
            'chromosome': metadata.get('chromosome', ''),
            'position': metadata.get('position', ''),
            'ref_allele': metadata.get('ref_allele', ''),
            'alt_allele': metadata.get('alt_allele', ''),
            'gene_symbol': metadata.get('gene_symbol', ''),
            'hgvs_coding': metadata.get('hgvs_coding', ''),
            'hgvs_genomic_38': metadata.get('hgvs_genomic_38', ''),
            'protein_change': metadata.get('protein_change', ''),
            'most_severe_consequence': metadata.get('most_severe_consequence', ''),
            confidence_col: confidence_score,
            pred_result_col: pred_result,
            similar_variants_col: top_similar_variants  # List of hgvs_genomic_38
        }
        parsed_results.append(row)
    
    # Create DataFrame
    df = pd.DataFrame(parsed_results)
    
    k_info = f" (k={k})" if k is not None else ""
    print(f"Successfully parsed {len(df)} prediction results{k_info}")
    print(f"DataFrame shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    
    return df

In [None]:
df_predictions = parse_prediction_results(
    prediction_results_path, 
    df_training_variants, 
    k=k_value,
    model_name=model_name
)

[{'variant_id': '15-48408466-T-C',
  'variant_hash': '25ec24a26ce39b7f',
  'vcf_string': '15\t48408466\t.\tT\tC\t.\t.\t.',
  'hgvs_genomic_38': 'NC_000015.10:g.48408466T>C',
  'hgvs_coding': 'NM_000138.5:c.*2524A>G',
  'hgvs_protein': '',
  'chromosome': '15',
  'position': 48408466,
  'ref_allele': 'T',
  'alt_allele': 'C',
  'gene_symbol': 'FBN1',
  'most_severe_consequence': '3_prime_UTR_variant',
  'pathogenicity_original': 'benign_or_likely'},
 {'variant_id': '15-48408832-C-T',
  'variant_hash': '3c7a24079bb8b721',
  'vcf_string': '15\t48408832\t.\tC\tT\t.\t.\t.',
  'hgvs_genomic_38': 'NC_000015.10:g.48408832C>T',
  'hgvs_coding': 'NM_000138.5:c.*2158G>A',
  'hgvs_protein': '',
  'chromosome': '15',
  'position': 48408832,
  'ref_allele': 'C',
  'alt_allele': 'T',
  'gene_symbol': 'FBN1',
  'most_severe_consequence': '3_prime_UTR_variant',
  'pathogenicity_original': 'benign'},
 {'variant_id': '15-48408837-T-C',
  'variant_hash': 'e8ba082cac65aa92',
  'vcf_string': '15\t48408837\t

In [None]:
# Check data types of variant fields
variant_id_to_find = '15-48408466-T-C'
variant = next((v for v in metadata_variants if v.get('variant_id') == variant_id_to_find), None)

if variant:
    print("=== Variant Structure and Data Types ===")
    print(f"variant_id: {variant.get('variant_id')} (type: {type(variant.get('variant_id')).__name__})")
    print(f"pathogenicity_original: {variant.get('pathogenicity_original')} (type: {type(variant.get('pathogenicity_original')).__name__})")
    
    metadata = variant.get('metadata', {})
    print(f"\n=== Metadata Fields and Types ===")
    print(f"chromosome: {metadata.get('chromosome')} (type: {type(metadata.get('chromosome')).__name__})")
    print(f"position: {metadata.get('position')} (type: {type(metadata.get('position')).__name__})")
    print(f"ref_allele: {metadata.get('ref_allele')} (type: {type(metadata.get('ref_allele')).__name__})")
    print(f"alt_allele: {metadata.get('alt_allele')} (type: {type(metadata.get('alt_allele')).__name__})")
    print(f"gene_symbol: {metadata.get('gene_symbol')} (type: {type(metadata.get('gene_symbol')).__name__})")
    print(f"hgvs_protein: '{metadata.get('hgvs_protein')}' (type: {type(metadata.get('hgvs_protein')).__name__})")
    
    # Example: Accessing with type awareness
    chromosome = metadata.get('chromosome')  # Returns string '15'
    position = metadata.get('position')      # Returns int 48408466
    
    # Type conversions if needed
    chromosome_int = int(chromosome) if chromosome else None
    position_str = str(position) if position else None
    
    print(f"\n=== Type Conversions ===")
    print(f"chromosome as int: {chromosome_int} (type: {type(chromosome_int).__name__})")
    print(f"position as string: {position_str} (type: {type(position_str).__name__})")


In [None]:
# Method 1: Using a loop to find the variant
variant_id_to_find = '15-48408466-T-C'
chromosome = None

for variant in metadata_variants:
    if variant.get('variant_id') == variant_id_to_find:
        chromosome = variant.get('metadata', {}).get('chromosome')
        break

print(f"Chromosome: {chromosome}")

# Method 2: Using list comprehension (returns first match or None)
variant = next((v for v in metadata_variants if v.get('variant_id') == variant_id_to_find), None)
if variant:
    chromosome = variant.get('metadata', {}).get('chromosome')
    print(f"Chromosome: {chromosome}")

# Method 3: Create a dictionary for faster lookups (if you need to search multiple times)
variant_dict = {v.get('variant_id'): v for v in metadata_variants}
target_variant = variant_dict.get('15-48408466-T-C')
if target_variant:
    pathogenicity_original = target_variant.get('pathogenicity_original')
    chromosome = target_variant.get('metadata', {}).get('chromosome')
    print(f"Pathogenicity original: {pathogenicity_original}")
    print(f"Chromosome: {chromosome}")


Chromosome: 15
Chromosome: 15
Pathogenicity original: benign_or_likely
Chromosome: 15


In [None]:

def json_to_df(json_file_path: str, 
field_to_convert: str = 'variants',
drop_columns: Optional[list] = None) -> pd.DataFrame:
    """
    Convert existing variants JSON file to pandas DataFrame.
    
    Args:
        json_file_path: Path to the JSON file containing existing variants
        drop_columns: Optional list of column names to drop after conversion
        
    Returns:
        DataFrame with one row per variant
    """
    # Read JSON file
    with open(json_file_path, 'r') as f:
        data = json.load(f)
    
    # Extract variants list
    variants = data.get(field_to_convert, [])
    
    if not variants:
        print(f"Warning: No variants found in {json_file_path}")
        return pd.DataFrame()
    
    # Convert to DataFrame
    df = pd.DataFrame(variants)
    
    # Add gene_symbol and count as columns if not already present
    if 'gene_symbol' in data and 'gene_symbol' not in df.columns:
        df['gene_symbol'] = data['gene_symbol']
    
    # Drop specified columns if they exist
    if drop_columns:
        columns_to_drop = [col for col in drop_columns if col in df.columns]
        if columns_to_drop:
            df = df.drop(columns=columns_to_drop)
            print(f"Dropped columns: {columns_to_drop}")
    
    print(f"Successfully converted {len(df)} variants to DataFrame")
    print(f"DataFrame shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    
    return df

In [20]:
metadata_variants

[{'variant_id': '15-48408466-T-C',
  'variant_hash': '25ec24a26ce39b7f',
  'vcf_string': '15\t48408466\t.\tT\tC\t.\t.\t.',
  'hgvs_genomic_38': 'NC_000015.10:g.48408466T>C',
  'hgvs_coding': 'NM_000138.5:c.*2524A>G',
  'hgvs_protein': '',
  'chromosome': '15',
  'position': 48408466,
  'ref_allele': 'T',
  'alt_allele': 'C',
  'gene_symbol': 'FBN1',
  'most_severe_consequence': '3_prime_UTR_variant',
  'pathogenicity_original': 'benign_or_likely'},
 {'variant_id': '15-48408832-C-T',
  'variant_hash': '3c7a24079bb8b721',
  'vcf_string': '15\t48408832\t.\tC\tT\t.\t.\t.',
  'hgvs_genomic_38': 'NC_000015.10:g.48408832C>T',
  'hgvs_coding': 'NM_000138.5:c.*2158G>A',
  'hgvs_protein': '',
  'chromosome': '15',
  'position': 48408832,
  'ref_allele': 'C',
  'alt_allele': 'T',
  'gene_symbol': 'FBN1',
  'most_severe_consequence': '3_prime_UTR_variant',
  'pathogenicity_original': 'benign'},
 {'variant_id': '15-48408837-T-C',
  'variant_hash': 'e8ba082cac65aa92',
  'vcf_string': '15\t48408837\t

Created lookup dictionary with 1532 variants

=== Debug: Inspecting variant structure ===
Variant keys: ['variant_id', 'variant_hash', 'vcf_string', 'hgvs_genomic_38', 'hgvs_coding', 'hgvs_protein', 'chromosome', 'position', 'ref_allele', 'alt_allele', 'gene_symbol', 'most_severe_consequence', 'pathogenicity_original']
Full variant structure:
{
  "variant_id": "15-48411280-G-A",
  "variant_hash": "993c84e847fe446a",
  "vcf_string": "15\t48411280\t.\tG\tA\t.\t.\t.",
  "hgvs_genomic_38": "NC_000015.10:g.48411280G>A",
  "hgvs_coding": "NM_000138.5:c.8326C>T",
  "hgvs_protein": "p.Arg2776Ter",
  "chromosome": "15",
  "position": 48411280,
  "ref_allele": "G",
  "alt_allele": "A",
  "gene_symbol": "FBN1",
  "most_severe_consequence": "stop_gained",
  "pathogenicity_original": "pathogenic"
}

=== Found 3 variants ===

Variant ID: 15-48411280-G-A
  chromosome: 15 (type: str)
  position: 48411280 (type: int)
  ref_allele: G (type: str)
  alt_allele: A (type: str)
  pathogenicity_original: path

In [19]:
# Alternative: More compact function for batch queries
def get_variants_metadata(variant_ids: list, variant_dict: dict) -> list:
    """
    Retrieve metadata for multiple variant IDs.
    
    Args:
        variant_ids: List of variant IDs to query
        variant_dict: Dictionary mapping variant_id to variant data
        
    Returns:
        List of dictionaries containing variant metadata
    """
    results = []
    for variant_id in variant_ids:
        variant = variant_dict.get(variant_id)
        if variant:
            metadata = variant.get('metadata', {})
            # Flatten the structure: combine top-level and metadata fields
            result = {
                'variant_id': variant_id,
                'pathogenicity_original': variant.get('pathogenicity_original'),
                **metadata  # Unpack all metadata fields
            }
            results.append(result)
        else:
            results.append({'variant_id': variant_id, 'found': False})
    return results

# Usage example
variant_ids_to_find = [
    '15-48411280-G-A',
    '15-48411338-C-T',
    '15-48412715-G-A',
]

# Get all metadata at once
variants_metadata = get_variants_metadata(variant_ids_to_find, variant_dict)

# Convert to DataFrame for easier analysis
df_variants = pd.DataFrame(variants_metadata)
print(f"Retrieved {len(df_variants)} variants")
print(f"\nDataFrame info:")
print(df_variants.info())
print(f"\nFirst few rows:")
print(df_variants.head())

# Data types summary
print(f"\n=== Data Types Summary ===")
for col in df_variants.columns:
    if df_variants[col].notna().any():
        sample_value = df_variants[col].dropna().iloc[0]
        print(f"{col}: {type(sample_value).__name__} (example: {sample_value})")


Retrieved 3 variants

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   variant_id              3 non-null      object
 1   pathogenicity_original  3 non-null      object
dtypes: object(2)
memory usage: 180.0+ bytes
None

First few rows:
        variant_id pathogenicity_original
0  15-48411280-G-A             pathogenic
1  15-48411338-C-T             pathogenic
2  15-48412715-G-A             pathogenic

=== Data Types Summary ===
variant_id: str (example: 15-48411280-G-A)
pathogenicity_original: str (example: pathogenic)


In [None]:
pred_result = 