In [1]:
import duckdb
from dotenv import load_dotenv
import os
import pandas as pd

# Load variables from the local .env file, then read the two database locations
# this notebook uses: the raw SQL database and the model-training database.
load_dotenv()
db_path, mtrain = (os.getenv(env_key) for env_key in ("DB_PATH_sql", "DATA_TRAIN_PATH"))

# Remove pandas' display caps so previews show every row and column, and
# (optionally) so long string cells are printed in full instead of truncated.
for display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(display_option, None)

In [2]:
# Open the source database read-only: every cell below only runs SELECT/PRAGMA
# statements, and a read-only handle avoids taking DuckDB's exclusive write lock
# (so other processes can still read the file) and rules out accidental writes.
c1 = duckdb.connect(db_path, read_only=True)

In [3]:
# Print every table in the connected database followed by its columns and types.
# The table list comes from the SQLite-compatible `sqlite_master` catalog view.
tables = c1.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()

for (table_name,) in tables:
    print(f"\nTable: {table_name}")
    # Pass the name as an escaped string literal instead of splicing it in as a
    # bare identifier, so names containing spaces, punctuation or reserved words
    # still parse (single quotes inside the name are doubled).
    table_literal = table_name.replace("'", "''")
    columns = c1.execute(f"PRAGMA table_info('{table_literal}');").fetchall()
    # Each table_info row carries the column name at index 1 and its type at index 2.
    for _cid, column_name, column_type, *_rest in columns:
        print(f"  - {column_name} ({column_type})")



Table: allele
  - AlleleID (BIGINT)
  - HgvsNotation (VARCHAR)
  - GeneID (BIGINT)
  - HGNCID (VARCHAR)
  - ClinicalSignificance (VARCHAR)
  - LastEvaluated (VARCHAR)
  - RS# (dbSNP) (BIGINT)
  - RCVAccession (VARCHAR)
  - PhenotypeIDS (VARCHAR)
  - PhenotypeList (VARCHAR)
  - Origin (VARCHAR)
  - ChromosomeAccession (VARCHAR)
  - Chromosome (VARCHAR)
  - ReviewStatus (VARCHAR)
  - NumberSubmitters (BIGINT)
  - TestedInGTR (VARCHAR)
  - OtherIDs (VARCHAR)
  - VariationID (BIGINT)
  - PositionVCF (BIGINT)
  - ReferenceAlleleVCF (VARCHAR)
  - AlternateAlleleVCF (VARCHAR)
  - GenesPerAlleleID (DOUBLE)
  - Category (VARCHAR)
  - MC (VARCHAR)

Table: cross_references
  - AlleleID (BIGINT)
  - Database (VARCHAR)
  - ID (VARCHAR)
  - LastUpdated (VARCHAR)

Table: gene
  - GeneID (BIGINT)
  - GeneSymbol (VARCHAR)
  - GeneName (VARCHAR)
  - GeneLevelDisease (VARCHAR)
  - TotalSubmissions (DOUBLE)
  - TotalAlleles (DOUBLE)
  - SubmissionsReportingThisGene (DOUBLE)
  - AllelesReportedPathogenicL

In [4]:
# Preview the first five rows of the raw allele table as a DataFrame.
allele_preview_sql = "SELECT * from allele limit 5"
c1.execute(allele_preview_sql).fetchdf()

Unnamed: 0,AlleleID,HgvsNotation,GeneID,HGNCID,ClinicalSignificance,LastEvaluated,RS# (dbSNP),RCVAccession,PhenotypeIDS,PhenotypeList,Origin,ChromosomeAccession,Chromosome,ReviewStatus,NumberSubmitters,TestedInGTR,OtherIDs,VariationID,PositionVCF,ReferenceAlleleVCF,AlternateAlleleVCF,GenesPerAlleleID,Category,MC
0,15043,NM_014630.3(ZNF592):c.3136G>A (p.Gly1046Arg),9640,HGNC:28986,Uncertain significance,"Jun 29, 2015",150829393,RCV000000014,"MONDO:MONDO:0033005,MedGen:C4551772,OMIM:251300,Orphanet:2065,Orphanet:83472",Galloway-Mowat syndrome 1,germline,NC_000015.10,15,no assertion criteria provided,1,N,"ClinGen:CA210674,UniProtKB:Q92610#VAR_064583,OMIM:613624.0001",4,84799209,G,A,1.0,within single gene,SO:0001583|missense_variant
1,15044,NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter),55572,HGNC:26927,Pathogenic,"Mar 12, 2024",267606829,RCV000000015|RCV000578659|RCV001194045|RCV003390625,"MONDO:MONDO:0032624,MedGen:C4748791,OMIM:618241|MedGen:C3661900|MONDO:MONDO:0009723,MedGen:C2931891,OMIM:256000,Orphanet:506|","Mitochondrial complex 1 deficiency, nuclear type 19|not provided|Leigh syndrome|FOXRED1-related disorder",germline,NC_000011.10,11,"criteria provided, multiple submitters, no conflicts",6,N,"ClinGen:CA113792,OMIM:613622.0001",5,126275389,C,T,1.0,within single gene,"SO:0001587|nonsense,SO:0001619|non-coding_transcript_variant"
2,15045,NM_017547.4(FOXRED1):c.1289A>G (p.Asn430Ser),55572,HGNC:26927,Likely pathogenic,"Jun 06, 2024",267606830,RCV000000016,"MONDO:MONDO:0032624,MedGen:C4748791,OMIM:618241","Mitochondrial complex 1 deficiency, nuclear type 19",germline,NC_000011.10,11,"criteria provided, single submitter",2,N,"ClinGen:CA113794,UniProtKB:Q96CU9#VAR_064571,OMIM:613622.0002",6,126277517,A,G,1.0,within single gene,"SO:0001583|missense_variant,SO:0001619|non-coding_transcript_variant"
3,15046,NM_025152.3(NUBPL):c.166G>A (p.Gly56Arg),80224,HGNC:20278,Conflicting classifications of pathogenicity,"Nov 12, 2024",200401432,RCV000196589|RCV000622708|RCV001526454|RCV005055710,"MedGen:C3661900|MeSH:D030342,MedGen:C0950123|MONDO:MONDO:0032625,MedGen:C4748792,OMIM:618242|MedGen:CN169374","not provided|Inborn genetic diseases|Mitochondrial complex 1 deficiency, nuclear type 21|not specified",germline;paternal,NC_000014.9,14,"criteria provided, conflicting classifications",5,N,"OMIM:613621.0001,ClinGen:CA321015,UniProtKB:Q8TB37#VAR_064570,ClinVar:7",214885,31562125,G,A,1.0,within single gene,"SO:0001583|missense_variant,SO:0001619|non-coding_transcript_variant"
4,15048,NM_000410.4(HFE):c.845G>A (p.Cys282Tyr),3077,HGNC:4886,"Pathogenic/Pathogenic, low penetrance; other; risk factor","Mar 04, 2025",1800562,RCV000000019|RCV000210820|RCV000308358|RCV000414811|RCV001270034|RCV001731264|RCV000178096|RCV001248830|RCV002280089|RCV002512585|RCV003224084|RCV003493406,"MONDO:MONDO:0021001,MedGen:C3469186,OMIM:235200,Orphanet:139498,Orphanet:465508|MONDO:MONDO:0015356,MeSH:D009386,MedGen:C0027672,Orphanet:140162|MONDO:MONDO:0006507,MedGen:C0392514,OMIM:PS235200|Human Phenotype Ontology:HP:0010473,MedGen:C0151861;Human Phenotype Ontology:HP:0000992,Human Phenotype Ontology:HP:0005594,Human Phenotype Ontology:HP:0006831,Human Phenotype Ontology:HP:0007538,MONDO:MONDO:0005434,MedGen:C0349506|7 conditions|Human Phenotype Ontology:HP:0001638,MONDO:MONDO:0004994,MedGen:C0878544,Orphanet:167848|MedGen:C3661900|MedGen:C0018995||MeSH:D030342,MedGen:C0950123|6 conditions|MONDO:MONDO:0019257,MedGen:C0268060,Orphanet:79230",Hemochromatosis type 1|Hereditary cancer-predisposing syndrome|Hereditary hemochromatosis|Porphyrinuria;Cutaneous photosensitivity|7 conditions|Cardiomyopathy|not provided|Bronze diabetes|HFE-related disorder|Inborn genetic diseases|6 conditions|Juvenile hemochromatosis,biparental;germline;unknown,NC_000006.12,6,"criteria provided, multiple submitters, no conflicts",53,Y,"UniProtKB:Q30201#VAR_004398,OMIM:613609.0001,ClinGen:CA113795",9,26092913,G,A,1.0,within single gene,"SO:0001583|missense_variant,SO:0001627|intron_variant"


In [5]:
# Fetch the single allele row whose AlleleID is 15045.
allele_15045_sql = "SELECT * from allele where alleleId = 15045 limit 1"
c1.execute(allele_15045_sql).fetchdf()

Unnamed: 0,AlleleID,HgvsNotation,GeneID,HGNCID,ClinicalSignificance,LastEvaluated,RS# (dbSNP),RCVAccession,PhenotypeIDS,PhenotypeList,Origin,ChromosomeAccession,Chromosome,ReviewStatus,NumberSubmitters,TestedInGTR,OtherIDs,VariationID,PositionVCF,ReferenceAlleleVCF,AlternateAlleleVCF,GenesPerAlleleID,Category,MC
0,15045,NM_017547.4(FOXRED1):c.1289A>G (p.Asn430Ser),55572,HGNC:26927,Likely pathogenic,"Jun 06, 2024",267606830,RCV000000016,"MONDO:MONDO:0032624,MedGen:C4748791,OMIM:618241","Mitochondrial complex 1 deficiency, nuclear type 19",germline,NC_000011.10,11,"criteria provided, single submitter",2,N,"ClinGen:CA113794,UniProtKB:Q96CU9#VAR_064571,OMIM:613622.0002",6,126277517,A,G,1.0,within single gene,"SO:0001583|missense_variant,SO:0001619|non-coding_transcript_variant"


In [6]:
# Preview six rows of the pairwise submitter-conflict summary table.
conflict_preview_sql = "SELECT * from summary_of_conflicting_interpretations limit 6"
c1.execute(conflict_preview_sql).fetchdf()


Unnamed: 0,VariationID,ClinvarPreferred,Submitter1,Submitter1Clinsig,Submitter1Reviewstatus,Submitter1Description,Submitter2,Submitter2Clinsig,Submitter2Reviewstatus,Submitter2Description,RankDiff,ConflictReported,VariantType,Submitter1Method,Submitter2Method
0,734130,NC_000007.14:g.156791884T>C,"Labcorp Genetics (formerly Invitae), Labcorp",Benign,"criteria provided, single submitter",Not provided,"PreventionGenetics, part of Exact Sciences",Likely benign,no assertion criteria provided,"This variant is classified as likely benign based on ACMG/AMP sequence variant interpretation guidelines (Richards et al. 2015 PMID: 25741868, with internal and published modifications).",1,no,single nucleotide variant,clinical testing,clinical testing
1,1344497,NC_012920.1(MT-CYB):m.235A>G,Mendelics,Benign,"criteria provided, single submitter","This variant is considered likely benign or benign based on one or more of the following: it is predicted to be benign by multiple in silico algorithms, and/or has population frequency not consistent with disease, and/or has normal protein function, and/or has lack of segregation with disease, and/or has been detected in co-occurrence with known pathogenic variant, and/or has lack of disease association in case-control studies, and/or is located in a region inconsistent with a known cause of pathogenicity. GnomAD 4.1.0 frequency 0.04900 homoplsmic/9 heteroplasmic","Department of Pediatrics, Division of Medical Genetics, Faculty of Medicine Ramathibodi Hospital, Mahidol University",Uncertain significance,no assertion criteria provided,Not provided,2,yes,single nucleotide variant,clinical testing,research
2,12321,NC_000008.11:g.11573132C>T,OMIM,Pathogenic,no assertion criteria provided,Not provided,H3Africa Consortium,Benign,"criteria provided, single submitter","While the frequency of the alternate allele in gnoMAD v2.0.2 is 0.146, its frequency in African populations is >5%. This suggests that previous classifications of this variant as pathogenic are in error.",4,yes,single nucleotide variant,literature only,research
3,12321,NC_000008.11:g.11573132C>T,OMIM,Pathogenic,no assertion criteria provided,Not provided,"PreventionGenetics, part of Exact Sciences",Benign,no assertion criteria provided,"This variant is classified as benign based on ACMG/AMP sequence variant interpretation guidelines (Richards et al. 2015 PMID: 25741868, with internal and published modifications).",4,yes,single nucleotide variant,literature only,clinical testing
4,12321,NC_000008.11:g.11573132C>T,OMIM,Pathogenic,no assertion criteria provided,Not provided,"Reproductive Health Research and Development, BGI Genomics",Benign,no assertion criteria provided,"NC_000008.11:g.11573132C>T has an allele frequency of 0.163 in African subpopulation in the gnomAD database, including 128 homozygous occurrences. This variant was annotated at the position 11,468,050 on chromsome 8. In the functional study by MIN6 beta-cells, this variant decreased luciferase expression as compared with control constructs without any insert (PMID: 19667185). This evidence suggests the variant to be classified as benign. ACMG/AMP criteria applied: BA1, BS2, PS3.",4,yes,single nucleotide variant,literature only,curation
5,14669,NC_000010.11:g.6072697C>A,OMIM,Pathogenic,no assertion criteria provided,Not provided,"PreventionGenetics, part of Exact Sciences",Likely benign,no assertion criteria provided,"This variant is classified as likely benign based on ACMG/AMP sequence variant interpretation guidelines (Richards et al. 2015 PMID: 25741868, with internal and published modifications).",3,yes,single nucleotide variant,literature only,clinical testing


# Conflicting interpretations

In [7]:
# VariationIDs 12321 and 14669 both appear in the conflict summary preview;
# only 12321 is looked up in the allele table here.
conflict_allele_sql = "SELECT * from allele where VariationID = 12321"
c1.execute(conflict_allele_sql).fetchdf()


Unnamed: 0,AlleleID,HgvsNotation,GeneID,HGNCID,ClinicalSignificance,LastEvaluated,RS# (dbSNP),RCVAccession,PhenotypeIDS,PhenotypeList,Origin,ChromosomeAccession,Chromosome,ReviewStatus,NumberSubmitters,TestedInGTR,OtherIDs,VariationID,PositionVCF,ReferenceAlleleVCF,AlternateAlleleVCF,GenesPerAlleleID,Category,MC
0,27360,NC_000008.11:g.11573132C>T,640,HGNC:1057,Benign,"Oct 28, 2020",61199332,RCV000013114|RCV001777135|RCV003914831,"MONDO:MONDO:0013242,MedGen:C3150618,OMIM:613375,Orphanet:552|MedGen:CN169374|",Maturity-onset diabetes of the young type 11|not specified|BLK-related disorder,germline,NC_000008.11,8,"criteria provided, single submitter",4,N,"ClinGen:CA10602353,OMIM:191305.0003",12321,11573132,C,T,1.0,"asserted, but not computed",


# Organization summary

In [8]:

# Preview five rows of the per-organization submission summary.
organization_preview_sql = "SELECT * from organization_summary limit 5"
c1.execute(organization_preview_sql).fetchdf()


Unnamed: 0,OrganizationName,OrganizationID,InstitutionType,NumberOfClinVarSubmissions,MaximumReviewStatus,CollectionMethods,ClinicalSignificanceCategoriesSubmitted,NumberOfSubmissionsFromClinicalTesting,NumberOfSubmissionsFromResearch,NumberOfSubmissionsFromLiteratureOnly,NumberOfSubmissionsFromCuration,NumberOfSubmissionsFromPhenotyping
0,OMIM; Johns Hopkins University,3,resource,36730,no assertion criteria provided,literature only,"Pathogenic, Likely pathogenic, Uncertain significance, Benign, drug response, other, Affects, association, protective, risk factor, not provided",0,0,36638,0,0
1,Baylor Genetics,1006,lab,51195,"criteria provided, single submitter","clinical testing, research","Pathogenic, Likely pathogenic, Uncertain significance, Likely benign, Benign",51167,29,0,0,0
2,Athena Diagnostics,1012,lab,21962,"criteria provided, single submitter",clinical testing,"Pathogenic, Likely pathogenic, Uncertain significance, Likely benign, Benign, Likely risk allele",21962,0,0,0,0
3,Sanford Medical Genetics Laboratory; Sanfordhealth,1018,lab,2,no assertion criteria provided,clinical testing,"Pathogenic, Uncertain significance",2,0,0,0,0
4,Greenwood Genetic Center Diagnostic Laboratories; Greenwood Genetic Center,1019,lab,3920,"criteria provided, single submitter","clinical testing, research","Pathogenic, Likely pathogenic, Uncertain significance, Likely benign, Benign",3910,10,0,0,0


# Citations

In [9]:


# List the citations recorded for VariationID 12321.
citations_sql = "SELECT * from var_citations where VariationID= 12321 "
c1.execute(citations_sql).fetchdf()


Unnamed: 0,AlleleID,VariationID,CitationSource,CitationID,OrganizationID
0,27360,12321,PubMed,19667185,13
1,27360,12321,PubMed,33116287,508299


# Submission summary

In [10]:

# List every individual submission recorded for VariationID 12321.
submissions_sql = "SELECT * from submission_summary where VariationID= 12321 "
c1.execute(submissions_sql).fetchdf()


Unnamed: 0,VariationID,ClinicalSignificance,Description,SubmittedPhenotypeInfo,ReportedPhenotypeInfo,ReviewStatus,CollectionMethod,OriginCounts,Submitter,SubmittedGeneSymbol,ExplanationOfInterpretation,ContributesToAggregateClassification
0,12321,Benign,"NC_000008.11:g.11573132C>T has an allele frequency of 0.163 in African subpopulation in the gnomAD database, including 128 homozygous occurrences. This variant was annotated at the position 11,468,050 on chromsome 8. In the functional study by MIN6 beta-cells, this variant decreased luciferase expression as compared with control constructs without any insert (PMID: 19667185). This evidence suggests the variant to be classified as benign. ACMG/AMP criteria applied: BA1, BS2, PS3.","Maturity-onset diabetes of the young, type 11",C3150618:Maturity-onset diabetes of the young type 11,no assertion criteria provided,curation,germline:na,"Reproductive Health Research and Development, BGI Genomics",BLK,-,no
1,12321,Benign,"This variant is classified as benign based on ACMG/AMP sequence variant interpretation guidelines (Richards et al. 2015 PMID: 25741868, with internal and published modifications).",BLK-related condition,na:BLK-related disorder,no assertion criteria provided,clinical testing,germline:na,"PreventionGenetics, part of Exact Sciences",BLK,-,no
2,12321,Benign,"While the frequency of the alternate allele in gnoMAD v2.0.2 is 0.146, its frequency in African populations is >5%. This suggests that previous classifications of this variant as pathogenic are in error.",not specified,CN169374:not specified,"criteria provided, single submitter",research,germline:na,H3Africa Consortium,BLK,-,yes
3,12321,Pathogenic,-,"MATURITY-ONSET DIABETES OF THE YOUNG, TYPE 11",C3150618:Maturity-onset diabetes of the young type 11,no assertion criteria provided,literature only,germline:na,OMIM,BLK,-,no


In [11]:

# Show the HGVS expressions stored for VariationID 12321.
hgvs_sql = "SELECT * from hgvs4variation where VariationID= 12321 "
c1.execute(hgvs_sql).fetchdf()


Unnamed: 0,VariationID,AlleleID,NucleotideExpression,NucleotideChange
0,12321,27360,NC_000008.11:g.11573132C>T,g.11573132C>T


In [12]:
# Close the source-database connection before the training database is opened below.
c1.close()

# Model training data

In [13]:
# Open the model-training database read-only: this notebook only inspects it,
# and a read-only handle avoids taking DuckDB's exclusive write lock and rules
# out accidental writes to the training data.
c2 = duckdb.connect(mtrain, read_only=True)

In [14]:
# Preview the first five rows of the allele table in the training database.
training_preview_sql = "select * from allele limit 5"
c2.execute(training_preview_sql).fetchdf()

Unnamed: 0,AlleleID,GeneID,ClinicalSignificance,has_MC_nonsense,has_MC_non-coding_transcript_variant,has_MC_missense_variant,has_MC_intron_variant,has_MC_5_prime_UTR_variant,has_MC_splice_donor_variant,has_MC_synonymous_variant,has_MC_splice_acceptor_variant,has_MC_initiator_codon_variant,has_MC_3_prime_UTR_variant,has_MC_no_sequence_alteration,has_MC_stop_lost,has_MC_genic_upstream_transcript_variant,has_MC_genic_downstream_transcript_variant,has_Origin_germline,has_Origin_biparental,has_Origin_unknown,has_Origin_maternal,has_Origin_paternal,has_Origin_inherited,has_Origin_de novo,has_Origin_not applicable,has_Origin_tested-inconclusive,has_Origin_uniparental,has_Origin_not-reported,has_VariantGeneRelation_within single gene,has_VariantGeneRelation_within multiple genes by overlap,"has_VariantGeneRelation_asserted, but not computed","has_VariantGeneRelation_near gene, upstream","has_VariantGeneRelation_near gene, downstream",has_VariantGeneRelation_not identified,ref_is_A,ref_is_T,ref_is_G,ref_is_C,alt_is_A,alt_is_T,alt_is_G,alt_is_C,chr_11,chr_6,chr_2,chr_20,chr_10,chr_16,chr_22,chr_15,chr_1,chr_7,chr_8,chr_14,chr_21,chr_5,chr_4,chr_19,chr_3,chr_17,chr_12,chr_18,chr_9,chr_13,chr_MT,chr_Y,chr_X,is_genomic,is_mitochondrial
0,15044,55572,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,15045,55572,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,15049,3077,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,15053,3077,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,15058,3077,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [15]:
# Run a bare `describe` to list the tables in the training database together
# with their column names, returned as plain Python tuples.
describe_sql = "describe"
c2.execute(describe_sql).fetchall()

[('model_train_data',
  'main',
  'allele',
  ['AlleleID',
   'GeneID',
   'ClinicalSignificance',
   'has_MC_nonsense',
   'has_MC_non-coding_transcript_variant',
   'has_MC_missense_variant',
   'has_MC_intron_variant',
   'has_MC_5_prime_UTR_variant',
   'has_MC_splice_donor_variant',
   'has_MC_synonymous_variant',
   'has_MC_splice_acceptor_variant',
   'has_MC_initiator_codon_variant',
   'has_MC_3_prime_UTR_variant',
   'has_MC_no_sequence_alteration',
   'has_MC_stop_lost',
   'has_MC_genic_upstream_transcript_variant',
   'has_MC_genic_downstream_transcript_variant',
   'has_Origin_germline',
   'has_Origin_biparental',
   'has_Origin_unknown',
   'has_Origin_maternal',
   'has_Origin_paternal',
   'has_Origin_inherited',
   'has_Origin_de novo',
   'has_Origin_not applicable',
   'has_Origin_tested-inconclusive',
   'has_Origin_uniparental',
   'has_Origin_not-reported',
   'has_VariantGeneRelation_within single gene',
   'has_VariantGeneRelation_within multiple genes by over

In [16]:
# Close the training-database connection now that inspection is finished.
c2.close()


In [None]:
# NOTE(review): appears to be a sample model output row for AlleleID 15044 — fields look like
# AlleleID, molecular consequences, origin, ref allele, alt allele, chromosome, gene relation,
# an unlabeled flag, predicted class, score. Confirm, and consider moving this to a markdown cell.
# 15044,"nonsense,non-coding_transcript_variant",germline,C,T,11,within single gene,g,Pathogenic,0.999173104763031