In [1]:
from clinvar_functions import *
import pandas as pd
import matplotlib.pyplot as plt
import pysam

In [82]:
# Path to ClinVar VCF file
vcf_file = "./data/clinvar.vcf"

# Open the VCF file
vcf_reader = pysam.VariantFile(vcf_file)

Fields: ['ALLELEID',
 'CLNDISDB',
 'CLNDN',
 'CLNHGVS',
 'CLNREVSTAT',
 'CLNSIG',
 'CLNSIGSCV',
 'CLNVC',
 'CLNVCSO',
 'CLNVI',
 'GENEINFO',
 'ORIGIN']

In [99]:
# Extract information from first 10 variants
variants = []
for record in vcf_reader.fetch():
    gene_info = record.info.get("GENEINFO", [""])
    if "ORC1" in gene_info:
        try:
            variant_info = {
                "CHROM": record.chrom,  # Chromosome
                "POS": record.pos,      # Position
                "ID": record.id,        # Variant ID (e.g., rs number)
                "REF": record.ref,      # Reference allele
                "ALT": ",".join(str(alt) for alt in record.alts),  # Alternate allele(s)
                "CLNREVSTAT": record.info.get("CLNREVSTAT", ["."])[0],  # Review status
                "CLNSIG": record.info.get("CLNSIG", ["."])[0],  # Clinical significance
                "RESULT": record.info.get("MC", ["."])[0],  # Molecular consequence
                "CLNDN": record.info.get("CLNDN", ["."])[0]  # Disease name
            }
            variants.append(variant_info)
        except Exception as e:
            print(f"Error processing record at {record.chrom}:{record.pos} - {e}")
    

# Convert to DataFrame
df = pd.DataFrame(variants)
print(df.shape)

(445, 9)


In [100]:
df

Unnamed: 0,CHROM,POS,ID,REF,ALT,CLNREVSTAT,CLNSIG,RESULT,CLNDN
0,1,52369375,1173067,TGTACATCTCCCATATTGCAACACCTCCCTTTCCCAAAGATGTACC...,T,criteria_provided,Pathogenic,.,Meier-Gorlin_syndrome_1
1,1,52372950,297574,G,A,criteria_provided,Benign,SO:0001624|3_prime_UTR_variant,Meier-Gorlin_syndrome_1|not_provided
2,1,52372955,297575,C,G,criteria_provided,Uncertain_significance,SO:0001624|3_prime_UTR_variant,Meier-Gorlin_syndrome_1
3,1,52373087,297576,G,A,criteria_provided,Uncertain_significance,SO:0001624|3_prime_UTR_variant,Meier-Gorlin_syndrome_1
4,1,52373187,703217,G,A,criteria_provided,Conflicting_classifications_of_pathogenicity,SO:0001819|synonymous_variant,Meier-Gorlin_syndrome_1|not_provided|ORC1-rela...
...,...,...,...,...,...,...,...,...,...
440,16,31094731,884454,A,G,criteria_provided,Uncertain_significance,SO:0001623|5_prime_UTR_variant,Vitamin_K-dependent_clotting_factors
441,16,31094774,884455,C,T,criteria_provided,Uncertain_significance,SO:0001623|5_prime_UTR_variant,Vitamin_K-dependent_clotting_factors
442,16,31094889,318978,C,G,criteria_provided,Uncertain_significance,.,Vitamin_K-Dependent_Clotting_Factors
443,16,31096316,60671,A,AC,no_assertion_criteria_provided,drug_response,.,Warfarin_response


In [83]:
variants = []

for record in vcf_reader.fetch():
    try:
        gene_info = record.info.get("GENEINFO", [""])[0]
        if "ORC1" in gene_info:
            # Extract values safely, handling missing fields
                variant_info = {
                    "CHROM": record.chrom,  # Chromosome
                    "POS": record.pos,      # Position
                    "ID": record.id if record.id else ".",  # Variant ID
                    "REF": record.ref,      # Reference allele
                    "ALT": ",".join(map(str, record.alts)) if record.alts else ".",  # ALT alleles
                    "CLNREVSTAT": record.info.get("CLNREVSTAT", ["."])[0],  # Review status
                    "CLNSIG": record.info.get("CLNSIG", ["."])[0],  # Clinical significance
                    "RESULT": record.info.get("MC", ["."])[0],  # Molecular consequence
                    "CLNDN": record.info.get("CLNDN", ["."])[0],  # Disease name
                }
                variants.append(variant_info)
    except Exception as e:
        print(f"Error processing record at {record.chrom}:{record.pos} - {e}")

# Convert to DataFrame
df = pd.DataFrame(variants)

[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '2' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '3' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '4' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '5' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '6' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '7' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '8' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '9' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '10' is not defined in the header. (Quick workaroun

In [84]:
df.head()

In [85]:
df.shape

(0, 0)

In [35]:
def split_row(row):
    try:
        return row.split("|")[1]
    except:
        return row

In [36]:
df["RESULT"] = df["RESULT"].apply(lambda x: split_row(x))

In [37]:
df

Unnamed: 0,CHROM,POS,ID,REF,ALT,CLNREVSTAT,CLNSIG,RESULT,CLNDN
0,1,52369375,1173067,TGTACATCTCCCATATTGCAACACCTCCCTTTCCCAAAGATGTACC...,T,criteria_provided,Pathogenic,,Meier-Gorlin_syndrome_1
1,1,52372950,297574,G,A,criteria_provided,Benign,3_prime_UTR_variant,Meier-Gorlin_syndrome_1|not_provided
2,1,52372955,297575,C,G,criteria_provided,Uncertain_significance,3_prime_UTR_variant,Meier-Gorlin_syndrome_1
3,1,52373087,297576,G,A,criteria_provided,Uncertain_significance,3_prime_UTR_variant,Meier-Gorlin_syndrome_1
4,1,52373187,703217,G,A,criteria_provided,Conflicting_classifications_of_pathogenicity,synonymous_variant,Meier-Gorlin_syndrome_1|not_provided|ORC1-rela...
...,...,...,...,...,...,...,...,...,...
440,16,31094731,884454,A,G,criteria_provided,Uncertain_significance,5_prime_UTR_variant,Vitamin_K-dependent_clotting_factors
441,16,31094774,884455,C,T,criteria_provided,Uncertain_significance,5_prime_UTR_variant,Vitamin_K-dependent_clotting_factors
442,16,31094889,318978,C,G,criteria_provided,Uncertain_significance,,Vitamin_K-Dependent_Clotting_Factors
443,16,31096316,60671,A,AC,no_assertion_criteria_provided,drug_response,,Warfarin_response
