In [1]:
from clinvar_functions import *
import pandas as pd
import matplotlib.pyplot as plt
import pysam

In [82]:
# Path to ClinVar VCF file
vcf_file = "./data/clinvar.vcf"

# Open the VCF file
vcf_reader = pysam.VariantFile(vcf_file)

Fields: ['ALLELEID',
 'CLNDISDB',
 'CLNDN',
 'CLNHGVS',
 'CLNREVSTAT',
 'CLNSIG',
 'CLNSIGSCV',
 'CLNVC',
 'CLNVCSO',
 'CLNVI',
 'GENEINFO',
 'ORIGIN']

In [122]:
# Extract information from first 10 variants
variants = []
for record in vcf_reader.fetch():
    gene_info = record.info.get("GENEINFO", [""])
    if "ORC1" in gene_info:
        try:
            variant_info = {
                "CHROM": record.chrom,  # Chromosome
                "POS": record.pos,      # Position
                "ID": record.id,        # Variant ID (e.g., rs number)
                "GENEINFO":gene_info,   # Gene information
                "REF": record.ref,      # Reference allele
                "ALT": ",".join(str(alt) for alt in record.alts),  # Alternate allele(s)
                "CLNREVSTAT": record.info.get("CLNREVSTAT"),  # Review status
                "CLNSIG": record.info.get("CLNSIG"),  # Clinical significance
                "RESULT": record.info.get("MC"),  # Molecular consequence
                "CLNDN": record.info.get("CLNDN")  # Disease name
            }
            variants.append(variant_info)
        except Exception as e:
            print(f"Error processing record at {record.chrom}:{record.pos}:{gene_info} - {e}")
    

# Convert to DataFrame
df = pd.DataFrame(variants)
print(df.shape)

Error processing record at 16:31092475:VKORC1:79001 - 'NoneType' object is not iterable
Error processing record at 16:31096368:VKORC1:79001 - 'NoneType' object is not iterable
(445, 10)


In [125]:
# List of genes after filtration
set(df['GENEINFO'])

{'MORC1:27136',
 'ORC1:4998',
 'ORC1:4998|LOC126805733:126805733',
 'ORC1:4998|PRPF38A:84950',
 'VKORC1:79001',
 'VKORC1:79001|PRSS53:339105',
 'VKORC1L1:154807'}

In [143]:
# Filterded by gene name
df = df[(df['GENEINFO']=='ORC1:4998|PRPF38A:84950') | (df['GENEINFO']=='ORC1:4998') | (df['GENEINFO']=='ORC1:4998|LOC126805733:126805733')]
print(df.shape)

(339, 10)


In [178]:
# Functions for data cleaning
def convert_to_str(x):
    if x is not None:
        return x[0]
    return 'not_specified'

def second_element(x):
    li = list(x)
    if len(li) == 2:
        return li[1][1:]
    return 'not_specified'

def split_row(row):
    try:
        return row.split("|")[1]
    except:
        return row

In [None]:
df["CLNSIG"] = df["CLNSIG"].apply(lambda x: convert_to_str(x))
df["RESULT"] = df["RESULT"].apply(lambda x: convert_to_str(x))
df["CLNDN"] = df["CLNDN"].apply(lambda x: convert_to_str(x))

df["CRITERIA"] = df["CLNREVSTAT"].apply(lambda x: list(x)[0])
df["SUBMITTION"] = df["CLNREVSTAT"].apply(lambda x: second_element(x))
df["RESULT"] = df["RESULT"].apply(lambda x: split_row(x))

#df = df.drop(columns=['CLNREVSTAT'])

In [183]:
df

Unnamed: 0,CHROM,POS,ID,GENEINFO,REF,ALT,CLNSIG,RESULT,CLNDN,CRITERIA,SUBMITTION
0,1,52369375,1173067,ORC1:4998,TGTACATCTCCCATATTGCAACACCTCCCTTTCCCAAAGATGTACC...,T,P,n,M,criteria_provided,single_submitter
1,1,52372950,297574,ORC1:4998,G,A,B,S,M,criteria_provided,not_specified
2,1,52372955,297575,ORC1:4998,C,G,U,S,M,criteria_provided,single_submitter
3,1,52373087,297576,ORC1:4998,G,A,U,S,M,criteria_provided,single_submitter
4,1,52373187,703217,ORC1:4998,G,A,C,S,M,criteria_provided,conflicting_classifications
...,...,...,...,...,...,...,...,...,...,...,...
334,1,52402240,875878,ORC1:4998,T,A,U,S,M,criteria_provided,single_submitter
335,1,52404418,875879,ORC1:4998|LOC126805733:126805733,G,A,U,S,M,criteria_provided,single_submitter
336,1,52404439,297590,ORC1:4998|LOC126805733:126805733,G,C,U,S,M,criteria_provided,single_submitter
337,1,52408585,3425738,ORC1:4998|PRPF38A:84950,G,C,U,S,n,criteria_provided,single_submitter


In [37]:
df

Unnamed: 0,CHROM,POS,ID,REF,ALT,CLNREVSTAT,CLNSIG,RESULT,CLNDN
0,1,52369375,1173067,TGTACATCTCCCATATTGCAACACCTCCCTTTCCCAAAGATGTACC...,T,criteria_provided,Pathogenic,,Meier-Gorlin_syndrome_1
1,1,52372950,297574,G,A,criteria_provided,Benign,3_prime_UTR_variant,Meier-Gorlin_syndrome_1|not_provided
2,1,52372955,297575,C,G,criteria_provided,Uncertain_significance,3_prime_UTR_variant,Meier-Gorlin_syndrome_1
3,1,52373087,297576,G,A,criteria_provided,Uncertain_significance,3_prime_UTR_variant,Meier-Gorlin_syndrome_1
4,1,52373187,703217,G,A,criteria_provided,Conflicting_classifications_of_pathogenicity,synonymous_variant,Meier-Gorlin_syndrome_1|not_provided|ORC1-rela...
...,...,...,...,...,...,...,...,...,...
440,16,31094731,884454,A,G,criteria_provided,Uncertain_significance,5_prime_UTR_variant,Vitamin_K-dependent_clotting_factors
441,16,31094774,884455,C,T,criteria_provided,Uncertain_significance,5_prime_UTR_variant,Vitamin_K-dependent_clotting_factors
442,16,31094889,318978,C,G,criteria_provided,Uncertain_significance,,Vitamin_K-Dependent_Clotting_Factors
443,16,31096316,60671,A,AC,no_assertion_criteria_provided,drug_response,,Warfarin_response
