In [1]:
from clinvar_functions import *
import pandas as pd
import matplotlib.pyplot as plt
import pysam

In [2]:
# Path to ClinVar VCF file
vcf_file = "./data/clinvar.vcf"

# Open the VCF file
vcf_reader = pysam.VariantFile(vcf_file)

Fields: ['ALLELEID',
 'CLNDISDB',
 'CLNDN',
 'CLNHGVS',
 'CLNREVSTAT',
 'CLNSIG',
 'CLNSIGSCV',
 'CLNVC',
 'CLNVCSO',
 'CLNVI',
 'GENEINFO',
 'ORIGIN']

In [3]:
# Extract information from first 10 variants
variants = []
for record in vcf_reader.fetch():
    gene_info = record.info.get("GENEINFO", [""])
    if "ORC1" in gene_info:
        try:
            variant_info = {
                "CHROM": record.chrom,  # Chromosome
                "POS": record.pos,      # Position
                "ID": record.id,        # Variant ID (e.g., rs number)
                "GENEINFO":gene_info,   # Gene information
                "REF": record.ref,      # Reference allele
                "ALT": ",".join(str(alt) for alt in record.alts),  # Alternate allele(s)
                "CLNREVSTAT": record.info.get("CLNREVSTAT"),  # Review status
                "CLNSIG": record.info.get("CLNSIG"),  # Clinical significance
                "RESULT": record.info.get("MC"),  # Molecular consequence
                "CLNDN": record.info.get("CLNDN")  # Disease name
            }
            variants.append(variant_info)
        except Exception as e:
            print(f"Error processing record at {record.chrom}:{record.pos}:{gene_info} - {e}")
    

# Convert to DataFrame
df = pd.DataFrame(variants)
print(df.shape)

[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '2' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '3' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '4' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '5' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '6' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '7' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '8' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '9' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '10' is not defined in the header. (Quick workaroun

Error processing record at 16:31092475:VKORC1:79001 - 'NoneType' object is not iterable
Error processing record at 16:31096368:VKORC1:79001 - 'NoneType' object is not iterable


[W::vcf_parse] Contig '17' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '18' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '19' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '20' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '21' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '22' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'X' is not defined in the header. (Quick workaround: index the file with tabix.)


(445, 10)


[W::vcf_parse] Contig 'Y' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'MT' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'NT_113889.1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'NT_187633.1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'NT_187661.1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'NT_187693.1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'NW_009646201.1' is not defined in the header. (Quick workaround: index the file with tabix.)


In [4]:
# List of genes after filtration
set(df['GENEINFO'])

{'MORC1:27136',
 'ORC1:4998',
 'ORC1:4998|LOC126805733:126805733',
 'ORC1:4998|PRPF38A:84950',
 'VKORC1:79001',
 'VKORC1:79001|PRSS53:339105',
 'VKORC1L1:154807'}

In [5]:
# Filterded by gene name
df = df[(df['GENEINFO']=='ORC1:4998|PRPF38A:84950') | (df['GENEINFO']=='ORC1:4998') | (df['GENEINFO']=='ORC1:4998|LOC126805733:126805733')]
print(df.shape)

(339, 10)


Delete:
"no_classification_for_the_single_variant" (2)
no_assertion_criteria_provided (5) with 3 Meier-Gorlin_syndrome_1

conflicting_classifications: if there is Meier-Gorlin_syndrome_1: put Meier-Gorlin_syndrome_1 probably. Else - delete

In [6]:
# Change columns
df["CLNSIG"] = df["CLNSIG"].apply(lambda x: convert_to_str(x))
df["RESULT"] = df["RESULT"].apply(lambda x: convert_to_str(x))
df["CLNDN"] = df["CLNDN"].apply(lambda x: convert_to_str(x))

df["CRITERIA"] = df["CLNREVSTAT"].apply(lambda x: list(x)[0])
df["SUBMITTION"] = df["CLNREVSTAT"].apply(lambda x: second_element(x))
df["RESULT"] = df["RESULT"].apply(lambda x: split_row(x))

df = df.drop(columns=['CLNREVSTAT'])

# Data cleaning
df = df[df["CLNSIG"] != "no_classification_for_the_single_variant"]
df = df[df["CLNSIG"] != "no_classification_for_the_single_variant"]
df = df[df["CRITERIA"] != "no_assertion_criteria_provided"]

# Create new column
df["CLNDN_NEW"] = df["CLNDN"].map(rename_condition_2, na_action='ignore')

# Statistics

In [7]:
df["CLNSIG"].value_counts()

CLNSIG
Uncertain_significance                          158
Likely_benign                                    97
Benign                                           24
Conflicting_classifications_of_pathogenicity     22
Pathogenic                                       13
Benign/Likely_benign                              9
Likely_pathogenic                                 8
Pathogenic/Likely_pathogenic                      1
Name: count, dtype: int64

In [8]:
df["CRITERIA"].value_counts()

CRITERIA
criteria_provided    332
Name: count, dtype: int64

In [9]:
df["SUBMITTION"].value_counts()

SUBMITTION
single_submitter               248
not_specified                   62
conflicting_classifications     22
Name: count, dtype: int64

In [10]:
df["RESULT"].value_counts()

RESULT
missense_variant                     174
synonymous_variant                    77
intron_variant                        56
frameshift_variant                     7
splice_acceptor_variant                5
nonsense                               4
3_prime_UTR_variant                    3
splice_donor_variant                   2
not_specified                          1
inframe_deletion                       1
5_prime_UTR_variant                    1
genic_upstream_transcript_variant      1
Name: count, dtype: int64

In [11]:
df["CLNDN_NEW"].value_counts()

CLNDN_NEW
not_provided                        200
Meier-Gorlin_syndrome_1_probably     46
Inborn_genetic_diseases              33
Meier-Gorlin_syndrome_1              28
Inborn_genetic_diseases_probably     20
ORC1-related_disorder_probably        5
Name: count, dtype: int64

In [20]:
df.groupby(['CLNDN_NEW','CLNSIG']).count()['RESULT']

CLNDN_NEW                         CLNSIG                                      
Inborn_genetic_diseases           Likely_benign                                    3
                                  Likely_pathogenic                                1
                                  Uncertain_significance                          29
Inborn_genetic_diseases_probably  Conflicting_classifications_of_pathogenicity     5
                                  Likely_pathogenic                                1
                                  Uncertain_significance                          14
Meier-Gorlin_syndrome_1           Likely_pathogenic                                2
                                  Pathogenic                                       5
                                  Uncertain_significance                          21
Meier-Gorlin_syndrome_1_probably  Benign                                           9
                                  Benign/Likely_benign                 

In [28]:
df_filtered = df[(df["CLNSIG"]=='Pathogenic') | (df["CLNSIG"]=='Likely_pathogenic')| (df["CLNSIG"]=='Pathogenic/Likely_pathogenic')]
df_filtered.shape

(22, 12)

In [30]:
df_filtered['RESULT'].value_counts()

RESULT
frameshift_variant         7
splice_acceptor_variant    5
missense_variant           5
nonsense                   3
not_specified              1
splice_donor_variant       1
Name: count, dtype: int64