In [1]:
from clinvar_functions import *
import pandas as pd
import matplotlib.pyplot as plt
import pysam

In [184]:
# Path to ClinVar VCF file
vcf_file = "./data/clinvar.vcf"

# Open the VCF file
vcf_reader = pysam.VariantFile(vcf_file)

Fields: ['ALLELEID',
 'CLNDISDB',
 'CLNDN',
 'CLNHGVS',
 'CLNREVSTAT',
 'CLNSIG',
 'CLNSIGSCV',
 'CLNVC',
 'CLNVCSO',
 'CLNVI',
 'GENEINFO',
 'ORIGIN']

In [185]:
# Extract information from first 10 variants
variants = []
for record in vcf_reader.fetch():
    gene_info = record.info.get("GENEINFO", [""])
    if "ORC1" in gene_info:
        try:
            variant_info = {
                "CHROM": record.chrom,  # Chromosome
                "POS": record.pos,      # Position
                "ID": record.id,        # Variant ID (e.g., rs number)
                "GENEINFO":gene_info,   # Gene information
                "REF": record.ref,      # Reference allele
                "ALT": ",".join(str(alt) for alt in record.alts),  # Alternate allele(s)
                "CLNREVSTAT": record.info.get("CLNREVSTAT"),  # Review status
                "CLNSIG": record.info.get("CLNSIG"),  # Clinical significance
                "RESULT": record.info.get("MC"),  # Molecular consequence
                "CLNDN": record.info.get("CLNDN")  # Disease name
            }
            variants.append(variant_info)
        except Exception as e:
            print(f"Error processing record at {record.chrom}:{record.pos}:{gene_info} - {e}")
    

# Convert to DataFrame
df = pd.DataFrame(variants)
print(df.shape)

[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '2' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '3' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '4' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '5' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '6' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '7' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '8' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '9' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '10' is not defined in the header. (Quick workaroun

Error processing record at 16:31092475:VKORC1:79001 - 'NoneType' object is not iterable
Error processing record at 16:31096368:VKORC1:79001 - 'NoneType' object is not iterable


[W::vcf_parse] Contig '17' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '18' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '19' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '20' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '21' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '22' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'X' is not defined in the header. (Quick workaround: index the file with tabix.)


(445, 10)


[W::vcf_parse] Contig 'Y' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'MT' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'NT_113889.1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'NT_187633.1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'NT_187661.1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'NT_187693.1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'NW_009646201.1' is not defined in the header. (Quick workaround: index the file with tabix.)


In [186]:
# List of genes after filtration
set(df['GENEINFO'])

{'MORC1:27136',
 'ORC1:4998',
 'ORC1:4998|LOC126805733:126805733',
 'ORC1:4998|PRPF38A:84950',
 'VKORC1:79001',
 'VKORC1:79001|PRSS53:339105',
 'VKORC1L1:154807'}

In [187]:
# Filterded by gene name
df = df[(df['GENEINFO']=='ORC1:4998|PRPF38A:84950') | (df['GENEINFO']=='ORC1:4998') | (df['GENEINFO']=='ORC1:4998|LOC126805733:126805733')]
print(df.shape)

(339, 10)


In [188]:
# Functions for data cleaning
def convert_to_str(x):
    if x is not None:
        return x[0]
    return 'not_specified'

def second_element(x):
    li = list(x)
    if len(li) == 2:
        return li[1][1:]
    return 'not_specified'

def split_row(row):
    try:
        return row.split("|")[1]
    except:
        return row

In [189]:
df["CLNSIG"] = df["CLNSIG"].apply(lambda x: convert_to_str(x))
df["RESULT"] = df["RESULT"].apply(lambda x: convert_to_str(x))
df["CLNDN"] = df["CLNDN"].apply(lambda x: convert_to_str(x))

df["CRITERIA"] = df["CLNREVSTAT"].apply(lambda x: list(x)[0])
df["SUBMITTION"] = df["CLNREVSTAT"].apply(lambda x: second_element(x))
df["RESULT"] = df["RESULT"].apply(lambda x: split_row(x))

df = df.drop(columns=['CLNREVSTAT'])

In [193]:
df

Unnamed: 0,CHROM,POS,ID,GENEINFO,REF,ALT,CLNSIG,RESULT,CLNDN,CRITERIA,SUBMITTION
0,1,52369375,1173067,ORC1:4998,TGTACATCTCCCATATTGCAACACCTCCCTTTCCCAAAGATGTACC...,T,Pathogenic,not_specified,Meier-Gorlin_syndrome_1,criteria_provided,single_submitter
1,1,52372950,297574,ORC1:4998,G,A,Benign,3_prime_UTR_variant,Meier-Gorlin_syndrome_1|not_provided,criteria_provided,not_specified
2,1,52372955,297575,ORC1:4998,C,G,Uncertain_significance,3_prime_UTR_variant,Meier-Gorlin_syndrome_1,criteria_provided,single_submitter
3,1,52373087,297576,ORC1:4998,G,A,Uncertain_significance,3_prime_UTR_variant,Meier-Gorlin_syndrome_1,criteria_provided,single_submitter
4,1,52373187,703217,ORC1:4998,G,A,Conflicting_classifications_of_pathogenicity,synonymous_variant,Meier-Gorlin_syndrome_1|not_provided|ORC1-rela...,criteria_provided,conflicting_classifications
...,...,...,...,...,...,...,...,...,...,...,...
334,1,52402240,875878,ORC1:4998,T,A,Uncertain_significance,intron_variant,Meier-Gorlin_syndrome_1,criteria_provided,single_submitter
335,1,52404418,875879,ORC1:4998|LOC126805733:126805733,G,A,Uncertain_significance,5_prime_UTR_variant,Meier-Gorlin_syndrome_1,criteria_provided,single_submitter
336,1,52404439,297590,ORC1:4998|LOC126805733:126805733,G,C,Uncertain_significance,genic_upstream_transcript_variant,Meier-Gorlin_syndrome,criteria_provided,single_submitter
337,1,52408585,3425738,ORC1:4998|PRPF38A:84950,G,C,Uncertain_significance,missense_variant,not_specified,criteria_provided,single_submitter


# Statistics and data cleaning

In [208]:
df["CLNSIG"].value_counts()

CLNSIG
Uncertain_significance                          159
Likely_benign                                    97
Benign                                           24
Conflicting_classifications_of_pathogenicity     22
Pathogenic                                       16
Likely_pathogenic                                 9
Benign/Likely_benign                              9
no_classification_for_the_single_variant          2
Pathogenic/Likely_pathogenic                      1
Name: count, dtype: int64

In [205]:
df["CRITERIA"].value_counts()

CRITERIA
criteria_provided                           332
no_assertion_criteria_provided                5
no_classification_for_the_single_variant      2
Name: count, dtype: int64

In [206]:
df["SUBMITTION"].value_counts()

SUBMITTION
single_submitter               248
not_specified                   69
conflicting_classifications     22
Name: count, dtype: int64

In [207]:
df["RESULT"].value_counts()

RESULT
missense_variant                     177
synonymous_variant                    77
intron_variant                        56
frameshift_variant                    10
splice_acceptor_variant                6
nonsense                               4
3_prime_UTR_variant                    3
splice_donor_variant                   2
not_specified                          1
inframe_deletion                       1
5_prime_UTR_variant                    1
genic_upstream_transcript_variant      1
Name: count, dtype: int64

Delete:
"no_classification_for_the_single_variant" (2)
no_assertion_criteria_provided (5) with 3 Meier-Gorlin_syndrome_1

conflicting_classifications: if there is Meier-Gorlin_syndrome_1: put Meier-Gorlin_syndrome_1 probably. Else - delete

In [221]:
# Data cleaning
df = df[df["CLNSIG"] != "no_classification_for_the_single_variant"]
df = df[df["CLNSIG"] != "no_classification_for_the_single_variant"]
df = df[df["CRITERIA"] != "no_assertion_criteria_provided"]

In [222]:
df.shape

(332, 11)