# Extract data from the ClinVar Full Release XML file

Pipeline:
1. Download the list of ClinVar XML releases and parse this list
2. Download the latest ClinVar compressed XML file
3. Check your compressed XML file
4. Extract the data for one particular gene and save it to a CSV file

In [2]:
# Download the list of Clinvar XML releases
import requests

clinvar_xml_url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/"
# The URL above is the directory index (an HTML page listing the releases),
# not a release archive, so the saved file gets a name matching its content.
output_file = "clinvar_xml_index.html"

# Stream download. The timeout keeps the cell from hanging on a stalled
# connection, and the context manager releases the connection when done.
with requests.get(clinvar_xml_url, stream=True, timeout=60) as response:
    if response.status_code == 200:
        with open(output_file, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print("Download complete.")
    else:
        print(f"Download failed! HTTP status code: {response.status_code}")

Download complete.


In [3]:
# Parse the HTML Page to Extract File Links
import requests
from bs4 import BeautifulSoup

# URL of the ClinVar XML directory
url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/"

# Fetch the HTML page. Raise on an HTTP error so an error page is never
# parsed as if it were the directory listing (which would yield no links).
response = requests.get(url, timeout=60)
response.raise_for_status()
html_content = response.text

# Parse the HTML content
soup = BeautifulSoup(html_content, "html.parser")

# Collect absolute URLs of every anchor whose href ends in ".xml.gz"
file_links = [
    url + href
    for href in (link.get("href") for link in soup.find_all("a"))
    if href and href.endswith(".xml.gz")
]

# Display extracted file links
for file in file_links:
    print(file)

https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2023-10.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2024-02.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2024-04.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_00-latest.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-02.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-03.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-04.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-05.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-06.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-07.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-08.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-09.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRele

In [4]:
# Download the Latest ClinVar XML File
import os

# Choose the latest file (modify as needed)
# NOTE(review): the URL points at a ClinVarVCVRelease archive while the local
# name says ClinVarFullRelease — confirm which release is intended.
latest_file_url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2025-01.xml.gz"
output_file = "ClinVarFullRelease_2025-01.xml.gz"
# Write to a temporary name first so an interrupted download never leaves a
# truncated archive under the final file name.
partial_file = output_file + ".part"

# Download the file (streamed; the timeout guards against a stalled connection)
with requests.get(latest_file_url, stream=True, timeout=60) as response:
    if response.status_code == 200:
        with open(partial_file, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        # Only a fully written file is moved to its final name
        os.replace(partial_file, output_file)
        print(f"Downloaded {output_file}")
    else:
        print(f"Failed to download file. HTTP Status Code: {response.status_code}")


KeyboardInterrupt: 

In [1]:
# Check that the XML file is correct
import os

file_path = "./data/ClinVarFullRelease_2025-01.xml.gz"

# Report the on-disk size
size_in_bytes = os.path.getsize(file_path)
print(f"File size: {size_in_bytes} bytes")

# Read only the first two bytes and compare them with the GZIP signature.
# This checks the header only; it does not decompress or validate the content.
with open(file_path, "rb") as f:
    signature = f.read(2)

is_gzip = signature == b'\x1f\x8b'
print(
    "The file is correctly recognized as a GZIP file."
    if is_gzip
    else "The file is NOT a GZIP file. The download may be corrupted."
)

File size: 4154767604 bytes
The file is correctly recognized as a GZIP file.


## Extract all data from the ClinVar XML file (don't run this :)

In [18]:
# import xml.etree.ElementTree as ET
# import gzip
# import pandas as pd

# # Define XML file path
# xml_file = "ClinVarFullRelease_2025-01.xml.gz"

# # Function to extract Gene Symbol, VariationID, Name, and PubMed ID
# def extract_clinvar_data(xml_file, max_records=None):
#     extracted_data = []

#     with gzip.open(xml_file, "rt", encoding="utf-8") as f:
#         context = ET.iterparse(f, events=("start", "end"))
#         _, root = next(context)  # Get root element

#         record_count = 0
#         for event, elem in context:
#             if event == "end" and elem.tag == "VariationArchive":
#                 variation_id = elem.attrib.get("VariationID", "N/A")
#                 variation_name = elem.attrib.get("VariationName", "N/A")

#                 # Extract Gene Symbol
#                 gene_elem = elem.find(".//Gene")
#                 gene_symbol = gene_elem.attrib.get("Symbol", "N/A") if gene_elem is not None else "N/A"

#                 # Extract PubMed ID
#                 citation_elem = elem.find(".//Citation/ID[@Source='PubMed']")
#                 pubmed_id = citation_elem.text if citation_elem is not None else "N/A"

#                 # Append data to list
#                 extracted_data.append({
#                     "VariationID": variation_id,
#                     "GeneSymbol": gene_symbol,
#                     "Name": variation_name,
#                     "PubMedID": pubmed_id
#                 })

#                 record_count += 1
#                 if max_records and record_count >= max_records:
#                     break  # Stop after max_records for testing

#                 # Free memory
#                 elem.clear()

#     return extracted_data

# # Extract first 10 records for testing
# clinvar_data = extract_clinvar_data(xml_file, max_records=10)

# # Convert results to DataFrame
# df = pd.DataFrame(clinvar_data)

# # Save to CSV
# csv_filename = "ClinVar_Gene_Variants_2025-01.csv"
# df.to_csv(csv_filename, index=False)

## Extract data for particular gene

In [19]:
import xml.etree.ElementTree as ET
import gzip
import pandas as pd

# Define XML file path (gzip-compressed; the extractor in this cell opens it with gzip.open)
# NOTE(review): the gzip check cell reads "./data/ClinVarFullRelease_2025-01.xml.gz" —
# confirm which location is the current one.
xml_file = "ClinVarFullRelease_2025-01.xml.gz"
# Gene symbol to keep; extract_orc1_data reads this as a global
name_of_gene = "ORC1"

# Function to extract ORC1 Gene Data
def extract_orc1_data(xml_file, max_records=None, gene=None):
    """Stream a gzip-compressed ClinVar XML file and collect variants of one gene.

    Each ``VariationArchive`` element is inspected once it is fully parsed.
    A record is kept when the ``Symbol`` attribute of the first ``Gene``
    element found inside it equals the target gene.

    Args:
        xml_file: Path to the gzip-compressed XML file.
        max_records: Stop after this many matching records. ``None`` or ``0``
            means no limit.
        gene: Gene symbol to keep. When ``None`` (the default) the
            module-level ``name_of_gene`` is used.

    Returns:
        A list of dicts with the keys ``VariationID``, ``GeneSymbol``,
        ``Name`` and ``PubMedID``. Missing values are the string ``"N/A"``.
        ``PubMedID`` holds only the first PubMed citation found in the record.
    """
    target_gene = name_of_gene if gene is None else gene
    extracted_data = []

    with gzip.open(xml_file, "rt", encoding="utf-8") as f:
        context = ET.iterparse(f, events=("start", "end"))
        _, root = next(context)  # The first event is the start of the root element

        for event, elem in context:
            if event != "end" or elem.tag != "VariationArchive":
                continue

            # Only the first Gene element in document order is considered.
            # NOTE(review): records listing several genes match only when the
            # target gene comes first — confirm this is the intended filter.
            gene_elem = elem.find(".//Gene")
            gene_symbol = gene_elem.attrib.get("Symbol", "N/A") if gene_elem is not None else "N/A"

            if gene_symbol == target_gene:
                citation_elem = elem.find(".//Citation/ID[@Source='PubMed']")
                extracted_data.append({
                    "VariationID": elem.attrib.get("VariationID", "N/A"),
                    "GeneSymbol": gene_symbol,
                    "Name": elem.attrib.get("VariationName", "N/A"),
                    "PubMedID": citation_elem.text if citation_elem is not None else "N/A"
                })

            # Free memory: empty the finished record AND detach it from the
            # root. Without root.clear() every processed element stays
            # referenced by the root and memory grows with the file size.
            elem.clear()
            root.clear()

            if max_records and len(extracted_data) >= max_records:
                break  # Stop after max_records for testing

    return extracted_data

# Destination CSV for the extracted records
csv_filename = "ClinVar_ORC1_Gene_Variants_2025-01.csv"

# Collect the matching records (at most 100000) and tabulate them
orc1_data = extract_orc1_data(xml_file, max_records=100000)
df = pd.DataFrame(orc1_data)

# Write the table without the DataFrame index
df.to_csv(path_or_buf=csv_filename, index=False)


## Example of one XML record

In [15]:
import xml.etree.ElementTree as ET
import gzip

# Define XML file (gzip-compressed; print_first_n_records opens it with gzip.open)
xml_file = "ClinVarFullRelease_2025-01.xml.gz"

# Define number of records to print (passed as N to print_first_n_records)
N = 1

# Stream process compressed XML file
def print_first_n_records(xml_file, N):
    """Print the first N record elements of a gzip-compressed ClinVar XML file.

    The file is parsed incrementally. Each completed ``ClinVarAssertion`` or
    ``VariationArchive`` element is serialized with ``ET.tostring`` and
    printed, until N records have been shown or the file ends.

    Args:
        xml_file: Path to the gzip-compressed XML file.
        N: Number of records to print. A value of 0 or less prints nothing.
    """
    if N <= 0:
        return  # Nothing requested; avoid opening the file at all

    with gzip.open(xml_file, "rt", encoding="utf-8") as f:
        context = ET.iterparse(f, events=("start", "end"))
        _, root = next(context)  # The first event is the start of the root element

        record_count = 0
        for event, elem in context:
            # Adjust tag names as needed based on your XML structure
            if event == "end" and elem.tag in ("ClinVarAssertion", "VariationArchive"):
                print(ET.tostring(elem, encoding="unicode"))
                record_count += 1
                # Free memory: empty the record and drop the root's reference to it
                elem.clear()
                root.clear()
                if record_count >= N:
                    break

# Show the first N records of the compressed XML file
print_first_n_records(xml_file=xml_file, N=N)


<VariationArchive RecordType="classified" VariationID="2" VariationName="NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTAACTGTAAA (p.Arg27_Ile28delinsLeuLeuTer)" VariationType="Indel" Accession="VCV000000002" Version="3" NumberOfSubmissions="2" NumberOfSubmitters="2" DateLastUpdated="2022-04-25" DateCreated="2017-01-30" MostRecentSubmission="2021-05-16">
  <RecordStatus>current</RecordStatus>
  <Species>Homo sapiens</Species>
  <ClassifiedRecord>
    <SimpleAllele AlleleID="15041" VariationID="2">
      <GeneList>
        <Gene Symbol="AP5Z1" FullName="adaptor related protein complex 5 subunit zeta 1" GeneID="9907" HGNC_ID="HGNC:22197" Source="submitted" RelationshipType="within single gene">
          <Location>
            <CytogeneticLocation>7p22.1</CytogeneticLocation>
            <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="7" Accession="NC_000007.14" start="4775623" stop="4794397" display_start="4775623" display_stop

Top Variants with Strong Negative Effects - These mutations significantly reduce phase separation and RNA-binding ability:

A.174.V 0.64 [ 0.56, 0.72 ] - no citation
R.396.W 0.66 [ 0.52, 0.8 ] - no citation
A.372.V 0.76 [ 0.84, 0.68 ] - 22689986
G.399.D 0.76 [ 0.92, 0.6 ] - no citation
R.549.C 0.98 [ 0.96, 1 ] - no citation

Top Variants with Strong Positive Effects - These mutations enhance phase separation and RNA-binding ability:

E.73.K 0.764706  [ 0.882353, 0.647059 ] - no citation
L.617.P 0.823529 [ 0.941176, 0.705882 ] - no info
C.186.R 0.882352 [ 0.823529, 0.941176 ] - no citation
L.622.P 0.882353 [ 1, 0.764706 ] - no citation

In [2]:
import pandas as pd

# CSV produced by the ORC1 extraction step
# NOTE(review): the extraction cell writes "ClinVar_ORC1_Gene_Variants_2025-01.csv"
# without the "data/" prefix — confirm the file was moved into data/.
path_to_file = "data/ClinVar_ORC1_Gene_Variants_2025-01.csv"

# Load the table. PubMedID shows up as float with NaN for missing values in the
# output below — presumably because read_csv parses the "N/A" placeholders as
# missing (TODO confirm).
df = pd.read_csv(path_to_file)

In [4]:
# Display the loaded ORC1 variant table
df

Unnamed: 0,VariationID,GeneSymbol,Name,PubMedID
0,499854,ORC1,NM_004153.4(ORC1):c.721_721+1del,
1,129861,ORC1,NM_004153.4(ORC1):c.2518G>A (p.Asp840Asn),18414213.0
2,211802,ORC1,NM_004153.4(ORC1):c.703A>G (p.Lys235Glu),25741868.0
3,129853,ORC1,NM_004153.4(ORC1):c.308T>C (p.Leu103Ser),18414213.0
4,211796,ORC1,NM_004153.4(ORC1):c.1083-4G>C,25741868.0
...,...,...,...,...
328,1675366,ORC1,NM_004153.4(ORC1):c.669A>G (p.Gln223=),28492532.0
329,2638815,ORC1,NM_004153.4(ORC1):c.1005A>G (p.Thr335=),28492532.0
330,2638816,ORC1,NM_004153.4(ORC1):c.567C>T (p.Pro189=),
331,129858,ORC1,NM_004153.4(ORC1):c.2405A>G (p.His802Arg),18414213.0


In [5]:
# Distinct PubMed IDs as strings, in order of first appearance:
# skip missing values and keep only the part before the decimal point
PubMedID_unique = list(
    dict.fromkeys(str(pmid).split('.')[0] for pmid in df["PubMedID"].dropna())
)
PubMedID_unique

['18414213',
 '25741868',
 '22947299',
 '21358633',
 '21358632',
 '21358631',
 '28492532',
 '22689986',
 '16199547',
 '17576681',
 '11477602']

Analysis of publications. DOIs

18414213 - NO
25741868 - NO
22947299 - NO
21358633 - 10.1038/ng.776
21358632 - 10.1038/ng.775
21358631 - 10.1038/ng.777
28492532 - NO
22689986 - 10.1073/pnas.1202249109 (CDC45)
16199547 - 10.1136/jmg.2004.029538 (Splicing)
17576681 - 10.1093/nar/gkm402 (Splicing)
11477602 - 10.1002/ajmg.1452



In [10]:
# Mutations with annotation: keep rows whose PubMedID is one of the selected publications
annotated_pubmed_ids = [21358633.0, 21358632.0, 21358631.0, 11477602.0, 22689986.0]
sb = df[df["PubMedID"].isin(annotated_pubmed_ids)]

# Save to CSV
csv_filename = "./data/ClinVar_ORC1_Gene_Variants_2025-01_selected.csv"
sb.to_csv(csv_filename, index=False)

In [11]:
# Display the selected (annotated) variants
sb

Unnamed: 0,VariationID,GeneSymbol,Name,PubMedID
12,30230,ORC1,NM_004153.4(ORC1):c.380A>G (p.Glu127Gly),21358633.0
13,30234,ORC1,NM_004153.4(ORC1):c.1999_2000delinsA (p.Val667fs),21358632.0
14,30235,ORC1,NM_004153.4(ORC1):c.1482-2A>G,21358632.0
78,30231,ORC1,NM_004153.4(ORC1):c.266T>C (p.Phe89Ser),21358633.0
79,30233,ORC1,NM_004153.4(ORC1):c.2159G>A (p.Arg720Gln),21358633.0
80,30236,ORC1,NM_004153.4(ORC1):c.1996C>T (p.Arg666Trp),21358631.0
82,129854,ORC1,NM_004153.4(ORC1):c.1115C>T (p.Ala372Val),22689986.0
102,632110,ORC1,NM_004153.4(ORC1):c.2221_2224dup (p.Ser742Ter),21358633.0
139,1069550,ORC1,NM_004153.4(ORC1):c.2231del (p.Gly744fs),21358633.0
140,1075855,ORC1,NM_004153.4(ORC1):c.1330C>T (p.Arg444Ter),21358633.0


In [None]:
# # Define function to process 'Name' column
# def extract_second_word(name):
#     if not isinstance(name, str):  # Handle NaN or non-string values
#         return name
#     words = name.split(' ')
#     if len(words) > 1:
#         return words[1][1:-1]
#     return name

# # List of mutations
# mutations_annotated = sb['Name'].apply(lambda x: extract_second_word(x)).to_list()
# mutations_annotated