In [1]:
# Download the list of ClinVar XML releases
import requests

clinvar_xml_url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/"
# This URL is the directory index (an HTML listing of the releases), not a
# release archive, so it is saved under an .html name instead of .xml.gz.
output_file = "clinvar_xml_index.html"

# Stream download. The context manager releases the connection when the cell
# finishes, and the (connect, read) timeout keeps a stalled server from
# hanging the kernel indefinitely.
with requests.get(clinvar_xml_url, stream=True, timeout=(10, 60)) as response:
    if response.status_code == 200:
        with open(output_file, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print("Download complete.")
    else:
        print(f"Download failed! HTTP status code: {response.status_code}")


Download complete.


In [None]:
# Parse the HTML Page to Extract File Links
import requests
from bs4 import BeautifulSoup

# URL of the ClinVar XML directory
url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/"

# Fetch the HTML page. The timeout prevents an indefinite hang, and
# raise_for_status() stops the cell on an HTTP error instead of silently
# parsing an error page into an empty link list.
response = requests.get(url, timeout=(10, 60))
response.raise_for_status()
html_content = response.text

# Parse the HTML content
soup = BeautifulSoup(html_content, "html.parser")

# Collect every anchor whose target is a gzip-compressed XML file.
# Anchors without an href attribute yield None and are skipped.
file_links = [
    url + href
    for href in (anchor.get("href") for anchor in soup.find_all("a"))
    if href and href.endswith(".xml.gz")
]

# Display extracted file links
for file in file_links:
    print(file)

https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2023-10.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2024-02.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2024-04.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_00-latest.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-02.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-03.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-04.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-05.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-06.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-07.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-08.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-09.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRele

In [6]:
# Download the Latest ClinVar XML File
import os

# Choose the latest file (modify as needed)
latest_file_url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2025-01.xml.gz"
# NOTE(review): the URL points at a ClinVarVCVRelease archive while the local
# name says "FullRelease". The name is kept because other cells in this
# notebook open this exact path; confirm which release is intended.
output_file = "ClinVarFullRelease_2025-01.xml.gz"

# Download to a temporary ".part" file and rename only after the whole body
# has been written, so an interrupted transfer never leaves a truncated
# archive under the final name.
partial_file = output_file + ".part"

# The context manager closes the streamed connection; the (connect, read)
# timeout stops a stalled transfer from blocking forever. 1 MiB chunks keep
# the Python-level loop overhead low for a multi-gigabyte file.
with requests.get(latest_file_url, stream=True, timeout=(10, 120)) as response:
    if response.status_code == 200:
        with open(partial_file, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
        os.replace(partial_file, output_file)
        print(f"Downloaded {output_file}")
    else:
        print(f"Failed to download file. HTTP Status Code: {response.status_code}")


Downloaded ClinVarFullRelease_2025-01.xml.gz


In [7]:
# Check that the XML file is correct
import os

file_path = "ClinVarFullRelease_2025-01.xml.gz"

# Report the on-disk size so an obviously short download stands out
print(f"File size: {os.path.getsize(file_path)} bytes")

# Every gzip stream starts with the two magic bytes 0x1f 0x8b;
# read just those two bytes rather than the whole archive
with open(file_path, "rb") as f:
    signature = f.read(2)

if signature != b'\x1f\x8b':
    print("The file is NOT a GZIP file. The download may be corrupted.")
else:
    print("The file is correctly recognized as a GZIP file.")

File size: 4154767604 bytes
The file is correctly recognized as a GZIP file.


In [18]:
# import xml.etree.ElementTree as ET
# import gzip
# import pandas as pd

# # Define XML file path
# xml_file = "ClinVarFullRelease_2025-01.xml.gz"

# # Function to extract Gene Symbol, VariationID, Name, and PubMed ID
# def extract_clinvar_data(xml_file, max_records=None):
#     extracted_data = []

#     with gzip.open(xml_file, "rt", encoding="utf-8") as f:
#         context = ET.iterparse(f, events=("start", "end"))
#         _, root = next(context)  # Get root element

#         record_count = 0
#         for event, elem in context:
#             if event == "end" and elem.tag == "VariationArchive":
#                 variation_id = elem.attrib.get("VariationID", "N/A")
#                 variation_name = elem.attrib.get("VariationName", "N/A")

#                 # Extract Gene Symbol
#                 gene_elem = elem.find(".//Gene")
#                 gene_symbol = gene_elem.attrib.get("Symbol", "N/A") if gene_elem is not None else "N/A"

#                 # Extract PubMed ID
#                 citation_elem = elem.find(".//Citation/ID[@Source='PubMed']")
#                 pubmed_id = citation_elem.text if citation_elem is not None else "N/A"

#                 # Append data to list
#                 extracted_data.append({
#                     "VariationID": variation_id,
#                     "GeneSymbol": gene_symbol,
#                     "Name": variation_name,
#                     "PubMedID": pubmed_id
#                 })

#                 record_count += 1
#                 if max_records and record_count >= max_records:
#                     break  # Stop after max_records for testing

#                 # Free memory
#                 elem.clear()

#     return extracted_data

# # Extract first 10 records for testing (max_records=10)
# clinvar_data = extract_clinvar_data(xml_file, max_records=10)

# # Convert results to DataFrame
# df = pd.DataFrame(clinvar_data)

# # Save to CSV
# csv_filename = "ClinVar_Gene_Variants_2025-01.csv"
# df.to_csv(csv_filename, index=False)

In [15]:
import xml.etree.ElementTree as ET
import gzip

# How many records to print
N = 1

# Path of the gzip-compressed XML file to read
xml_file = "ClinVarFullRelease_2025-01.xml.gz"

# Stream process compressed XML file
def print_first_n_records(xml_file, N, record_tags=("ClinVarAssertion", "VariationArchive")):
    """Print the first N record elements of a gzip-compressed XML file.

    The file is parsed incrementally with ``ET.iterparse``; each element whose
    tag is in ``record_tags`` is serialised and printed as soon as its closing
    tag has been read, so the whole document is never held in memory.

    Args:
        xml_file: Path to the gzip-compressed XML file.
        N: Maximum number of records to print. Nothing is printed (and the
            file is not opened) when N is zero or negative.
        record_tags: Tag names that count as one record. Adjust to match the
            structure of the XML being inspected.

    Returns:
        None. Records are written to standard output.
    """
    if N <= 0:
        return

    # Binary mode: the XML parser decodes the bytes itself, following the
    # encoding declared in the document.
    with gzip.open(xml_file, "rb") as f:
        context = ET.iterparse(f, events=("start", "end"))
        _, root = next(context)  # First event is the start of the root element

        record_count = 0
        for event, elem in context:
            # Act on "end" only: the element is complete at that point
            if event == "end" and elem.tag in record_tags:
                print(ET.tostring(elem, encoding="unicode"))
                record_count += 1
                elem.clear()  # Drop the record's own children
                # The root still references every finished child; clear it so
                # processed records can actually be garbage-collected.
                root.clear()
                if record_count >= N:
                    break

# Dump the first N records of the XML file to the cell output
print_first_n_records(xml_file, N=N)


<VariationArchive RecordType="classified" VariationID="2" VariationName="NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTAACTGTAAA (p.Arg27_Ile28delinsLeuLeuTer)" VariationType="Indel" Accession="VCV000000002" Version="3" NumberOfSubmissions="2" NumberOfSubmitters="2" DateLastUpdated="2022-04-25" DateCreated="2017-01-30" MostRecentSubmission="2021-05-16">
  <RecordStatus>current</RecordStatus>
  <Species>Homo sapiens</Species>
  <ClassifiedRecord>
    <SimpleAllele AlleleID="15041" VariationID="2">
      <GeneList>
        <Gene Symbol="AP5Z1" FullName="adaptor related protein complex 5 subunit zeta 1" GeneID="9907" HGNC_ID="HGNC:22197" Source="submitted" RelationshipType="within single gene">
          <Location>
            <CytogeneticLocation>7p22.1</CytogeneticLocation>
            <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="7" Accession="NC_000007.14" start="4775623" stop="4794397" display_start="4775623" display_stop