In [1]:
# Download the ClinVar XML directory listing (the HTML index of available releases)
import requests

# URL of the ClinVar XML directory. This is a directory index, so the response
# body is an HTML listing of the available releases, not a release archive.
clinvar_xml_url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/"
# Save the listing under a name that reflects its content; the previous
# "ClinVarFullRelease.xml.gz" name made the HTML page look like a gzip archive.
output_file = "clinvar_xml_index.html"

# Stream the download. The context manager releases the connection even when
# the status is not 200, and the timeout keeps the cell from hanging forever
# on an unresponsive server.
with requests.get(clinvar_xml_url, stream=True, timeout=60) as response:
    if response.status_code == 200:
        with open(output_file, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print("Download complete.")
    else:
        print(f"Download failed! HTTP status code: {response.status_code}")


Download complete.


In [None]:
# Parse the HTML Page to Extract File Links
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# URL of the ClinVar XML directory
url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/"

# Fetch the HTML page. The timeout prevents an indefinite hang, and
# raise_for_status() stops the cell on an HTTP error instead of silently
# parsing an error page into an empty link list.
response = requests.get(url, timeout=60)
response.raise_for_status()
html_content = response.text

# Parse the HTML content
soup = BeautifulSoup(html_content, "html.parser")

# Find all links to .xml.gz files. urljoin resolves each href against the
# directory URL, so relative, root-relative and absolute hrefs all produce a
# valid link (plain string concatenation only works for bare file names).
file_links = []
for link in soup.find_all("a"):
    href = link.get("href")
    if href and href.endswith(".xml.gz"):
        file_links.append(urljoin(url, href))

# Display extracted file links
for file in file_links:
    print(file)

https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2023-10.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2024-02.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2024-04.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_00-latest.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-02.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-03.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-04.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-05.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-06.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-07.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-08.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2024-09.xml.gz
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRele

In [6]:
# Download the Latest ClinVar XML File
import os

# Choose the latest file (modify as needed)
latest_file_url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_2025-01.xml.gz"
# NOTE(review): the URL points at a "ClinVarVCVRelease" archive while the local
# name says "ClinVarFullRelease". The name is kept unchanged because later
# cells open this exact path -- confirm which release is actually intended.
output_file = "ClinVarFullRelease_2025-01.xml.gz"
# Download into a sibling ".part" file first so an interrupted transfer never
# leaves a truncated archive under the final name.
partial_file = output_file + ".part"

# Download the file. The context manager always releases the connection and
# the timeout bounds how long the cell waits on a stalled server.
with requests.get(latest_file_url, stream=True, timeout=60) as response:
    if response.status_code == 200:
        with open(partial_file, "wb") as f:
            # 1 MiB chunks: the archive is several GB, so tiny chunks only add
            # per-iteration overhead.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
        # Publish the file under its final name only once it is complete.
        os.replace(partial_file, output_file)
        print(f"Downloaded {output_file}")
    else:
        print(f"Failed to download file. HTTP Status Code: {response.status_code}")


Downloaded ClinVarFullRelease_2025-01.xml.gz


In [7]:
# Check that the XML file is correct
import os

file_path = "ClinVarFullRelease_2025-01.xml.gz"

# Report how large the downloaded file is on disk
size_in_bytes = os.stat(file_path).st_size
print(f"File size: {size_in_bytes} bytes")

# A gzip stream starts with the two magic bytes 0x1f 0x8b;
# read just that prefix and compare.
GZIP_MAGIC = b'\x1f\x8b'
with open(file_path, "rb") as fh:
    header = fh.read(len(GZIP_MAGIC))

if header != GZIP_MAGIC:
    print("The file is NOT a GZIP file. The download may be corrupted.")
else:
    print("The file is correctly recognized as a GZIP file.")

File size: 4154767604 bytes
The file is correctly recognized as a GZIP file.


In [1]:
import xml.etree.ElementTree as ET
import gzip

# Path of the gzip-compressed ClinVar XML archive.
# NOTE(review): the download cell fetches a "ClinVarVCVRelease" URL but stores
# it under this "FullRelease" name -- confirm that the tag names used below
# (ReferenceClinVarAssertion, Gene//Symbol, Citation, ID) match that schema.
output_file = "ClinVarFullRelease_2025-01.xml.gz"


def _first_citation_ids_for_gene(xml_gz_path, gene, record_tag="ReferenceClinVarAssertion"):
    """Stream a gzipped XML file and collect one citation ID per matching record.

    The archive is far too large to load with ET.parse(), so records are
    handled one at a time with ET.iterparse() and the tree is pruned after
    each record to keep memory use bounded.

    Args:
        xml_gz_path: Path to the gzip-compressed XML file.
        gene: Gene symbol to match against the text of the first
            ``.//Gene//Symbol`` element found in each record.
        record_tag: Tag name of the elements treated as records.

    Returns:
        A list with one entry per matching record that contains a
        ``Citation`` element: the text of the first ``ID`` inside the record's
        first ``Citation``, or "N/A" when that citation has no ``ID``.
        NOTE(review): the ID is taken as-is; nothing here checks that it is a
        PubMed ID.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        xml.etree.ElementTree.ParseError: If the XML is malformed.
    """
    citation_ids = []
    # Binary mode lets the XML parser honour the encoding declared in the file.
    with gzip.open(xml_gz_path, "rb") as f:
        events = ET.iterparse(f, events=("start", "end"))
        # The first event is the start of the document root; keep a handle on
        # it so processed subtrees can be dropped.
        _, root = next(events)
        for event, elem in events:
            if event != "end" or elem.tag != record_tag:
                continue

            gene_symbol = elem.find(".//Gene//Symbol")
            if gene_symbol is not None and gene_symbol.text == gene:
                citation = elem.find(".//Citation")
                if citation is not None:
                    id_elem = citation.find(".//ID")
                    citation_ids.append(id_elem.text if id_elem is not None else "N/A")

            # Detach everything parsed so far; without this the full tree
            # accumulates in memory exactly as with ET.parse().
            root.clear()
    return citation_ids


# Extract citations for ORC1
orc1_citations = _first_citation_ids_for_gene(output_file, "ORC1")

# Display results
print(f"Extracted {len(orc1_citations)} citations for ORC1 protein")


: 