In [None]:
###
# @Author             : Monserrat López
# @Date               : 2025-01-10
# @Last Modified Date : 2025-04-21
# @Description        : Code for extracting URLs from XML sitemaps, parsing them into structured components, and filtering for EU countries
# @Note               : This script is intended for academic research purposes only. Do not use for commercial purposes
#                       Some original raw data used during development is not included in this repository for confidentiality reasons.

In [25]:
# Required libraries 
import xml.etree.ElementTree as ET  # For parsing XML sitemaps
import os                           # For file system operations
import csv                          # For writing CSV outputs
import pandas as pd                 # For structured data handling
import regex as re                  # Extended regular expressions for robust pattern matching

### Parse the sitemap XMLs to extract all datacenter URLs.

In [26]:
# Define input and output paths (relative paths, resolved against the current working directory)
# Folder that is scanned for sitemap .xml files
input_folder = "../input/DCMap"
# CSV file that receives the flat list of extracted URLs
output_file = "../output/01datacenter_urls.csv"

In [27]:
# Define helper function to extract <loc> tags from XML sitemap files
def extract_urls_from_file(file_path):
    """
    Parse an XML sitemap and return the text of every <loc> element.

    Parameters
    ----------
    file_path : str or path-like
        Location of the sitemap XML file.

    Returns
    -------
    list of str
        Whitespace-stripped URLs in document order. <loc> elements that are
        empty or contain only whitespace are skipped. If the file cannot be
        read or parsed, the error is printed and an empty list is returned.
    """
    namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    try:
        root = ET.parse(file_path).getroot()
    except Exception as e:
        # Best-effort: report the problem and let the caller move on to the next file
        print(f"Error processing {file_path}: {e}")
        return []

    urls = []
    for loc in root.findall(".//ns:loc", namespace):
        # An empty element (<loc/>) has text None; calling .strip() on it used to
        # raise, and the broad handler then discarded every URL of the file.
        text = (loc.text or "").strip()
        if text:
            urls.append(text)
    return urls

In [28]:
# Collect all .xml sitemap files from the input folder.
# os.listdir() returns entries in arbitrary order, so the paths are sorted to make
# the processing order (and the row order of the output CSV) reproducible.
xml_files = sorted(
    os.path.join(input_folder, file)
    for file in os.listdir(input_folder)
    if file.endswith(".xml")
)

In [29]:
# Pool the URLs of every sitemap file into one flat list, logging each file as it is read
all_urls = []
for xml_file in xml_files:
    print(f"Processing file: {xml_file}")
    urls = extract_urls_from_file(xml_file)
    all_urls += urls

Processing file: ../input/DCMap/dcs_3.xml
Processing file: ../input/DCMap/dcs_2.xml
Processing file: ../input/DCMap/dcs_1.xml
Processing file: ../input/DCMap/dcs_5.xml
Processing file: ../input/DCMap/dcs_4.xml
Processing file: ../input/DCMap/dcs_6.xml
Processing file: ../input/DCMap/dcs_7.xml
Processing file: ../input/DCMap/dcs_9.xml
Processing file: ../input/DCMap/dcs_8.xml
Processing file: ../input/DCMap/dcs_10.xml


In [42]:
# Save the list of extracted URLs to a CSV file
# Create the output directory first so the cell also works when it does not exist yet
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Datacenter URL"])  # CSV header
    writer.writerows([url] for url in all_urls)  # one URL per row

print(f"Extracted {len(all_urls)} URLs and saved them to '{output_file}'")

Extracted 9850 URLs and saved them to '../output/01datacenter_urls.csv'


### Parse URLs to extract structured fields (country, state, city, name).

In [31]:
# Load the saved URLs for further processing
# NOTE(review): this rebinds `input_folder` (previously the sitemap folder) to a CSV
# file path; re-running the earlier os.listdir(input_folder) cell after this one
# would fail. Consider a dedicated name for this path.
input_folder = "../output/01datacenter_urls.csv"

In [32]:
# Load the list of URLs into a pandas DataFrame
# NOTE(review): `data` is not referenced again in the visible cells; the same CSV
# is re-read with csv.reader further down. Confirm whether this load is still needed.
data = pd.read_csv(input_folder, encoding='utf-8')

In [33]:
# Define a function to parse structured fields from each URL
# The pattern is compiled once here instead of being rebuilt on every call.
_DATACENTER_URL_PATTERN = re.compile(
    r"https://www\.datacentermap\.com/(?P<country>[^/]+)/(?:(?P<state>[^/]+)/)?(?P<city>[^/]+)/(?P<datacenter>[^/]+)/?"
)


def parse_url(url):
    """
    Parse a DataCenterMap URL into its path components.

    The URL is expected to look like
    ``https://www.datacentermap.com/<country>/[<state>/]<city>/<datacenter>/``
    where the state segment is optional. Matching is anchored at the start of
    the string only, so any text after the data center segment is ignored.

    Parameters
    ----------
    url : str
        URL to parse. Non-string values (e.g. None or NaN for missing data)
        are treated as unparseable instead of raising.

    Returns
    -------
    tuple
        ``(country, state, city, datacenter)``. ``state`` is None when the URL
        has no state segment; all four are None when the URL does not match.
    """
    # Missing values or other non-text input can never match the pattern
    if not isinstance(url, str):
        return None, None, None, None

    match = _DATACENTER_URL_PATTERN.match(url)
    if match is None:
        return None, None, None, None
    return match.group("country", "state", "city", "datacenter")

In [34]:
# Parse each URL and extract structured metadata
structured_data = []
# newline="" lets the csv module handle line endings itself, as its documentation recommends
with open(input_folder, mode="r", newline="", encoding="utf-8") as infile:
    reader = csv.reader(infile)
    next(reader, None)  # Skip header row; the default avoids StopIteration on an empty file
    for row in reader:
        if not row:
            # csv.reader yields an empty list for blank lines; row[0] would raise IndexError
            continue
        url = row[0]
        country, state, city, datacenter = parse_url(url)
        structured_data.append([url, country, state, city, datacenter])

In [35]:
# Build the structured table: one row per URL, one column per parsed component
column_names = ["Datacenter URL", "Country", "State", "City", "Datacenter Name"]
df = pd.DataFrame(data=structured_data, columns=column_names)

### Filter and clean the dataset to retain only EU-based data centers.


In [37]:
OUTPUT_EU_CSV = "../output/02european_datacenters.csv"

# EU member states, spelled the way they are compared against the "Country" URL segment
eu_countries = {
    'austria', 'belgium', 'bulgaria', 'croatia', 'cyprus',
    'czech-republic', 'denmark', 'estonia', 'finland', 'france',
    'germany', 'greece', 'hungary', 'ireland', 'italy',
    'latvia', 'lithuania', 'luxembourg', 'malta', 'the-netherlands',
    'poland', 'portugal', 'romania', 'slovakia', 'slovenia',
    'spain', 'sweden',
}

# Keep rows whose country is in the EU set, drop exact duplicate rows,
# then remove the State column (ignored if it is absent)
is_eu_row = df["Country"].isin(eu_countries)
df_eu = (
    df.loc[is_eu_row]
    .drop_duplicates()
    .drop(columns=["State"], errors="ignore")
)

# Write the filtered table to disk and report how many rows it holds
df_eu.to_csv(OUTPUT_EU_CSV, index=False, encoding="utf-8")
print(f"[DONE] Saved filtered EU data to '{OUTPUT_EU_CSV}' with {len(df_eu)} entries.")

[DONE] Saved filtered EU data to '../output/02european_datacenters.csv' with 1795 entries.
