In [1]:
import pandas as pd
import time
import requests
import os
import random
import json
from datetime import datetime
import re
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [2]:
# --- Geocoding current addresses ---

# Configuration
INPUT_FILE = '../output/04eu_datacenters_cleaned_features.csv'
OUTPUT_FILE = '../output/05geocoded_datacenters.csv'
CHECKPOINT_FILE = '../cache/geocoding_checkpoint.json'
CACHE_FILE = '../cache/geocode_cache.json'
BATCH_SIZE = 50
RATE_LIMIT_DELAY = 2.5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
]

GEOCODING_SERVICES = [
    {
        'name': 'Photon',
        'url': 'https://photon.komoot.io/api/',
        'params': lambda address: {'q': address, 'limit': 1},
        'extract': lambda data: (data['features'][0]['geometry']['coordinates'][1], 
                                 data['features'][0]['geometry']['coordinates'][0]) if data.get('features') else (None, None)
    },
    {
        'name': 'Nominatim',
        'url': 'https://nominatim.openstreetmap.org/search',
        'params': lambda address: {'q': address, 'format': 'json', 'limit': 1},
        'extract': lambda data: (data[0]['lat'], data[0]['lon']) if data else (None, None)
    }
]

# Load and save cache
def load_cache():
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, 'r') as f:
            return json.load(f)
    return {}

def save_cache(cache):
    with open(CACHE_FILE, 'w') as f:
        json.dump(cache, f)

# Checkpoint handling
def load_checkpoint():
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, 'r') as f:
            return json.load(f)
    return {'last_index': -1, 'geocoded_count': 0, 'timestamp': None, 'service_block_status': {}}

def save_checkpoint(index, geocoded_count, service_block_status=None):
    checkpoint = {
        'last_index': index,
        'geocoded_count': geocoded_count,
        'timestamp': datetime.now().isoformat(),
        'service_block_status': service_block_status or {}
    }
    with open(CHECKPOINT_FILE, 'w') as f:
        json.dump(checkpoint, f)

# Geocode with retry and fallback
def geocode_address(address, retries=0, service_index=0, service_block_status=None):
    if service_block_status is None:
        service_block_status = {}
    if retries >= MAX_RETRIES:
        print(f"Max retries reached for address: {address}")
        return None, None, service_block_status

    if service_index >= len(GEOCODING_SERVICES):
        if all(service_block_status.values()):
            wait_time = 300
            print(f"All services blocked. Waiting {wait_time}s...")
            time.sleep(wait_time)
            service_block_status = {s['name']: False for s in GEOCODING_SERVICES}
            return geocode_address(address, retries + 1, 0, service_block_status)
        for i, s in enumerate(GEOCODING_SERVICES):
            if not service_block_status.get(s['name'], False):
                service_index = i
                break
        else:
            service_index = 0

    service = GEOCODING_SERVICES[service_index]
    if service_block_status.get(service['name'], False):
        return geocode_address(address, retries, service_index + 1, service_block_status)

    user_agent = random.choice(USER_AGENTS)
    try:
        headers = {
            'User-Agent': user_agent,
            'Accept': 'application/json',
            'Accept-Language': 'en-US,en;q=0.9',
            'Referer': 'https://www.openstreetmap.org/'
        }
        params = service['params'](address)
        print(f"Trying {service['name']} for: {address}")
        response = requests.get(service['url'], params=params, headers=headers, timeout=15)

        if response.status_code == 200:
            data = response.json()
            lat, lon = service['extract'](data)
            if lat and lon:
                service_block_status[service['name']] = False
                return lat, lon, service_block_status
            else:
                print(f"No results from {service['name']} for address: {address}")
                retries += 1
                return geocode_address(address, retries, service_index + 1, service_block_status)
        elif response.status_code == 403:
            print(f"{service['name']} returned 403 Forbidden.")
            service_block_status[service['name']] = True
            return geocode_address(address, retries, service_index + 1, service_block_status)
        elif response.status_code == 429:
            wait_time = BACKOFF_FACTOR ** retries * RATE_LIMIT_DELAY * 5
            print(f"Rate limited by {service['name']}, waiting {wait_time}s...")
            time.sleep(wait_time)
            service_block_status[service['name']] = True
            return geocode_address(address, retries + 1, service_index + 1, service_block_status)
        else:
            print(f"HTTP {response.status_code} from {service['name']}")
            return geocode_address(address, retries, service_index + 1, service_block_status)
    except Exception as e:
        print(f"Exception with {service['name']}: {e}")
        time.sleep(BACKOFF_FACTOR ** retries * RATE_LIMIT_DELAY * 2)
        return geocode_address(address, retries + 1, service_index + 1, service_block_status)

# Main geocoding loop
def main():
    df = pd.read_csv(INPUT_FILE)
    if 'Latitude' not in df.columns:
        df['Latitude'] = None
    if 'Longitude' not in df.columns:
        df['Longitude'] = None

    checkpoint = load_checkpoint()
    geocode_cache = load_cache()
    start_index = checkpoint['last_index'] + 1
    geocoded_count = checkpoint['geocoded_count']
    service_block_status = checkpoint.get('service_block_status', {})
    for s in GEOCODING_SERVICES:
        if s['name'] not in service_block_status:
            service_block_status[s['name']] = False

    total_rows = len(df)
    print(f"Starting at index {start_index}, already geocoded: {geocoded_count}")

    for idx in range(start_index, total_rows):
        row = df.iloc[idx]
        if pd.notna(row['Latitude']) and pd.notna(row['Longitude']):
            continue

        address = row['clean_address']
        if pd.isna(address) or address.strip() == '':
            print(f"Skipping row {idx}, empty address")
            continue

        country = row['country'] if 'country' in row else ''
        city = row['city'] if 'city' in row and pd.notna(row['city']) else ''
        address_parts = [address.strip()]
        if city and city.lower() not in address.lower():
            address_parts.append(city.strip())
        if country and country.lower() not in address.lower():
            address_parts.append(country.strip())
        full_address = ", ".join(address_parts)

        if full_address in geocode_cache:
            lat, lon = geocode_cache[full_address]
            print(f"{idx+1}/{total_rows} Loaded from cache: {lat}, {lon}")
        else:
            lat, lon, service_block_status = geocode_address(full_address, service_block_status=service_block_status)
            geocode_cache[full_address] = (lat, lon)

        if lat and lon:
            df.at[idx, 'Latitude'] = lat
            df.at[idx, 'Longitude'] = lon
            geocoded_count += 1
            print(f"{idx+1}/{total_rows} Geocoded: {lat}, {lon}")
        else:
            df.at[idx, 'Latitude'] = "Not identified"
            df.at[idx, 'Longitude'] = "Not identified"
            print(f"{idx+1}/{total_rows} Geocoding failed: {full_address}")

        delay = RATE_LIMIT_DELAY + random.uniform(0.5, 2.0)
        print(f"Waiting {delay:.2f}s before next request...")
        time.sleep(delay)

        # Save progress after each batch or after every 10 rows if there are service blocks
        if (idx + 1) % 10 == 0 or idx == total_rows - 1:
            df.to_csv(OUTPUT_FILE, index=False)
            save_checkpoint(idx, geocoded_count, service_block_status)
            save_cache(geocode_cache)
            print(f"Progress saved at row {idx+1}.")
    

    print(f"Geocoding complete: {geocoded_count}/{total_rows} geocoded.")
    print(f"Results saved to '{OUTPUT_FILE}'")
    save_cache(geocode_cache)
    if os.path.exists(CHECKPOINT_FILE):
        os.remove(CHECKPOINT_FILE)

if __name__ == "__main__":
    main()


Starting at index 0, already geocoded: 0
1/1615 Loaded from cache: 52.455915, 13.3890599
1/1615 Geocoded: 52.455915, 13.3890599
Waiting 3.70s before next request...
2/1615 Loaded from cache: 51.7956268, 11.6021131
2/1615 Geocoded: 51.7956268, 11.6021131
Waiting 3.45s before next request...
3/1615 Loaded from cache: 49.59506585, 6.223383200000001
3/1615 Geocoded: 49.59506585, 6.223383200000001
Waiting 3.68s before next request...
4/1615 Loaded from cache: 53.0747001, 8.8071663
4/1615 Geocoded: 53.0747001, 8.8071663
Waiting 3.54s before next request...
5/1615 Loaded from cache: 53.4121759, -6.3680543
5/1615 Geocoded: 53.4121759, -6.3680543
Waiting 3.89s before next request...
6/1615 Loaded from cache: 41.3641886, 2.1413582
6/1615 Geocoded: 41.3641886, 2.1413582
Waiting 3.57s before next request...
7/1615 Loaded from cache: 52.5274602, 13.3188595
7/1615 Geocoded: 52.5274602, 13.3188595
Waiting 4.49s before next request...
8/1615 Loaded from cache: 48.8972718, 2.3221778
8/1615 Geocoded: 48

KeyboardInterrupt: 

In [4]:
# --- Geocode missing addresses ---

# Extract the failed geocoding attempts:
geocoded_df = pd.read_csv('../output/05geocoded_datacenters.csv')
geocoded_df.rename(columns={"Latitude": "latitude", "Longitude": "longitude"}, inplace=True)

# Extract records that failed to geocode
failed_geocodes = geocoded_df[
    (geocoded_df['latitude'] == "Not identified") | 
    (geocoded_df['longitude'] == "Not identified")
]

# Save to a separate file for analysis
failed_geocodes.to_csv('../output/failed_geocodes.csv', index=False)

print(f"Total records: {len(geocoded_df)}")
print(f"Successfully geocoded: {len(geocoded_df) - len(failed_geocodes)}")
print(f"Failed to geocode: {len(failed_geocodes)}")

Total records: 1615
Successfully geocoded: 1447
Failed to geocode: 168


#### Geocode unresolved entries

In [6]:
import pandas as pd

def integrate_manual_and_centroid_fallback_inline(
    geocoded_path="../output/05geocoded_datacenters.csv",
    manual_filled_path="../output/failed_geocodes_filled.csv",
    output_path="../output/06fixed_geocoded_datacenters.csv"
):
    # Load geocoded dataset and normalize coordinate columns
    geocoded = pd.read_csv(geocoded_path)
    geocoded.rename(columns={"Latitude": "latitude", "Longitude": "longitude"}, inplace=True)
    geocoded['latitude'] = pd.to_numeric(geocoded['latitude'], errors='coerce')
    geocoded['longitude'] = pd.to_numeric(geocoded['longitude'], errors='coerce')

    if 'address_correction_type' not in geocoded.columns:
        geocoded['address_correction_type'] = None

    # Load manually filled failed entries
    filled = pd.read_csv(manual_filled_path, delimiter=';')
    filled['latitude'] = pd.to_numeric(filled['latitude'], errors='coerce')
    filled['longitude'] = pd.to_numeric(filled['longitude'], errors='coerce')

    # --- Update manually geocoded entries only ---
    manual_fixed = filled[filled['latitude'].notna() & filled['longitude'].notna()].copy()
    manual_fixed['address_correction_type'] = 'manual_verified'

    for _, row in manual_fixed.iterrows():
        idx = geocoded[geocoded['datacentername'] == row['datacentername']].index
        if not idx.empty:
            geocoded.loc[idx, ['latitude', 'longitude', 'address_correction_type']] = \
                row[['latitude', 'longitude', 'address_correction_type']].values

    # --- Apply centroid fallback only for unresolved from filled ---
    still_unresolved = filled[filled['latitude'].isna() | filled['longitude'].isna()].copy()
    unresolved_keys = still_unresolved[['datacentername', 'city_normalized', 'country_iso2']]

    # Reconfirm coordinates are numeric
    geocoded['latitude'] = pd.to_numeric(geocoded['latitude'], errors='coerce')
    geocoded['longitude'] = pd.to_numeric(geocoded['longitude'], errors='coerce')

    # Compute centroids from successfully geocoded points
    valid_coords = geocoded[geocoded['latitude'].notna() & geocoded['longitude'].notna()]
    centroids = valid_coords.groupby(['city_normalized', 'country_iso2']).agg({
        'latitude': 'median',
        'longitude': 'median'
    }).reset_index()

    # Merge unresolved with city-level centroids
    fallback = unresolved_keys.merge(centroids, on=['city_normalized', 'country_iso2'], how='left')
    fallback['address_correction_type'] = 'city_centroid'

    for _, row in fallback.iterrows():
        idx = geocoded[geocoded['datacentername'] == row['datacentername']].index
        if not idx.empty:
            geocoded.loc[idx, ['latitude', 'longitude', 'address_correction_type']] = \
                row[['latitude', 'longitude', 'address_correction_type']].values

    # --- Add certainty flags ---
    def get_certainty(row):
        if row['address_correction_type'] == 'manual_verified':
            return 'high'
        elif row['address_correction_type'] == 'city_centroid':
            return 'medium'
        elif pd.isna(row['latitude']) or pd.isna(row['longitude']):
            return 'low'
        else:
            return 'high'

    geocoded['location_certainty'] = geocoded.apply(get_certainty, axis=1)

    # Save final dataset
    geocoded.to_csv(output_path, index=False)
    print(f"Final geocoded dataset saved to: {output_path}")
    print(f"Total records: {len(geocoded)}")
    print(f"High certainty: {(geocoded['location_certainty'] == 'high').sum()}")
    print(f"Medium certainty: {(geocoded['location_certainty'] == 'medium').sum()}")
    print(f"Low certainty (still unresolved): {(geocoded['location_certainty'] == 'low').sum()}")

    return geocoded

# Run the function
geocoded_df = integrate_manual_and_centroid_fallback_inline()

Final geocoded dataset saved to: ../output/06fixed_geocoded_datacenters.csv
Total records: 1615
High certainty: 1564
Medium certainty: 49
Low certainty (still unresolved): 2


In [7]:
# Load final geocoded dataset
df_dcm = pd.read_csv("../output/06fixed_geocoded_datacenters.csv")

In [8]:
still_unsolved = df_dcm[
    (df_dcm["latitude"].isna()) |
    (df_dcm["longitude"].isna()) |
    (df_dcm["location_certainty"] == "low")
]

Unnamed: 0,url,address,clean_address,country_iso2,country_normalized,city_normalized,website,description,specs,region,...,pue_estimate,power_built_out_mw,live_power_mw,whitespace_sqm,building_size_sqm,tier_level,latitude,longitude,address_correction_type,location_certainty
94,https://www.datacentermap.com/germany/chemnitz...,envia TEL GmbH\nwithin 2 km from the city cent...,"within 2 km from the city centre, 09114 Chemín...",DE,Germany,Chemnitz,https://www.datacentermap.com/visit/datacenter...,If you are interested in a location to house y...,No data supplied by envia TEL GmbH,Western Europe,...,,,,,,,,,city_centroid,medium
148,https://www.datacentermap.com/germany/hof/dc-h...,"noris network AG\nwithin 1,4 miles away from c...","within 1,4 miles away from central station, 95...",DE,Germany,Hof,https://www.datacentermap.com/visit/datacenter...,YOUR IT LOCATION IN NORTHERN BAVARIA – CERTIFI...,No data supplied by noris network AG,Western Europe,...,,,,,,,,,city_centroid,medium
191,https://www.datacentermap.com/finland/kouvola/...,"atNorth\nwithin in Myllykoski, Kouvola\n46800 ...","within in Myllykoski, Kouvola, 46800 Myllykosk...",FI,Finland,Kouvola,https://www.datacentermap.com/visit/datacenter...,atNorth’s FIN04 mega site campus is located in...,Fully Built-Out Power 60 MW,Northern Europe,...,,60.0,,,,,,,city_centroid,medium
193,https://www.datacentermap.com/sweden/solleftea...,atNorth\nHamre Industripark\n882 91 Långsele\n...,"Hamre Industripark, 882 91 Långsele, Sweden",SE,Sweden,Solleftea,https://www.datacentermap.com/visit/datacenter...,atNorth has secured a 30-hectare plot in Solle...,Fully Built-Out Power 200 MW,Northern Europe,...,,200.0,,,,,,,city_centroid,medium
560,https://www.datacentermap.com/germany/jena/isp...,"ISPpro Internet KG\nwithin Jena North, reachab...","within Jena North, reachable via ISPpro - Head...",DE,Germany,Jena,https://www.datacentermap.com/visit/datacenter...,Data CentersGermanyJena ISPpro RZJ1-B ISPpro I...,No data supplied by ISPpro Internet KG,Western Europe,...,,,,,,,,,city_centroid,medium
683,https://www.datacentermap.com/sweden/jokkmokk/...,Etix Everywhere\nwithin 1 km from village cent...,"within 1 km from village centre, 98260 Porjus,...",SE,Sweden,Jokkmokk,https://www.datacentermap.com/visit/datacenter...,Data CentersSwedenJokkmokk ETIX Jokkmokk #1 Et...,No data supplied by Etix Everywhere,Northern Europe,...,,,,,,3.0,,,city_centroid,medium
691,https://www.datacentermap.com/france/olonne-su...,Etix Everywhere\nwithin 5 km from the city cen...,"within 5 km from the city centre, 85340 Olonne...",FR,France,Olonne sur Mer,https://www.datacentermap.com/visit/datacenter...,Data CentersFranceOlonne sur Mer Etix Olona #1...,No data supplied by Etix Everywhere,Western Europe,...,1.3,,,,,1.0,,,city_centroid,medium
897,https://www.datacentermap.com/cyprus/paphos/ns...,NetShop Internet Services Ltd\nwithin 5km from...,"within 5km from highway, 8103 Paphos, Cyprus, ...",CY,Cyprus,Paphos,https://www.datacentermap.com/visit/datacenter...,Data CentersCyprusPaphos NetShop NSCY03 NetSho...,No data supplied by NetShop Internet Services Ltd,Southern Europe,...,,,,,,3.0,,,city_centroid,medium
962,https://www.datacentermap.com/cyprus/anatoliko...,PrimeTel PLC\nwithin 2 Km from Highway\n8011 P...,"within 2 Km from Highway, 8011 Paphos, Cyprus,...",CY,Cyprus,Anatoliko,https://www.datacentermap.com/visit/datacenter...,Data CentersCyprusAnatoliko Paphos (Landing St...,No data supplied by PrimeTel PLC,Southern Europe,...,,,,,,3.0,,,city_centroid,medium
992,https://www.datacentermap.com/poland/grodzisk-...,Cellnex\nwithin 3 km from the city centre\n05-...,"within 3 km from the city centre, 05-825 Grodz...",PL,Poland,Grodzisk Mazowiecki,https://www.datacentermap.com/visit/datacenter...,Data CentersPolandGrodzisk Mazowiecki Plus Dat...,No data supplied by Cellnex,Eastern Europe,...,,,,,,,,,city_centroid,medium


#### Visualization & Validation

In [51]:
# !pip install geopandas matplotlib


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [9]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import matplotlib.pyplot as plt

# Drop rows with missing coordinates
df_valid = df_dcm[df_dcm['latitude'].notna() & df_dcm['longitude'].notna()]

# Create geometry column
geometry = [Point(xy) for xy in zip(df_valid["longitude"], df_valid["latitude"])]
gdf = gpd.GeoDataFrame(df_valid, geometry=geometry, crs="EPSG:4326")

In [11]:
nuts = gpd.read_file("../input/GeoData/NUTS/NUTS_RG_01M_2024_4326/NUTS_RG_01M_2024_4326.shp")
nuts = nuts.to_crs("EPSG:4326")  # Make sure CRS matches the dataset

In [12]:
#  Filter for NUTS level 0 (countries)
nuts0 = nuts[nuts['LEVL_CODE'] == 0]

In [13]:
# Filter to EU27 countries
eu27_iso2 = ['AT', 'BE', 'BG', 'HR', 'CY', 'CZ', 'DK', 'EE', 'FI', 'FR', 'DE',
             'GR', 'HU', 'IE', 'IT', 'LV', 'LT', 'LU', 'MT', 'NL', 'PL', 'PT',
             'RO', 'SK', 'SI', 'ES', 'SE']

eu27 = nuts0[nuts0['CNTR_CODE'].isin(eu27_iso2)]

In [14]:
# Perform spatial join to validate
joined = gpd.sjoin(gdf, eu27, how="left", predicate="within")
outside_eu = joined[joined['CNTR_CODE'].isna()]
print(f"{len(outside_eu)} entries outside EU27.")

17 entries outside EU27.


In [19]:
# There is one with a wrong location, I will update it manually here

import pandas as pd

# Load the dataset
df = pd.read_csv('../output/06fixed_geocoded_datacenters.csv')

# Identify the rogue entry
rogue_name = "akton-zagreb"   

# Fix it manually
df.loc[df['datacentername'] == rogue_name, 'latitude'] = 45.74487627617545
df.loc[df['datacentername'] == rogue_name, 'longitude'] = 16.0016386835062
df.loc[df['datacentername'] == rogue_name, 'country_iso2'] = 'HR'
df.loc[df['datacentername'] == rogue_name, 'country_normalized'] = 'Croatia'
df.loc[df['datacentername'] == rogue_name, 'city_normalized'] = 'Zagreb'
df.loc[df['datacentername'] == rogue_name, 'address_correction_type'] = 'manual_verified'
df.loc[df['datacentername'] == rogue_name, 'location_certainty'] = 'high'

# Save corrected dataset
df.to_csv('../output/07geocoded_with_manual_fix.csv', index=False)

print("Manual fix applied to:", rogue_name)

Manual fix applied to: akton-zagreb


In [20]:
# Drop rows with missing coordinates
df_clean = df[df['latitude'].notna() & df['longitude'].notna()].copy()

# Save unresolved entries for review
missing_coords = df[df['latitude'].isna() | df['longitude'].isna()]
print(f"Saved {len(missing_coords)} unresolved entries to 99_geocoding_failed_rows.csv")

Saved 15 unresolved entries to 99_geocoding_failed_rows.csv


In [None]:
# Export cleaned, validated EU data centers for modeling
df_clean.to_csv("../output/08_DC_EU27.csv", index=False, encoding="utf-8")
print(f"Final cleaned dataset saved to 08_DC_EU27.csv with {len(df_clean)} records.")