In [None]:
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry
import os

# Define your Evomi proxies (modify with your proxy details)
evomi_proxies = {
    ##proxies
}

# Create a cached session with the proxies
cache_session = requests_cache.CachedSession(
    ".cache", expire_after=-1, proxies=evomi_proxies
)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

In [23]:
# Read the CSV file
clusters_df = pd.read_csv("top_clusters_with_midpoints.csv")
print(clusters_df.head())

    latitude  longitude                              name  \
0  25.791496 -80.379582        QTS · 11234 NW 20th Street   
1  25.793027 -80.381242                EdgeConneX · MIA01   
2  25.796127 -80.379946  Radius DC · 11300 NW 25th Street   
3  27.689285 -99.453613                 MDC Laredo - LDO1   
4  27.690588 -99.449913           MDC Data Centers · LDO1   

                  provider                                       full_address  \
0                      QTS               11234 NW 20th Street, Miami, FL, USA   
1               EdgeConnex               2132 NW 114th Avenue, Miami, FL, USA   
2                 RadiusDC         11300 NW 25th Street, Sweet Water, FL, USA   
3         MDC Data Centers           13619 Cabezut Dr, Laredo, TX, 78045, USA   
4  Media Networks Services  13619 Cabezut Drive, Unitec Industrial Park, L...   

   data_center_count  aggregated_power  aggregated_area  yearbuilt  \
0                  1              2.00          38000.0     2008.0   
1     

In [55]:
import openmeteo_requests
import numpy as np

om = openmeteo_requests.Client()


def enrich_with_weather_data(row, om):
    lat = row.get("cluster_mid_lat")
    lon = row.get("cluster_mid_lon")
    
    start_date = f"1999-01-01"
    end_date = "2025-03-23"

    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date,
        "end_date": end_date,
        "daily": [
            "temperature_2m_mean",
            "relative_humidity_2m_mean",
            "dew_point_2m_mean",
            "precipitation_sum",
        ],
        "timezone": "auto",
    }

    try:
        responses = om.weather_api(
            "https://archive-api.open-meteo.com/v1/archive", params=params
        )
        if not responses:
            return row  # Return original if no data

        response = responses[0]
        daily = response.Daily()

        def extract_avg(index):
            var = daily.Variables(index)
            values = var.ValuesAsNumpy()
            return float(np.nanmean(values)) if values.size > 0 else None

        # Add new average columns to the row
        row["avg_temp_2m"] = extract_avg(0)
        row["avg_humidity_2m"] = extract_avg(1)
        row["avg_dew_point_2m"] = extract_avg(2)
        row["avg_precipitation"] = extract_avg(3)

    except Exception as e:
        print(f"Weather retrieval error @({lat},{lon}): {e}")
        row["avg_temp_2m"] = row["avg_humidity_2m"] = row["avg_dew_point_2m"] = row[
            "avg_precipitation"
        ] = None

    return row

In [58]:
sample_row = {"cluster_mid_lat": 37.7749, "cluster_mid_lon": -122.4194, "YOB": 2010}

result = enrich_with_weather_data(sample_row, om)
print("First daily temperature value (temperature_2m_max):", result)

First daily temperature value (temperature_2m_max): {'cluster_mid_lat': 37.7749, 'cluster_mid_lon': -122.4194, 'YOB': 2010, 'avg_temp_2m': 13.425827980041504, 'avg_humidity_2m': 77.44691467285156, 'avg_dew_point_2m': 9.123583793640137, 'avg_precipitation': 1.656526803970337}


In [59]:
def init_session():
    """
    Initialize a new session using Evomi proxies and a retry cache.
    """
    # Replace these with your actual Evomi proxy details if needed.
    evomi_proxies = {
        "http": "http://dcp.evomi.com:2000:omsmart008:dsilstuCbaicxePOdlHy",
        "https": "http://dcp.evomi.com:2000:omsmart008:dsilstuCbaicxePOdlHy",
    }
    new_cache_session = requests_cache.CachedSession(
        ".cache", expire_after=-1, proxies=evomi_proxies
    )
    new_retry_session = retry(new_cache_session, retries=5, backoff_factor=0.2)
    new_openmeteo = openmeteo_requests.Client(session=new_retry_session)
    return new_cache_session, new_retry_session, new_openmeteo

In [61]:
import time
import random
import os
from tqdm.notebook import tqdm
import pandas as pd

# Clean up existing data first
temp_file = 'cluster_weather_temp.csv'
if os.path.exists(temp_file):
    existing_data = pd.read_csv(temp_file)
    # Remove rows with missing weather data
    existing_data = existing_data.dropna(subset=['avg_temperature', 'avg_humidity', 'avg_dew_point', 'avg_precipitation'])
    # Remove duplicates based on cluster_id
    existing_data = existing_data.drop_duplicates(subset=['cluster_id'])
    # Save the cleaned data
    existing_data.to_csv(temp_file, index=False)
    print(f"Cleaned existing data: {len(existing_data)} valid clusters")
    rows = existing_data.to_dict('records')
    processed_clusters = set(existing_data['cluster_id'])
else:
    rows = []
    processed_clusters = set()

# Get unique clusters and their midpoints
unique_clusters = clusters_df[['cluster', 'cluster_mid_lat', 'cluster_mid_lon', 'yearbuilt']].drop_duplicates()
# Filter out already processed clusters
unique_clusters = unique_clusters[~unique_clusters['cluster'].isin(processed_clusters)]
print(f"{len(unique_clusters)} clusters left to process")

# Initialize the client
om = openmeteo_requests.Client()

# Function to handle API requests with retries
def get_weather_with_retry(temp_row, client, max_retries=3, base_delay=60):
    for attempt in range(max_retries):
        try:
            enriched_data = enrich_with_weather_data(temp_row, client)
            
            # Check if there was an error in the response
            if isinstance(enriched_data, dict) and enriched_data.get('error'):
                print(f"Attempt {attempt+1}/{max_retries}: Got error: {enriched_data.get('reason')}")
                delay = base_delay * (1 + random.random())  # Add jitter
                print(f"Waiting {delay:.1f} seconds before retry...")
                time.sleep(delay)
                continue
            
            # Check if any required weather data is missing
            required_fields = ['avg_temp_2m', 'avg_humidity_2m', 'avg_dew_point_2m', 'avg_precipitation']
            if any(enriched_data.get(field) is None for field in required_fields):
                print(f"Attempt {attempt+1}/{max_retries}: Missing weather data")
                delay = base_delay * (1 + random.random())
                print(f"Waiting {delay:.1f} seconds before retry...")
                time.sleep(delay)
                continue
                
            return enriched_data
        except Exception as e:
            print(f"Attempt {attempt+1}/{max_retries}: Exception: {str(e)}")
            delay = base_delay * (1 + random.random())
            print(f"Waiting {delay:.1f} seconds before retry...")
            time.sleep(delay)
    
    # If we've exhausted retries
    return {'error': True, 'reason': 'Max retries exceeded'}

# Process clusters in small batches with pauses between batches
batch_size = 5
total_clusters = len(unique_clusters)
failed_requests = []

for batch_start in range(0, total_clusters, batch_size):
    batch_end = min(batch_start + batch_size, total_clusters)
    print(f"Processing batch {batch_start//batch_size + 1} ({batch_start} to {batch_end-1})")
    
    # Process each cluster in the current batch
    for idx, cluster_row in tqdm(unique_clusters.iloc[batch_start:batch_end].iterrows(), 
                                total=batch_end-batch_start):
        cluster_id = cluster_row['cluster']
        
        # Skip if this cluster has already been processed
        if cluster_id in processed_clusters:
            continue
            
        # Create row with cluster info
        new_row = {
            'cluster_id': cluster_id,
            'latitude': cluster_row['cluster_mid_lat'],
            'longitude': cluster_row['cluster_mid_lon'],
            'yearbuilt': cluster_row['yearbuilt']
        }
        
        # Use the enrich function to get weather data
        temp_row = {
            'cluster_mid_lat': cluster_row['cluster_mid_lat'],
            'cluster_mid_lon': cluster_row['cluster_mid_lon'],
            'YOB': cluster_row['yearbuilt']
        }
        
        # Get weather data with retry logic
        enriched_data = get_weather_with_retry(temp_row, om)
        
        # Only save if we got valid data
        if isinstance(enriched_data, dict) and not enriched_data.get('error'):
            # Add weather parameters to the new row
            new_row['avg_temperature'] = enriched_data.get('avg_temp_2m')
            new_row['avg_humidity'] = enriched_data.get('avg_humidity_2m')
            new_row['avg_dew_point'] = enriched_data.get('avg_dew_point_2m')
            new_row['avg_precipitation'] = enriched_data.get('avg_precipitation')
            
            # Only append if we have all the data
            if all(new_row.get(key) is not None for key in ['avg_temperature', 'avg_humidity', 'avg_dew_point', 'avg_precipitation']):
                rows.append(new_row)
                processed_clusters.add(cluster_id)
                
                # Save progress after each successful request
                pd.DataFrame(rows).to_csv(temp_file, index=False)
                print(f"Successfully retrieved data for cluster {cluster_id}")
            else:
                print(f"Retrieved incomplete data for cluster {cluster_id}")
                failed_requests.append({
                    'cluster_id': cluster_id,
                    'latitude': temp_row['cluster_mid_lat'],
                    'longitude': temp_row['cluster_mid_lon'],
                    'yearbuilt': temp_row['YOB'],
                    'error': 'Incomplete data'
                })
        else:
            print(f"Failed to retrieve weather for cluster {cluster_id}")
            failed_requests.append({
                'cluster_id': cluster_id,
                'latitude': temp_row['cluster_mid_lat'],
                'longitude': temp_row['cluster_mid_lon'],
                'yearbuilt': temp_row['YOB'],
                'error': enriched_data.get('reason', 'Unknown error') if isinstance(enriched_data, dict) else 'Unknown error'
            })
        
        # Small delay between requests within a batch
        time.sleep(2)
    
    # Pause between batches to avoid rate limits
    if batch_end < total_clusters:
        pause_time = 60 + random.random() * 20  # 60-80 seconds
        print(f"Completed batch. Pausing for {pause_time:.1f} seconds...")
        time.sleep(pause_time)

# Create the final dataframe from the list of rows
cluster_weather_df = pd.DataFrame(rows)

# Save failed requests for later processing
if failed_requests:
    pd.DataFrame(failed_requests).to_csv('failed_weather_requests.csv', index=False)
    print(f"Saved {len(failed_requests)} failed requests to 'failed_weather_requests.csv'")

# Display the results
print(f"Successfully created weather data for {len(cluster_weather_df)} clusters")
print(f"Failed to get data for {len(failed_requests)} clusters")
display(cluster_weather_df.head())

Cleaned existing data: 3 valid clusters
168 clusters left to process
Processing batch 1 (0 to 4)


  0%|          | 0/5 [00:00<?, ?it/s]

Successfully retrieved data for cluster 3.0
Successfully retrieved data for cluster 4.0
Completed batch. Pausing for 67.0 seconds...


KeyboardInterrupt: 

In [79]:
clusters_df[['cluster', 'cluster_mid_lat', 'cluster_mid_lon']].drop_duplicates().to_csv('getWeatherForthese_clusters.csv', index=False)