## Libraries

In [1]:
# Standard library imports
import os
import zipfile
from pathlib import Path

# Third-party imports
import ee
import geemap
import pandas as pd
import cdsapi
import dotenv

# Load variables from a local .env file into the process environment, then
# read the Earth Engine setting (None when the variable is not defined).
dotenv.load_dotenv()
EARTH_ENGINE_API = os.environ.get("EARTH_ENGINE_API")

## Define Paths

In [2]:
# All project data lives in a `data` folder that sits beside the current
# working directory (i.e. under the working directory's parent).
_project_root = Path.cwd().parent
data_dir = _project_root.joinpath('data')
climate_data_dir = data_dir.joinpath('climate_data')
lyme_dataset_path = data_dir.joinpath('UKHSA-2017-2022-Lyme-Disease.csv')

## Load Dataset

In [3]:
# Read the UKHSA Lyme disease CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=lyme_dataset_path)
df.head(n=5)

Unnamed: 0,Year,Council,Case,Population,Incidence,Lower_95CI,Upper_95CI
0,2017,Adur,1.0,63721.0,1.56934,0.03973,8.74381
1,2017,Allerdale,3.0,97213.0,3.08601,0.63641,9.01862
2,2017,Amber Valley,4.0,125898.0,3.17718,0.86567,8.13483
3,2017,Arun,9.0,158657.0,5.67262,2.59388,10.76839
4,2017,Ashfield,0.0,126164.0,0.0,0.0,2.92388


## Get Climate Data

In [None]:
def download_uk_climate_data(years: list, output_filename: str = 'uk_climate_data.zip') -> None:
    """
    Download ERA5-Land monthly-mean climate data for the UK using cdsapi.

    The request covers May, June and July of every requested year over the
    bounding box [61, -8, 49, 2] and asks for NetCDF data delivered inside a
    zip archive.

    Parameters
    ----------
    years : list
        The years to download, e.g., ['2020', '2021', '2022']. Integer years
        are accepted as well and are converted to strings for the request.
    output_filename : str or os.PathLike
        Where to save the downloaded zip archive. Default is
        'uk_climate_data.zip'. Missing parent directories are created.

    Raises
    ------
    ValueError
        If `years` is empty.
    """
    # Validate before creating the client so a bad call fails fast and offline.
    year_strings = [str(year) for year in years]
    if not year_strings:
        raise ValueError("`years` must contain at least one year.")

    # Make sure the destination folder exists; the download cannot create it.
    target = Path(output_filename)
    target.parent.mkdir(parents=True, exist_ok=True)

    c = cdsapi.Client()

    print(f"Downloading UK climate data for years: {year_strings}")
    print("This may take several minutes...")

    dataset = "reanalysis-era5-land-monthly-means"
    request = {
        "product_type": ["monthly_averaged_reanalysis"],
        "variable": [
            "2m_temperature",
            "total_evaporation",
            "total_precipitation",
            "high_vegetation_cover",
            "type_of_high_vegetation",
        ],
        "year": year_strings,
        "month": ["05", "06", "07"],
        "time": ["00:00"],
        "data_format": "netcdf",
        "download_format": "zip",
        # Bounding box around the UK; CDS expects north, west, south, east.
        "area": [61, -8, 49, 2],
    }

    # Pass a plain string so the client never has to deal with Path objects.
    c.retrieve(dataset, request, str(target))

    print(f"Download complete! Data saved as: {target}")

In [None]:
# Save the archive inside the climate data folder, creating the folder first
# so the download has somewhere to write.
climate_filename = climate_data_dir / 'uk_climate_data.zip'
climate_filename.parent.mkdir(parents=True, exist_ok=True)

# Request every year present in the Lyme dataset, as strings.
years = df['Year'].unique().astype(str).tolist()

download_uk_climate_data(years, output_filename=climate_filename)

## Extract Climate Data

In [None]:
def extract_climate_data(zip_filename: str, extract_to: str = 'climate_data', remove_zip: bool = True) -> None:
    """
    Extract a zip archive into a directory, optionally deleting the archive.

    Parameters
    ----------
    zip_filename : str or os.PathLike
        Path of the zip file to extract.
    extract_to : str or os.PathLike
        The directory to extract files into. Default is 'climate_data'.
    remove_zip : bool
        Delete the zip file once extraction has succeeded. Default is True,
        which matches the previous behaviour of always removing the archive.

    Raises
    ------
    FileNotFoundError
        If `zip_filename` does not exist.
    zipfile.BadZipFile
        If the file is not a valid zip archive.
    """
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    # Only reached when extraction raised nothing, so a failed extraction
    # never costs the caller the downloaded archive.
    if remove_zip:
        os.remove(zip_filename)
    print(f"Extracted files to: {extract_to}")

In [None]:
climate_data_path = climate_data_dir / 'uk_climate_data.zip'

# extract_climate_data deletes the archive after unpacking, so on a re-run the
# zip may already be gone; skip instead of raising FileNotFoundError.
if climate_data_path.exists():
    extract_climate_data(climate_data_path, extract_to=climate_data_dir)
else:
    print(f"No archive found at {climate_data_path}; nothing to extract.")

## AlphaEarth Satellite Embeddings

### Authentication

In [4]:
# Run the Google account authentication flow for Earth Engine.
ee.Authenticate()

# Start the Earth Engine client against this notebook's project ID.
_EE_PROJECT_ID = 'lyme-disease-fc'
ee.Initialize(project=_EE_PROJECT_ID)

### Datasets

In [5]:
# County boundaries asset and the annual satellite-embedding image collection.
counties = ee.FeatureCollection("projects/lyme-disease-fc/assets/UK_counties_shapefile")
dataset = ee.ImageCollection("GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL")

# Study years 2017 through 2022 inclusive.
years = [2017 + offset for offset in range(6)]

### Get Embeddings

In [6]:
def get_embeddings_by_region(region):
    """
    Build one mean embedding image per study year for a region.

    For every entry in the notebook-level `years`, the notebook-level
    `dataset` collection is filtered to that calendar year and to images
    intersecting the region, averaged into a single image, clipped to the
    region and tagged with a `year` property.

    Parameters
    ----------
    region : object exposing ``geometry()``
        The region of interest (called with an ``ee.Feature`` in this notebook).

    Returns
    -------
    ee.ImageCollection
        One image per entry of `years`, in the same order.
    """
    # Resolve the geometry once instead of twice per year.
    geometry = region.geometry()

    def _annual_image(year):
        # filterDate treats the end date as exclusive, so the window must end
        # on Jan 1 of the following year to cover the whole calendar year.
        start = f"{year}-01-01"
        end = f"{int(year) + 1}-01-01"
        return (
            dataset
            .filterDate(start, end)
            .filterBounds(geometry)
            .mean()
            .clip(geometry)
            .set({'year': year})
        )

    return ee.ImageCollection([_annual_image(year) for year in years])

In [None]:
# 138, 140, 153  (indices noted by the author; presumably counties that need a retry - confirm)
index = 0  # first county index to process in this run; max = 180
count = 0  # counties fully processed so far in this run
batch_size = 10  # number of counties to process per run

# Rows are collected in a plain list and converted to a DataFrame once at the
# end, which avoids re-copying the whole frame on every append.
embedding_records = []

for i, county in enumerate(counties.toList(counties.size()).getInfo()):
    if i < index:
        continue
    if count >= batch_size:
        break  # batch complete; nothing later in the list would be processed
    try:
        county = ee.Feature(county)
        name = county.get('UTLA22NM').getInfo()
        geom = county.geometry()

        embeddings = get_embeddings_by_region(county)
        embeddings_list = embeddings.toList(embeddings.size())

        # Buffer this county's rows so a failure part-way through its years
        # does not leave an incomplete county in the results.
        county_records = []
        for idx, embedding in enumerate(embeddings_list.getInfo()):
            image = ee.Image(embeddings_list.get(idx))
            mean_embedding = image.reduceRegion(
                reducer=ee.Reducer.mean(),
                geometry=geom,
                scale=10,
                maxPixels=1e13
            )

            embeddings_row = dict(mean_embedding.getInfo())
            embeddings_row['county_name'] = name
            embeddings_row['year'] = embedding['properties']['year']
            county_records.append(embeddings_row)

        embedding_records.extend(county_records)
        count += 1
    except Exception as e:
        # Best effort: report the failing index so it can be retried later.
        print('Index', i)
        print('Error: ', e)

embeddings_df = pd.DataFrame(embedding_records)
embeddings_df.head()

Unnamed: 0,A00,A01,A02,A03,A04,A05,A06,A07,A08,A09,...,A56,A57,A58,A59,A60,A61,A62,A63,county_name,year
0,-0.144994,-0.110551,-0.191988,0.042876,0.024406,-0.262709,0.010706,0.045438,0.157234,-0.021024,...,-0.001758,0.010871,-0.109899,0.004977,0.106119,-0.075804,-0.034624,-0.08261,Caerphilly,2017
1,-0.134857,-0.118851,-0.176789,0.010352,0.021756,-0.228063,0.026349,0.018928,0.146268,-0.040896,...,-0.000492,0.010232,-0.078029,0.021327,0.129247,-0.055766,-0.035178,-0.070633,Caerphilly,2018
2,-0.142796,-0.117815,-0.188034,0.022112,0.017943,-0.256318,0.025845,0.023566,0.170378,-0.05593,...,-0.015992,0.010543,-0.083853,-0.010076,0.105615,-0.051421,-0.045532,-0.086165,Caerphilly,2019
3,-0.145003,-0.114763,-0.20131,0.055701,0.022105,-0.258605,0.016915,0.025073,0.179534,-0.057546,...,-0.026693,8.7e-05,-0.073239,0.019753,0.123154,-0.06483,-0.023548,-0.089557,Caerphilly,2020
4,-0.134771,-0.126888,-0.155919,0.026389,-0.00117,-0.267729,-0.004152,0.026006,0.168229,-0.05213,...,-0.022392,0.017017,-0.071474,0.000499,0.121982,-0.060577,-0.053059,-0.076336,Caerphilly,2021


### Merge with Existing Embeddings

In [10]:
embeddings_path = data_dir / 'climate_embeddings.csv'
key_columns = ['county_name', 'year']  # one row is expected per county and year

# Start from the previously saved embeddings when there are any; on the very
# first run the file does not exist yet.
frames = [embeddings_df]
if embeddings_path.exists():
    frames.insert(0, pd.read_csv(embeddings_path))
embeddings = pd.concat(frames, ignore_index=True)

# Compare on the county/year key rather than on every float column, so a
# re-extracted county replaces its old rows; the newest rows come last and win.
if set(key_columns).issubset(embeddings.columns):
    duplicate_mask = embeddings.duplicated(subset=key_columns, keep='last')
else:
    duplicate_mask = embeddings.duplicated()

n_duplicates = int(duplicate_mask.sum())
print(f"Duplicates after concatenation: {n_duplicates}")
if n_duplicates > 0:
    embeddings = embeddings.loc[~duplicate_mask].reset_index(drop=True)
    print(f"Duplicates removed. New shape: {embeddings.shape}")

# Never write an empty frame: it would produce a CSV that cannot be read back.
if not embeddings.empty:
    embeddings.to_csv(embeddings_path, index=False)

embeddings.tail()

Duplicates after concatenation: 0


Unnamed: 0,A00,A01,A02,A03,A04,A05,A06,A07,A08,A09,...,A56,A57,A58,A59,A60,A61,A62,A63,county_name,year
1105,-0.129867,-0.106086,-0.177683,-0.017971,0.011737,-0.227503,0.024112,0.01732,0.144666,-0.033725,...,-0.011686,0.013868,-0.078914,0.010639,0.128317,-0.049264,-0.032842,-0.072886,Merthyr Tydfil,2018
1106,-0.131026,-0.104253,-0.19342,-0.011229,0.003188,-0.266393,0.018242,0.020327,0.169196,-0.046088,...,-0.03675,0.014469,-0.081649,-0.015101,0.107828,-0.036629,-0.032984,-0.088361,Merthyr Tydfil,2019
1107,-0.135607,-0.105875,-0.207905,0.02941,0.010283,-0.269372,0.012359,0.020218,0.184223,-0.048363,...,-0.035724,-0.001652,-0.074276,0.010421,0.125599,-0.050835,-0.013886,-0.09095,Merthyr Tydfil,2020
1108,-0.127542,-0.110099,-0.166207,0.001816,-0.014886,-0.272528,-0.009839,0.023946,0.174953,-0.045719,...,-0.035281,0.021711,-0.065324,-0.007946,0.120253,-0.046078,-0.041127,-0.077324,Merthyr Tydfil,2021
1109,-0.149517,-0.125875,-0.195127,0.025853,0.002979,-0.256495,0.016022,0.023002,0.18553,-0.049335,...,-0.032186,0.011535,-0.080591,5.4e-05,0.117025,-0.058063,-0.039853,-0.084558,Merthyr Tydfil,2022
