In [3]:
# import the necessary packages
# Deal with downloading the LiPD files
import os
import requests
import zipfile
from pathlib import Path

# To manipulate LiPD files
from pylipd.lipd import LiPD

# Analysis
import pyleoclim as pyleo
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap, BoundaryNorm
from matplotlib.cm import ScalarMappable

# To save
import pickle

In [6]:
# Link to LiPD files
temp12_url = 'https://lipdverse.org/Temp12k/current_version/Temp12k1_1_0.zip'

# Download and extract
# The archive is saved next to the target folder (same stem, '.zip' suffix).
folder_path = Path("../data/Temp12k")
zip_path = folder_path.with_suffix('.zip')

# Step 1: Make sure the target folder exists (no-op when it is already there)
folder_path.mkdir(parents=True, exist_ok=True)

# Step 2: Only download when the folder is empty, so re-running this cell
# does not fetch the archive again
if not any(folder_path.iterdir()):
    print("Folder is empty. Downloading data...")

    # Step 3: Download zip file.
    # A timeout keeps the cell from hanging forever on a stalled connection,
    # and raise_for_status() stops here on an HTTP error instead of writing
    # an error page to disk and failing later with an opaque BadZipFile.
    response = requests.get(temp12_url, timeout=60)
    response.raise_for_status()
    with open(zip_path, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded ZIP to {zip_path}")

    # Step 4: Extract zip contents into folder
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(folder_path)
    print(f"Extracted contents to {folder_path}")


@agent code open the temp12k files

In [7]:
# Open the Temp12k compilation: collect every .lpd / .jsonld file found
# recursively under the data directory, load them into a single LiPD object,
# then pull out dataset names, dataset ids, archive types and the locations
# table, printing a short summary of each.
# Auto-generated by PaleoPal CodeGenerationAgent with contextual search

# Libraries used by this cell
from pathlib import Path
from pylipd.lipd import LiPD
import pandas as pd

# Reuse folder_path when an earlier cell already defined it; otherwise fall
# back to the default download location
try:
    folder_path  # noqa: F821  # bare reference raises NameError if undefined
except NameError:
    folder_path = Path("../data/Temp12k")

# Stop early when the data directory is missing
if not Path(folder_path).exists():
    raise FileNotFoundError(f"Expected Temp12k directory not found at: {folder_path}")

# Walk the tree once per extension (.lpd first, then .jsonld)
lpd_candidates = [
    found
    for pattern in ("*.lpd", "*.jsonld")
    for found in Path(folder_path).rglob(pattern)
]
# Drop duplicates and put the paths in a stable order
lpd_files = sorted(set(lpd_candidates))

print(f"Searching for LiPD files under: {folder_path}")
print(f"Found {len(lpd_files)} candidate LiPD files")

# Create the LiPD container and fill it
temp12k_lipd = LiPD()
if lpd_files:
    temp12k_lipd.load(list(map(str, lpd_files)))
else:
    # Nothing matched the glob patterns: let pylipd scan the directory itself
    # as a last resort.
    print("No explicit LiPD files found via rglob; attempting load_from_dir as a fallback...")
    temp12k_lipd.load_from_dir(str(folder_path))

# Basic information about the compilation
temp12k_dataset_names = temp12k_lipd.get_all_dataset_names()
temp12k_dataset_ids = temp12k_lipd.get_all_dataset_ids()
temp12k_archive_types = temp12k_lipd.get_all_archiveTypes()

# Locations table returned by pylipd
temp12k_locations = temp12k_lipd.get_all_locations()

# Console summary
print(f"Loaded datasets: {len(temp12k_dataset_names)}")
print("First 5 dataset names:", temp12k_dataset_names[:5])
print("First 5 dataset ids:", temp12k_dataset_ids[:5])
print("Unique archive types:", sorted(pd.Series(temp12k_archive_types).unique()))
print("Locations table shape:", temp12k_locations.shape)

# Best-effort preview of the first rows; a failure here is reported but does
# not abort the cell
try:
    print(temp12k_locations.head(10))
except Exception as err:
    print(f"Could not preview locations DataFrame: {err}")


Searching for LiPD files under: ../data/Temp12k
Found 697 candidate LiPD files
Loading 697 LiPD files


100%|██████████| 697/697 [00:34<00:00, 19.97it/s]


Loaded..
Loaded datasets: 697
First 5 dataset names: ['117_723A.Godad.2011', '161_976.Martrat.2014', '165_1002C.Herbert.2000', '2005-804-006.Ledu.2010', '31Lake.Eisner.1995']
First 5 dataset ids: ['tmeZsxjY5oSP1dOcgrJW', 'XZ9fhwh8JAfLsrte80Nk', 'e4AgcBwpIZDtQ0ZX1tWH', 'y02VMsVJrvKNDtwCfrMg', 'h8odczAp1FYZNCGvwxff']
Unique archive types: ['Glacier ice', 'Ground ice', 'Lake sediment', 'Marine sediment', 'Midden', 'Peat', 'Speleothem', 'Wood']
Locations table shape: (697, 4)
              dataSetName  geo_meanLat  geo_meanLon  geo_meanElev
0     117_723A.Godad.2011      18.0518      57.6090        -806.0
1    161_976.Martrat.2014      36.2054      -4.3127       -1108.0
2  165_1002C.Herbert.2000      10.7061     -65.1694        -892.6
3  2005-804-006.Ledu.2010      68.9925    -106.5735        -118.0
4      31Lake.Eisner.1995      67.0500     -50.4700         115.0
5     3MPond.Pellatt.2000      49.9833    -121.2167        1950.0
6    850Lake.Shemesh.2001      68.3700      19.1200         8

@agent code filter the temp12k_locations dataframe for meanLat between 5 and 30 and return the answer in a new dataframe

In [None]:
# Filters the existing temp12k_locations DataFrame to rows whose mean latitude
# lies in the inclusive range [5, 30]. The latitude column is looked up as
# 'meanLat' or 'geo_meanLat' (the name shown in the locations preview printed
# when the table was loaded), exact match first and then case-insensitively.
# Latitude values are coerced to numeric so string-typed values are handled;
# values that cannot be parsed become NaN and are excluded by the range mask.
# The result is stored in a new DataFrame named temp12k_locations_meanLat_5_30
# and a brief summary is printed.
# Auto-generated by PaleoPal CodeGenerationAgent with contextual search

import pandas as pd

# Verify that temp12k_locations exists
try:
    temp12k_locations  # noqa: F821
except NameError as e:
    raise NameError("The variable 'temp12k_locations' is not defined. Please load the Temp12k locations DataFrame before filtering.") from e

# Identify the latitude column. Aliases are tried in order of preference, so
# a plain 'meanLat' column still wins when present; 'geo_meanLat' covers the
# column name actually present in the locations table.
lower_map = {str(c).lower(): c for c in temp12k_locations.columns}
lat_col = None
for alias in ('meanLat', 'geo_meanLat'):
    if alias in temp12k_locations.columns:
        lat_col = alias
        break
    if alias.lower() in lower_map:
        lat_col = lower_map[alias.lower()]
        break
if lat_col is None:
    raise KeyError(f"Could not find a 'meanLat' or 'geo_meanLat' column in temp12k_locations. Available columns: {list(temp12k_locations.columns)}")

# Coerce latitude values to numeric in case of string types
lat_values = pd.to_numeric(temp12k_locations[lat_col], errors='coerce')

# Build mask for inclusive range [5, 30]
mask = (lat_values >= 5) & (lat_values <= 30)

# Create the filtered DataFrame (copy so later edits do not touch the source)
temp12k_locations_meanLat_5_30 = temp12k_locations.loc[mask].copy()

# Optional: report summary
print("Filtered locations with meanLat between 5 and 30 (inclusive):")
print("Original shape:", temp12k_locations.shape)
print("Filtered shape:", temp12k_locations_meanLat_5_30.shape)
print(temp12k_locations_meanLat_5_30.head(10))
