In [None]:
!pip install -q langchain_core langchain-openai langgraph

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.5/140.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.2/47.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.6/223.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
%cd /content/drive/MyDrive/Agent

/content/drive/MyDrive/Agent


In [None]:
# LLM
from langchain_openai import ChatOpenAI
from google.colab import userdata
llm = ChatOpenAI(model="gpt-4o", api_key=userdata.get('OPENAI_API_KEY'))

In [None]:
from pydantic import BaseModel, Field
from typing import Optional, Literal

# Structured-output schema for intent extraction: this model is handed to
# `llm.with_structured_output(...)` and its fields are copied into the graph
# state by `extract_intent_node`.
# NOTE(review): documented with `#` comments rather than a class docstring so
# that the schema text derived from this model is left exactly as it was.
class QueryIntent(BaseModel):
    # Routing label; only the two literals below are accepted.
    goal: Literal["forecast", "research"] = Field(
        description="""
        'forecast': the user wants a contextual overview or analysis. May include NDVI forecasts and real-time data if a location is mentioned.
        'research': the user is asking a general knowledge question not tied to a specific place or data fetch.
        """
    )
    # Free-text place name taken from the query (later geocoded and used to rank stations).
    # NOTE(review): no default is supplied, so this Optional field is presumably
    # still required in the model's output (explicit None allowed) — confirm.
    location: Optional[str] = Field(
        description="The geographic location specified in the query, if any. Return None if not found."
    )
    # Two-letter state code; downstream it is the filter key for the station CSVs.
    # NOTE(review): same "Optional but no default" caveat as `location` — confirm.
    state: Optional[str] = Field(
    description="""
    The 2-letter U.S. state abbreviation (e.g., 'FL', 'TX') associated with the location,
    if known or inferable from the query. Helps scope station lookups regionally.
    Return None if the query doesn't reference a U.S. state.
    """
)

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage

system_prompt = SystemMessage(content="""
You are an intelligent assistant that extracts user intent from natural language.

Return:
- goal:
    - 'forecast' if the user is asking for an overview, condition, or analysis of mangroves, especially in a specific location.
    - 'research' if the user asks general questions like "What are mangroves?" or "Why are they important?"

- location:
    - If the user specifies a geographic location (e.g., city, region, island, or landmark), extract it.
    - Return None if no clear location is mentioned.

- state:
    - If the location is in the United States, extract the 2-letter state abbreviation (e.g., "FL" for Florida, "LA" for Louisiana).
    - Return None if the location is outside the U.S. or cannot be inferred.

Only use the allowed values for 'goal'. Do not guess location if it's unclear.
""")

structured_llm = llm.with_structured_output(QueryIntent)

In [None]:
def extract_intent_node(state: dict) -> dict:
    """Graph node: classify the user's query into goal / location / state.

    Reads ``state["user_query"]``, sends it together with the module-level
    ``system_prompt`` through ``structured_llm``, and returns the three parsed
    fields as a partial state update.

    Raises:
        KeyError: if ``user_query`` is absent from ``state``.
    """
    query_text = state["user_query"]
    parsed = structured_llm.invoke([system_prompt, query_text])
    return dict(goal=parsed.goal, location=parsed.location, state=parsed.state)


In [None]:
from typing_extensions import TypedDict

class State(TypedDict):
    """Minimal graph state holding only the intent-extraction fields.

    A fuller ``State`` TypedDict with the same name is declared further down
    in this notebook; once that cell runs it replaces this definition.
    """
    user_query: str  # raw question; read by extract_intent_node
    goal: Optional[str]  # written by extract_intent_node from QueryIntent.goal
    location: Optional[str]  # written by extract_intent_node from QueryIntent.location
    state: Optional[str]  # written by extract_intent_node from QueryIntent.state

In [None]:
# from langgraph.graph import StateGraph, START

# graph = StateGraph(State)

# graph.add_node("extract_intent", extract_intent_node)
# graph.add_edge(START, "extract_intent")

In [None]:
# Step 5: Simulate test input
test_state_1 = {
    "user_query": "How are mangroves doing in Key West?"
}
test_state_2 = {
    "user_query": "Why are mangroves important for climate change?"
}


# Step 6: Run the node
output_1 = extract_intent_node(test_state_1)
output_2 = extract_intent_node(test_state_2)

# Step 7: Print and verify output
print("=== Extracted Intent ===")
print(f"Output 1: {output_1}")
print(f"Output 2: {output_2}")


=== Extracted Intent ===
Output 1: {'goal': 'summary', 'location': 'Key West', 'state': 'FL'}
Output 2: {'goal': 'qa', 'location': None, 'state': None}


In [None]:
import requests

def geocode_location(location_name: str):
    """Resolve a free-text place name to ``(lat, lon)`` via Nominatim.

    Args:
        location_name: Place description, e.g. ``"Key West, FL"``.

    Returns:
        Tuple ``(lat, lon)`` of floats taken from the first search result.

    Raises:
        ValueError: if the service returns no results for the query.
        requests.RequestException: on network failure, timeout, or a non-2xx
            HTTP status.
    """
    response = requests.get(
        "https://nominatim.openstreetmap.org/search",
        # Let requests build the query string so characters such as '&', '#'
        # or '+' in the place name are percent-encoded instead of corrupting
        # the URL.
        params={"q": location_name, "format": "json"},
        headers={"User-Agent": "LangGraph-Mangrove-Agent"},
        # Without a timeout a stalled connection would hang the notebook.
        timeout=10,
    )
    # Surface HTTP errors directly rather than as a confusing JSON decode
    # failure on an error page.
    response.raise_for_status()

    results = response.json()
    if not results:
        raise ValueError(f"Could not geocode location: {location_name}")

    best = results[0]
    return (float(best["lat"]), float(best["lon"]))


In [None]:
from math import radians, sin, cos, sqrt, atan2

def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points.

    Args:
        lat1, lon1: First point, in decimal degrees.
        lat2, lon2: Second point, in decimal degrees.

    Returns:
        Distance along a sphere of radius 6371 km.
    """
    R = 6371  # Earth radius in km
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat/2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon/2)**2
    # Floating-point rounding can leave `a` a hair outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    return R * 2 * atan2(sqrt(a), sqrt(1 - a))


### Scrape Turbidity Stations (USGS)

In [None]:
# import requests
# import pandas as pd
# from io import StringIO
# import time

# # All U.S. state codes
# state_codes = [
#     "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID",
#     "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS",
#     "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK",
#     "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV",
#     "WI", "WY"
# ]

# base_url = "https://waterservices.usgs.gov/nwis/site/?format=rdb&parameterCd=63680&stateCd={}"

# all_sites = []

# for state in state_codes:
#     print(f"Fetching stations for {state}")
#     url = base_url.format(state)
#     r = requests.get(url)

#     # USGS RDB format: skip header lines that start with "#"
#     data = "\n".join(line for line in r.text.splitlines() if not line.startswith("#"))
#     df = pd.read_csv(StringIO(data), sep="\t")

#     # Only keep relevant fields
#     if not df.empty:
#         df = df[["site_no", "station_nm", "dec_lat_va", "dec_long_va"]]
#         df["state"] = state
#         all_sites.append(df)

#     time.sleep(1)  # be respectful of USGS servers

# # Combine and save
# turbidity_stations = pd.concat(all_sites, ignore_index=True)
# turbidity_stations.to_csv("usgs_turbidity_stations.csv", index=False)
# print("✅ Saved: usgs_turbidity_stations.csv")


In [None]:
# !rm -rf usgs_turbidity_stations.csv

In [None]:
# turbidity_stations = turbidity_stations[turbidity_stations['site_no']!='15s']

In [None]:
# turbidity_stations = turbidity_stations.dropna(subset=["dec_lat_va", "dec_long_va"])

In [None]:
# turbidity_stations = turbidity_stations.rename(columns={'station_nm':'name', 'dec_lat_va':'lat', 'dec_long_va':'long'})

In [None]:
# turbidity_stations['location'] = turbidity_stations['name'].str.split(',', n=1, expand=True)[0].str.title()
# turbidity_stations['location'] = turbidity_stations['name'].str.replace(r',.*$|, AL\.$| AL\.$', '', regex=True)

In [None]:
# turbidity_stations['name'] = turbidity_stations['location'] + ', ' + turbidity_stations['state']

In [None]:
# turbidity_stations = turbidity_stations[['site_no', 'name', 'location', 'state', 'lat', 'long']]

In [None]:
# turbidity_stations.head()

In [None]:
# turbidity_stations.to_csv("turbidity_stations.csv", index=False)

In [None]:
# turbidity_stations.info()

In [None]:
# turbidity_stations.describe()

In [None]:
# turbidity_stations = turbidity_stations.rename(columns={"site_no":"station_id"})
# turbidity_stations = turbidity_stations.drop(columns={'lat', 'long'})

In [None]:
# turbidity_stations.to_csv("turbidity_stations.csv", index=None)

### Scrape Wind Speed Stations (NOAA)

In [None]:
# wind_stations = pd.read_csv('wind_speed_stations.csv')

In [None]:
# wind_stations = wind_stations[['station_id', 'name']]

In [None]:
# wind_stations.tail()

In [None]:
# wind_stations[['location', 'state']] = wind_stations['name'].str.rsplit(',', n=1, expand=True)

# # Strip any whitespace from the new columns
# wind_stations['location'] = wind_stations['location'].str.strip()
# wind_stations['state'] = wind_stations['state'].str.strip()

In [None]:
# wind_stations = wind_stations.drop(columns={'lat', 'long'})

In [None]:
# wind_stations.head()

In [None]:
# wind_stations.to_csv("wind_speed_stations_data.csv", index=None)

In [None]:
# # Install required package if not already installed
# # !pip install geopy

# import pandas as pd
# from geopy.geocoders import Nominatim
# from geopy.exc import GeocoderTimedOut, GeocoderServiceError
# import time

# # Function to get coordinates with error handling and retry
# def get_coordinates(location, state, max_retries=3):
#     geolocator = Nominatim(user_agent="wind_station_geocoder")
#     search_query = f"{location}, {state}, USA"

#     for attempt in range(max_retries):
#         try:
#             location_info = geolocator.geocode(search_query)
#             if location_info:
#                 return location_info.latitude, location_info.longitude
#             # If no result, try with just the location name
#             if attempt == max_retries - 1:
#                 location_info = geolocator.geocode(location)
#                 if location_info:
#                     return location_info.latitude, location_info.longitude
#             time.sleep(1)  # Respect API rate limits
#         except (GeocoderTimedOut, GeocoderServiceError):
#             time.sleep(2)  # Wait longer on error

#     return None, None  # Return None if all attempts fail

# # Apply geocoding to the DataFrame
# def add_coordinates(df):
#     # Create new columns for latitude and longitude
#     df['lat'] = None
#     df['long'] = None

#     # Iterate through rows and geocode each location
#     for idx, row in df.iterrows():
#         lat, lng = get_coordinates(row['location'], row['state'])
#         df.at[idx, 'lat'] = lat
#         df.at[idx, 'long'] = lng
#         time.sleep(1)  # Be nice to the geocoding service

#         # Print progress
#         print(f"Geocoded: {row['location']}, {row['state']} → ({lat}, {lng})")

#     return df

# # Apply to your wind_stations DataFrame
# wind_stations_with_coords = add_coordinates(wind_stations)

# # Display results
# print(wind_stations_with_coords[['station_id', 'name', 'location', 'state', 'lat', 'long']])

In [None]:
# wind_stations_with_coords.to_csv("wind_stations_with_coords.csv", index=False)
# # turbidity_stations.to_csv("usgs_turbidity_stations.csv", index=False)

### Scrape Water Level Stations (NOAA)

In [None]:
# water_stations = pd.read_csv('water_level_stations.csv')

In [None]:
# water_stations = water_stations[['station_id', 'name']]

In [None]:
# water_stations = water_stations.dropna()
# water_stations['station_id'] = water_stations['station_id'].astype(int)

In [None]:
# water_stations.head()

In [None]:
# water_stations.tail()

In [None]:
# water_stations[['location', 'state']] = water_stations['name'].str.rsplit(',', n=1, expand=True)

# # Strip any whitespace from the new columns
# water_stations['location'] = water_stations['location'].str.strip()
# water_stations['state'] = water_stations['state'].str.strip()

In [None]:
# water_stations.to_csv("water_level_stations.csv", index=None)

# Process

In [None]:
import pandas as pd

def load_stations(csv_path: str, state_abbr: str) -> list[dict]:
    """Load station rows from a CSV and keep those in one state.

    Matching ignores case and surrounding whitespace on both the requested
    abbreviation and the CSV's ``state`` column. Rows with a missing state
    never match.

    Args:
        csv_path: Path to a station CSV that contains a ``state`` column.
        state_abbr: State code to keep, e.g. ``"FL"``.

    Returns:
        One dict per matching row (``DataFrame.to_dict(orient="records")``),
        with the original, un-normalized cell values.

    Raises:
        ValueError: if the CSV has no ``state`` column.
    """
    df = pd.read_csv(csv_path)

    if "state" not in df.columns:
        raise ValueError("Missing 'state' column in station CSV.")

    wanted = state_abbr.strip().upper()
    # astype(str) keeps the `.str` accessor usable even when the column was
    # parsed as all-NaN floats; notna() then excludes those rows explicitly.
    normalized = df["state"].astype(str).str.strip().str.upper()
    mask = df["state"].notna() & (normalized == wanted)
    return df[mask].to_dict(orient="records")  # for LLM ranking


In [None]:
def ask_llm_rank_stations(location: str, station_list: list[dict], variable: str) -> list[dict]:
    """Ask the LLM which stations are most relevant to a location.

    The station names are listed in a prompt, the raw reply is printed, and
    each reply line is mapped back onto an entry of ``station_list``.

    Args:
        location: Place the user asked about.
        station_list: Candidate station dicts; each must have a ``"name"`` key.
        variable: Sensor label used in the prompt and the printed header.

    Returns:
        Up to five distinct station dicts from ``station_list`` in the order
        the LLM listed them. Empty if ``station_list`` is empty or no reply
        line could be matched.
    """
    import re  # local import: `re` is not otherwise imported before this cell

    if not station_list:
        # Nothing to rank, so skip the (billable) LLM call entirely.
        return []

    station_names = [s['name'] for s in station_list]

    prompt = f"""
    The user is asking about: {location}
    Sensor type: {variable}

    Here are some stations in the same U.S. state:
    {station_names}

    Rank the top 5 stations that are most relevant or closest to the location. Respond with a list of names (copy them exactly from the list).
    """

    response = llm.invoke(prompt).content
    print(f"=== Top 5 Nearest {variable.title()} Station ===") # Raw LLM Output
    print(response)
    print()

    # Exact (case-insensitive) name -> index of the first station carrying it.
    exact_index = {}
    for idx, station in enumerate(station_list):
        exact_index.setdefault(station["name"].strip().lower(), idx)

    matched_indices = []
    for line in response.splitlines():
        # Drop list markers ("1.", "2)", "-", "*", "•") and any quoting or
        # markdown emphasis wrapped around the name.
        candidate = re.sub(r"^\s*(?:\d+[.)]|[-*•])\s*", "", line)
        candidate = candidate.strip().strip("\"'`*").strip().lower()
        if not candidate:
            continue

        # Prefer an exact name match; fall back to the substring test so a
        # slightly shortened name still resolves.
        idx = exact_index.get(candidate)
        if idx is None:
            idx = next(
                (i for i, s in enumerate(station_list) if candidate in s["name"].lower()),
                None,
            )

        # Skip unmatched lines (e.g. a preamble) and repeated stations.
        if idx is not None and idx not in matched_indices:
            matched_indices.append(idx)

    return [station_list[i] for i in matched_indices[:5]]


In [None]:
def select_station_by_location_node(state: dict) -> dict:
    """Graph node: choose wind and water-level stations for the query.

    When ``location`` or ``state`` is missing, the hard-coded default station
    IDs are returned and no candidate list is produced. Otherwise the station
    CSVs are filtered by state, the LLM ranks them, and the top-ranked ID per
    sensor becomes the primary station. If ranking yields nothing for a
    sensor, that sensor falls back to its default ID instead of raising
    IndexError.

    Returns:
        ``{"station_ids": {...}}`` on the default path, otherwise
        ``{"station_ids": {...}, "station_candidates": {...}}``.
    """
    # NOTE(review): "42020" does not look like the 7-digit IDs used for the
    # other stations in this notebook — confirm it is valid for the wind fetch.
    default_ids = {"wind": "42020", "water": "8723970"}

    location = state.get("location")
    state_abbr = state.get("state")

    if not location or not state_abbr:
        # Fallback to default station IDs
        return {"station_ids": dict(default_ids)}

    # Filter station lists by state
    wind_stations = load_stations("wind_speed_stations.csv", state_abbr)
    water_stations = load_stations("water_level_stations.csv", state_abbr)

    # Ask LLM to rank stations by proximity to 'location'
    ranked = {
        "wind": ask_llm_rank_stations(location, wind_stations, "wind"),
        "water": ask_llm_rank_stations(location, water_stations, "water level"),
    }

    station_ids = {}
    station_candidates = {}
    for sensor, ranked_stations in ranked.items():
        candidate_ids = [s["station_id"] for s in ranked_stations]
        if not candidate_ids:
            # No station in this state, or no reply line matched a known name.
            print(
                f"[select_stations] No {sensor} station matched for "
                f"{location}, {state_abbr}; using default {default_ids[sensor]}."
            )
            candidate_ids = [default_ids[sensor]]
        station_ids[sensor] = candidate_ids[0]
        station_candidates[sensor] = candidate_ids

    return {
        "station_ids": station_ids,
        "station_candidates": station_candidates,
    }


In [None]:
stations = load_stations("water_level_stations.csv", "FL")

print("=== Filtered Stations ===")
for station in stations:
    print(station)

=== Filtered Stations ===
{'station_id': 8720030, 'name': 'Fernandina Beach, FL', 'location': 'Fernandina Beach', 'state': 'FL'}
{'station_id': 8720218, 'name': 'Mayport (Bar Pilots Dock), FL', 'location': 'Mayport (Bar Pilots Dock)', 'state': 'FL'}
{'station_id': 8720219, 'name': 'Dames Point, FL', 'location': 'Dames Point', 'state': 'FL'}
{'station_id': 8720226, 'name': 'Southbank Riverwalk, St Johns River, FL', 'location': 'Southbank Riverwalk, St Johns River', 'state': 'FL'}
{'station_id': 8720357, 'name': 'I-295 Buckman Bridge, FL', 'location': 'I-295 Buckman Bridge', 'state': 'FL'}
{'station_id': 8721604, 'name': 'Trident Pier, Port Canaveral, FL', 'location': 'Trident Pier, Port Canaveral', 'state': 'FL'}
{'station_id': 8722670, 'name': 'Lake Worth Pier, Atlantic Ocean, FL', 'location': 'Lake Worth Pier, Atlantic Ocean', 'state': 'FL'}
{'station_id': 8722956, 'name': 'South Port Everglades, FL', 'location': 'South Port Everglades', 'state': 'FL'}
{'station_id': 8723214, 'name': 

In [None]:
location_query = "Key West"
sensor = "wind"

ranked = ask_llm_rank_stations(location_query, stations, sensor)

print("\n=== Ranked Stations ===")
for s in ranked:
    print(s)

=== Top 5 Nearest Wind Station ===
1. Key West, FL
2. Vaca Key, Florida Bay, FL
3. Virginia Key, FL
4. South Port Everglades, FL
5. Trident Pier, Port Canaveral, FL


=== Ranked Stations ===
{'station_id': 8724580, 'name': 'Key West, FL', 'location': 'Key West', 'state': 'FL'}
{'station_id': 8723970, 'name': 'Vaca Key, Florida Bay, FL', 'location': 'Vaca Key, Florida Bay', 'state': 'FL'}
{'station_id': 8723214, 'name': 'Virginia Key, FL', 'location': 'Virginia Key', 'state': 'FL'}
{'station_id': 8722956, 'name': 'South Port Everglades, FL', 'location': 'South Port Everglades', 'state': 'FL'}
{'station_id': 8721604, 'name': 'Trident Pier, Port Canaveral, FL', 'location': 'Trident Pier, Port Canaveral', 'state': 'FL'}


In [None]:
import requests

In [None]:
def fetch_wind_speed(station_id: str) -> Optional[float]:
    """Fetch the latest wind speed for a station from the NOAA datagetter API.

    Best-effort: every failure (bad HTTP status, missing or empty ``data``,
    any exception) is printed and reported as ``None``.

    Args:
        station_id: Station identifier sent as the ``station`` parameter.

    Returns:
        The ``"s"`` field of the first returned record as a float, or
        ``None`` when nothing usable came back.
    """
    endpoint = "https://api.tidesandcurrents.noaa.gov/api/prod/datagetter"
    query = dict(
        date="latest",
        station=station_id,
        product="wind",
        units="english",
        time_zone="gmt",
        format="json",
    )

    try:
        resp = requests.get(endpoint, params=query, timeout=5)
        if resp.ok:
            payload = resp.json()
            if "data" in payload and payload["data"]:
                return float(payload["data"][0]["s"])
            print(f"[{station_id}] No 'data' in wind response")
        else:
            print(f"[{station_id}] Bad response: {resp.status_code}")
    except Exception as exc:
        print(f"[{station_id}] Wind speed fetch error: {exc}")
    return None


In [None]:
def fetch_water_level(station_id: str) -> Optional[float]:
    """Fetch the latest water level (MLLW datum) from the NOAA datagetter API.

    Best-effort: HTTP errors, an absent or empty ``data`` list, and any other
    exception are printed and reported as ``None``.

    Args:
        station_id: Station identifier sent as the ``station`` parameter.

    Returns:
        The ``"v"`` field of the first returned record as a float, or
        ``None`` when nothing usable came back.
    """
    query = dict(
        date="latest",
        station=station_id,
        product="water_level",
        datum="MLLW",
        units="english",
        time_zone="gmt",
        format="json",
    )

    try:
        resp = requests.get(
            "https://api.tidesandcurrents.noaa.gov/api/prod/datagetter",
            params=query,
            timeout=5,
        )
        resp.raise_for_status()
        payload = resp.json()
        if not ("data" in payload and payload["data"]):
            print(f"[{station_id}] No data in response.")
            return None
        return float(payload["data"][0]["v"])
    except Exception as exc:
        print(f"[{station_id}] Water level fetch error: {exc}")
        return None


In [None]:
def try_fetch_with_fallback(fetch_func, candidate_ids: list[str]) -> Optional[float]:
    """Return the first non-``None`` reading among the candidate stations.

    ``fetch_func`` is called on each ID in order and evaluation stops at the
    first call that returns something other than ``None``.

    Args:
        fetch_func: Callable taking a station ID and returning a value or ``None``.
        candidate_ids: Station IDs to try, most preferred first.

    Returns:
        The first non-``None`` result, or ``None`` if every call returned
        ``None`` or the list is empty.
    """
    # map() is lazy, so stations after the first success are never queried.
    readings = map(fetch_func, candidate_ids)
    return next((value for value in readings if value is not None), None)


In [None]:
def fetch_environmental_data_node(state: dict) -> dict:
    """Graph node: fetch the latest wind speed and water level.

    For each sensor the ranked IDs in ``state["station_candidates"]`` are
    tried in order; when that key has no entry for the sensor, the single ID
    in ``state["station_ids"]`` is used instead.

    Returns:
        ``{"environmental_data": {"wind_speed": ..., "water_level": ...}}``
        where either value is ``None`` if no station produced a reading.

    Raises:
        KeyError: if ``station_ids`` (or its ``wind`` / ``water`` entry) is missing.
    """
    primary_ids = state["station_ids"]
    ranked_ids = state.get("station_candidates", {})

    readings = {}
    readings["wind_speed"] = try_fetch_with_fallback(
        fetch_wind_speed, ranked_ids.get("wind", [primary_ids["wind"]])
    )
    readings["water_level"] = try_fetch_with_fallback(
        fetch_water_level, ranked_ids.get("water", [primary_ids["water"]])
    )
    return {"environmental_data": readings}


In [None]:
# Known tide station
print(fetch_wind_speed("8723970"))

9.52


In [None]:
import ee
import datetime

ee.Authenticate()

# Initialize with your project
ee.Initialize(project='ee-lgharijanto123')

def get_cleaned_weekly_ndvi_series(lat: float, lon: float, days_back: int = 70) -> list[float]:
    """Return eight weekly mean NDVI values around a point, oldest first.

    Per-image NDVI is computed from MODIS/061/MOD09GA bands ``sur_refl_b02``
    and ``sur_refl_b01``, averaged over a 16 000 m buffer around the point,
    and the most recent 56 per-image values are folded into eight
    consecutive 7-value means.

    Args:
        lat: Latitude in decimal degrees.
        lon: Longitude in decimal degrees.
        days_back: How many days before today the image search starts.

    Returns:
        List of 8 floats; the last entry covers the newest 7 values.

    Raises:
        ValueError: if fewer than 56 non-null values are available.
    """
    weeks_needed = 8
    days_needed = weeks_needed * 7  # 56

    point = ee.Geometry.Point([lon, lat])
    study_area = point.buffer(16000)

    today = datetime.date.today()
    end = ee.Date(today.strftime('%Y-%m-%d'))
    start = end.advance(-days_back, 'day')

    def add_ndvi(image):
        ndvi = image.normalizedDifference(['sur_refl_b02', 'sur_refl_b01']).rename('NDVI')
        return image.addBands(ndvi)

    def extract_mean(image):
        mean = image.select('NDVI').reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=study_area,
            scale=500,
            maxPixels=1e9,
            bestEffort=True
        ).get('NDVI')
        return ee.Feature(None, {'ndvi': mean})

    # NOTE(review): the collection is assumed to come back oldest -> newest
    # with one image per day; nothing here enforces either — confirm.
    ndvi_series = (
        ee.ImageCollection('MODIS/061/MOD09GA')
        .filterBounds(study_area)
        .filterDate(start, end)
        .map(add_ndvi)
        .map(extract_mean)
        .filter(ee.Filter.notNull(['ndvi']))
    )

    ndvi_list = ndvi_series.aggregate_array('ndvi').getInfo()
    # NOTE(review): a normalized difference should already lie in [-1, 1], so
    # this rescale is presumably a no-op safeguard — confirm before removing.
    ndvi_floats = [v / 10000 if v > 1 else v for v in ndvi_list]

    if len(ndvi_floats) < days_needed:
        raise ValueError(f"Not enough data. Got {len(ndvi_floats)} daily values, need at least 56.")

    # Anchor the weekly windows at the newest sample. Chunking from the oldest
    # sample would discard up to six of the most recent values whenever the
    # series length is not a multiple of 7.
    recent = ndvi_floats[-days_needed:]
    return [sum(recent[i:i + 7]) / 7 for i in range(0, days_needed, 7)]


In [None]:
vals = get_cleaned_weekly_ndvi_series(25.9928, -81.3923)  # Pumpkin River
print(vals)

[0.4546419024603802, 0.5214106775183016, 0.32694591497867, 0.3648745033678752, 0.450423371305643, 0.5419387288275995, 0.5322891062851883, 0.3217713250057927]


In [None]:
def fetch_weekly_noaa_lags_chunked(station_id: str, product: str, total_days: int = 56) -> list[float]:
    """Fetch recent NOAA observations in date chunks and return weekly means.

    The window ``[today - total_days, today]`` (UTC dates) is split into
    consecutive, non-overlapping chunks of at most 19 days, each requested
    separately from the datagetter API. The combined series is resampled with
    ``resample("W")`` and averaged.

    Args:
        station_id: Station identifier sent as the ``station`` parameter.
        product: ``"wind"`` (reads column ``"s"``) or any other product,
            e.g. ``"water_level"`` (reads column ``"v"``).
        total_days: Length of the look-back window in days.

    Returns:
        The last 8 weekly means, oldest first. If fewer than 8 weeks have
        data, whatever is available is returned; ``[]`` if every chunk failed
        or was empty.
    """
    from datetime import datetime, timedelta, timezone
    import requests
    import pandas as pd

    max_chunk_days = 19  # inclusive days per request; three chunks cover the default window
    value_col = "s" if product == "wind" else "v"

    # datetime.utcnow() is deprecated; this is the timezone-aware equivalent.
    end = datetime.now(timezone.utc).date()
    start = end - timedelta(days=total_days)

    # Consecutive chunks that cover [start, end] exactly once. Previously the
    # three 18-day chunks stopped two days short of `end` and shared their
    # boundary dates, so those days were fetched (and averaged) twice.
    chunk_ranges = []
    cursor = start
    while cursor <= end:
        chunk_stop = min(cursor + timedelta(days=max_chunk_days - 1), end)
        chunk_ranges.append((cursor, chunk_stop))
        cursor = chunk_stop + timedelta(days=1)

    all_dfs = []

    for chunk_start, chunk_end in chunk_ranges:
        params = {
            "begin_date": chunk_start.strftime("%Y%m%d"),
            "end_date": chunk_end.strftime("%Y%m%d"),
            "station": station_id,
            "product": product,
            "datum": "MLLW" if product == "water_level" else None,
            "interval": "h",  # Hourly granularity
            "units": "english",
            "time_zone": "gmt",
            "format": "json"
        }

        try:
            r = requests.get("https://api.tidesandcurrents.noaa.gov/api/prod/datagetter",
                             # None-valued entries (datum for non-water products) are dropped.
                             params={k: v for k, v in params.items() if v is not None},
                             timeout=15)
            r.raise_for_status()
            data = r.json().get("data", [])
            if not data:
                continue

            df = pd.DataFrame(data)
            df["t"] = pd.to_datetime(df["t"])
            df.set_index("t", inplace=True)

            # Unparseable readings become NaN and are ignored by mean().
            df[value_col] = pd.to_numeric(df[value_col], errors="coerce")

            all_dfs.append(df)

        except Exception as e:
            # Best-effort: a failed chunk is reported and skipped.
            print(f"Chunk {chunk_start}–{chunk_end} failed: {e}")

    if not all_dfs:
        print(f"No valid data collected for {product}.")
        return []

    full_df = pd.concat(all_dfs).sort_index()
    # Guard against any timestamp delivered by more than one chunk.
    full_df = full_df[~full_df.index.duplicated(keep="first")]

    # Resample into weekly means (week ends Sunday by default)
    # NOTE(review): the first and last calendar weeks are usually partial, so
    # their means cover fewer days than the rest — confirm this is acceptable.
    weekly = full_df[value_col].resample("W").mean().dropna()

    if len(weekly) < 8:
        print(f"Only {len(weekly)} weekly values found for {product}, expected 8.")
        return weekly.tolist()  # Return whatever we have

    return weekly.tail(8).tolist()


In [None]:
wind_lags = fetch_weekly_noaa_lags_chunked("8724580", "wind")
water_lags = fetch_weekly_noaa_lags_chunked("8724580", "water_level")

print("WIND LAGS:", wind_lags)
print("WATER LAGS:", water_lags)

WIND LAGS: [8.901309523809525, 8.817380952380953, 9.440729166666667, 8.936190476190475, 8.258802083333334, 9.368333333333332, 8.63452380952381, 4.536041666666667]
WATER LAGS: [0.9006303571428571, 0.8182613095238096, 1.23991875, 1.1934892857142856, 1.0897671875000001, 0.9026880952380952, 1.023640476190476, 1.1889354166666666]


In [None]:
import pandas as pd

def build_feature_vector_node(state: dict) -> dict:
    """Graph node: assemble the 23-value model input from weekly history.

    Uses eight weekly values (oldest to newest) for water level, wind speed
    and NDVI. The vector is the current water level and wind speed followed
    by lags 1-7 of water level, wind speed and NDVI (2 + 21 = 23 values).

    Weekly wind / water history already present in ``state["wind_lags"]`` /
    ``state["water_lags"]`` is reused; otherwise it is fetched for the IDs in
    ``state["station_ids"]``. NDVI is always fetched for ``state["gps"]``.

    Returns:
        ``{"feature_vector": [...], "feature_df": DataFrame}`` on success, or
        ``{"feature_vector": [], "feature_df": None}`` when GPS is missing,
        any series has fewer than 8 values, or any step raises.
    """
    base_cols = ["tide_verified", "wind_speed", "ndvi"]

    try:
        gps = state.get("gps")
        if not gps:
            # Previously this surfaced only as a TypeError from unpacking None.
            print("Feature vector error: GPS coordinates missing.")
            return {"feature_vector": [], "feature_df": None}

        # Fetch 8 weeks (current + 7 lags); reuse history an upstream node has
        # already stored to avoid repeating the same NOAA requests.
        wind_vals = state.get("wind_lags") or fetch_weekly_noaa_lags_chunked(
            state["station_ids"]["wind"], "wind"
        )
        water_vals = state.get("water_lags") or fetch_weekly_noaa_lags_chunked(
            state["station_ids"]["water"], "water_level"
        )
        ndvi_vals = get_cleaned_weekly_ndvi_series(*gps)

        if len(wind_vals) < 8 or len(water_vals) < 8 or len(ndvi_vals) < 8:
            print("Insufficient weekly data.")
            return {"feature_vector": [], "feature_df": None}

        # Build DataFrame (already in chronological order: oldest → newest)
        df = pd.DataFrame({
            "tide_verified": water_vals,
            "wind_speed": wind_vals,
            "ndvi": ndvi_vals
        })

        # Add artificial weekly dates (most recent week = today)
        base_date = pd.to_datetime("today").normalize()
        df["date"] = [base_date - pd.Timedelta(weeks=i) for i in reversed(range(len(df)))]
        df.set_index("date", inplace=True)

        # Add lag features (1–7)
        for col in base_cols:
            for lag in range(1, 8):
                df[f"{col}_lag_{lag}"] = df[col].shift(lag)

        # Drop NaNs → only final row will be complete
        latest_row = df.dropna().iloc[-1]

        # 2 current values first, then the 21 lag columns grouped by variable.
        # NOTE(review): this order is assumed to match the model's training
        # columns — confirm against the saved scaler.
        features = latest_row[["tide_verified", "wind_speed"]].tolist()
        features += latest_row[
            [f"{col}_lag_{i}" for col in base_cols for i in range(1, 8)]
        ].tolist()

        print("[feature vector]", features)

        return {
            "feature_vector": features,
            "feature_df": df
        }

    except Exception as e:
        print("Feature vector error:", e)
        return {"feature_vector": [], "feature_df": None}


In [None]:
import joblib
import numpy as np

# Load once at the top level (recommended)
xgb_model = joblib.load("xgboost.pkl")
scaler = joblib.load("scaler.pkl")

def predict_ndvi_node(state: dict) -> dict:
    """Graph node: predict NDVI from ``state["feature_vector"]``.

    A vector of exactly 23 values is wrapped in a one-row DataFrame labelled
    with ``scaler.feature_names_in_``, standardized with the module-level
    ``scaler`` and passed to the module-level ``xgb_model``.

    Returns:
        ``{"ndvi_prediction": float}`` on success; ``{"ndvi_prediction": None}``
        when the vector is missing, empty, the wrong length, or any step raises.
    """
    try:
        vector = state.get("feature_vector")
        if vector and len(vector) == 23:
            frame = pd.DataFrame([vector], columns=scaler.feature_names_in_)

            # Standardize with the saved training scaler before predicting.
            scaled = scaler.transform(frame)

            pred = xgb_model.predict(scaled)[0]
            print("NDVI Prediction:", pred)
            return {"ndvi_prediction": float(pred)}

        print("Invalid or missing feature vector")
        return {"ndvi_prediction": None}

    except Exception as exc:
        print("Prediction error:", exc)
        return {"ndvi_prediction": None}


In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import Runnable

summary_prompt = ChatPromptTemplate.from_template("""
You are a coastal ecology assistant.

Given the following data:
- User question: {user_query}
- NDVI prediction: {ndvi_prediction}
- Wind speed: {wind_speed}
- Water level: {water_level}

Generate a clear, concise 5-6 sentence summary of the mangrove condition and any notable environmental trends. Always include all the numbers provided.
""")

summary_chain: Runnable = summary_prompt | llm  # assumes your LLM is available

def generate_summary_node(state: State) -> dict:
    """Graph node: have the LLM write a short summary of the results.

    Feeds the user query, NDVI prediction and latest wind / water readings
    into ``summary_chain``.

    Returns:
        ``{"summary": message}`` where ``message`` exposes ``.content``. On
        any failure the fallback text is wrapped in an ``AIMessage`` so both
        paths return the same kind of object; a bare string here would make
        the notebook's ``result['summary'].content`` raise AttributeError.
    """
    from langchain_core.messages import AIMessage  # same package as SystemMessage above

    try:
        readings = state["environmental_data"]
        message = summary_chain.invoke({
            "user_query": state["user_query"],
            "ndvi_prediction": state["ndvi_prediction"],
            "wind_speed": readings["wind_speed"],
            "water_level": readings["water_level"]
        })
        return {"summary": message}
    except Exception as e:
        print("[summary] Error:", e)
        return {"summary": AIMessage(content="Unable to generate summary.")}

In [None]:
def resolve_gps_from_location_node(state: dict) -> dict:
    """Graph node: geocode the extracted location into ``(lat, lon)``.

    The search text is ``"<location>, <state>"`` when a state abbreviation is
    present, otherwise the location alone.

    Returns:
        ``{"gps": (lat, lon)}`` on success; ``{"gps": None}`` when no location
        is available or geocoding raises (the error is printed).
    """
    place = state.get("location")
    region = state.get("state")

    print(f"[resolve_gps] location={place}, state={region}")

    if place:
        try:
            search_text = f"{place}, {region}" if region else place
            lat, lon = geocode_location(search_text)
            print(f"[resolve_gps] Geocoded → {lat}, {lon}")
            return {"gps": (lat, lon)}
        except Exception as exc:
            print("[resolve_gps] Error:", exc)
            return {"gps": None}

    print("[resolve_gps] Missing location.")
    return {"gps": None}


In [None]:
def fetch_ndvi_lags_node(state: dict) -> dict:
    """Graph node: store the seven oldest of the eight weekly NDVI values.

    Best-effort like the other fetch nodes: if GPS is missing or the NDVI
    lookup raises (for example its "not enough data" ValueError), the error is
    printed and an empty list is returned instead of aborting the graph run.

    Returns:
        ``{"ndvi_lags": [...]}`` with up to 7 floats, or ``{"ndvi_lags": []}``.
    """
    gps = state.get("gps")
    if not gps:
        print("[ndvi lags] GPS missing")
        return {"ndvi_lags": []}

    try:
        lat, lon = gps
        ndvi_lags = get_cleaned_weekly_ndvi_series(lat, lon)[:7]
    except Exception as e:
        print("[ndvi lags] Error:", e)
        return {"ndvi_lags": []}

    print("[ndvi lags]", ndvi_lags)
    return {"ndvi_lags": ndvi_lags}


In [None]:
def fetch_weekly_lags_node(state: dict) -> dict:
    """Graph node: fetch weekly wind and water-level history for the chosen stations.

    Station IDs come from ``state["station_ids"]``; a missing entry is passed
    on as ``None``. Both series are printed before being returned.

    Returns:
        ``{"wind_lags": [...], "water_lags": [...]}``.
    """
    station_ids = state.get("station_ids", {})
    history = {
        "wind_lags": fetch_weekly_noaa_lags_chunked(station_ids.get("wind"), "wind"),
        "water_lags": fetch_weekly_noaa_lags_chunked(station_ids.get("water"), "water_level"),
    }
    print("[wind lags]", history["wind_lags"])
    print("[water lags]", history["water_lags"])
    return history


In [None]:
from typing import TypedDict, Optional, Tuple, Dict, List

class State(TypedDict):
    """Shared state schema passed to ``StateGraph`` for the mangrove agent.

    Each node returns a partial dict whose keys are a subset of the fields
    below. Only ``user_query`` is supplied when the graph is invoked in this
    notebook; the other fields are written by the nodes.
    """
    # === User input & intent extraction ===
    user_query: str                             # Always provided by the user
    goal: Optional[str]                         # 'forecast' or 'research' (from QueryIntent)
    location: Optional[str]                     # e.g. "Key West"
    state: Optional[str]                        # e.g. "FL"; filter key for the station CSVs

    # === Location resolution ===
    gps: Optional[Tuple[float, float]]          # (lat, lon); None when geocoding fails

    # === Station selection ===
    # NOTE(review): the station rows printed in this notebook carry integer
    # station_id values, so these may hold ints rather than str — confirm.
    station_ids: Optional[Dict[str, str]]       # {wind: id, water: id}
    station_candidates: Optional[Dict[str, List[str]]]  # ranked ids per sensor, tried in order

    # === Real-time fetches
    # Values are None when no candidate station returned a reading.
    environmental_data: Optional[Dict[str, float]]  # {'wind_speed': val, 'water_level': val}

    # === Weekly history written by fetch_weekly_lags_node / fetch_ndvi_lags_node
    wind_lags: Optional[List[float]]            # weekly wind means, oldest first
    water_lags: Optional[List[float]]           # weekly water-level means, oldest first
    ndvi_lags: Optional[List[float]]            # first 7 of the 8 weekly NDVI means


    # === Feature engineering ===
    feature_vector: Optional[List[float]]       # Final model input ([] on failure)
    feature_df: Optional["pd.DataFrame"]        # Optional for debugging

    # === Model output ===
    ndvi_prediction: Optional[float]
    # NOTE(review): generate_summary_node stores the chain's return value and
    # the notebook reads `.content` from it, so this is presumably a message
    # object rather than a plain str — confirm and adjust the annotation.
    summary: Optional[str]


In [None]:
from langgraph.graph import StateGraph, END

# Initialize the graph over the shared State schema
workflow = StateGraph(State)

# Register all nodes before compile
workflow.add_node("extract_intent", extract_intent_node)
workflow.add_node("select_stations", select_station_by_location_node)
workflow.add_node("fetch_environmental_data", fetch_environmental_data_node)
workflow.add_node("resolve_gps", resolve_gps_from_location_node)
workflow.add_node("fetch_ndvi_lags", fetch_ndvi_lags_node)
workflow.add_node("fetch_weekly_lags", fetch_weekly_lags_node)
workflow.add_node("build_feature_vector", build_feature_vector_node)
workflow.add_node("predict_ndvi", predict_ndvi_node)
workflow.add_node("generate_summary", generate_summary_node)

# Linear start: intent extraction, then station selection
workflow.set_entry_point("extract_intent")
workflow.add_edge("extract_intent", "select_stations")

# Fan out into three parallel branches
workflow.add_edge("select_stations", "fetch_environmental_data")
workflow.add_edge("select_stations", "resolve_gps")
workflow.add_edge("select_stations", "fetch_weekly_lags")
workflow.add_edge("resolve_gps", "fetch_ndvi_lags")

# Join: a list of start nodes makes the target wait for ALL of them. With two
# separate single-source edges, build_feature_vector (and every node after it)
# would be scheduled once per incoming branch, because the NDVI branch is one
# step longer than the NOAA branch.
workflow.add_edge(["fetch_weekly_lags", "fetch_ndvi_lags"], "build_feature_vector")
workflow.add_edge("build_feature_vector", "predict_ndvi")

# generate_summary reads state["environmental_data"], so order it explicitly
# after that branch as well as after the prediction.
workflow.add_edge(["predict_ndvi", "fetch_environmental_data"], "generate_summary")
workflow.add_edge("generate_summary", END)

# Finalize the graph
app = workflow.compile()

In [None]:
result = app.invoke({"user_query": "How are mangroves doing in Key West?"})

=== Top 5 Nearest Wind Station ===
1. Vaca Key, Florida Bay, FL
2. Key West, FL
3. Fort Myers, FL
4. Port Manatee, FL
5. Middle Tampa Bay, FL

=== Top 5 Nearest Water Level Station ===
1. Key West, FL
2. Vaca Key, Florida Bay, FL
3. Virginia Key, FL
4. South Port Everglades, FL
5. Trident Pier, Port Canaveral, FL

[resolve_gps] location=Key West, state=FL
[resolve_gps] Geocoded → 24.5548262, -81.8020722
[wind lags] [6.448095238095238, 7.748333333333333, 6.576614583333334, 7.612083333333333, 6.287239583333334, 7.427142857142857, 6.8218452380952375, 3.656875]
[water lags] [0.9006303571428571, 0.8182613095238096, 1.23991875, 1.1934892857142856, 1.0897671875000001, 0.9026880952380952, 1.023640476190476, 1.1889354166666666]
[ndvi lags] [-0.29172347926953784, -0.2507707741115039, -0.39723508979789457, -0.17780758686841117, -0.16835179260007754, -0.3476457947141554, -0.3137031269951354]
[feature vector] [1.1889354166666666, 3.656875, 1.023640476190476, 0.9026880952380952, 1.0897671875000001, 

In [None]:
print(result['summary'].content)

In Key West, the current NDVI prediction for mangroves stands at 0.41, indicating moderate vegetation health. The wind speed is 9.52 mph, and the water level is at 2.052 feet. These conditions suggest stable, but not optimal, growth for mangroves, with environmental factors remaining within a typical range.
