In [2]:
import geopandas as gpd
import pandas as pd 
from shapely import wkt
import numpy as np

# Site polygons: one row per site with a MULTIPOLYGON geometry.
original_df = gpd.read_file("WildflowerBlooms_AreaOfInterest.geojson")

# The four time-series CSVs, read in the same order as the names they are
# unpacked into.
bloom_df, non_bloom_before_df, non_bloom_after_df, non_site_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'bloomwatch_full_timeseries_2020_2025.csv',
        'non_bloomwatch_full_timeseries_2020_2025_before.csv',
        'non_bloomwatch_full_timeseries_2020_2025_after.csv',
        'non_bloom_sites_timeseries_2020_2025.csv',
    )
)

# Display the site table as the cell output.
original_df

Unnamed: 0,id,Site,Type,Season,Area,geometry
0,,Chino Hills,Wild,Spring,25382.057,"MULTIPOLYGON (((-117.68286 33.87223, -117.7196..."
1,,Carrizo Plain National Monument,Wild,Spring,354751.514,"MULTIPOLYGON (((-119.50857 34.87044, -119.6043..."
2,,Antelope Valley California Poppy Reserve,Wild,Spring,25182.333,"MULTIPOLYGON (((-118.50267 34.69779, -118.5023..."
3,,Laura and Jack Dangermond Reserve,Wild,Spring,9027.693,"MULTIPOLYGON (((-120.46129 34.56679, -120.4353..."
4,,Sedgwick Reserve,Wild,Spring,9511.6,"MULTIPOLYGON (((-119.95777 34.73243, -119.9831..."
5,,Anza-Borrego Desert State Park,Wild,Spring,190599.381,"MULTIPOLYGON (((-116.29727 33.17188, -116.2469..."
6,,Montaña de Oro State Park,Wild,Spring,5242.552,"MULTIPOLYGON (((-120.82595 35.22122, -120.7607..."
7,,Figueroa Mountain,Wild,Spring,30886.21,"MULTIPOLYGON (((-119.7979 34.7594, -119.71634 ..."
8,,Red Hills Recreational Managment Area,Wild,Spring,29205.903,"MULTIPOLYGON (((-120.46038 37.9258, -120.40525..."
9,,Jepson Prairie Preserve,Wild,Spring,9820.795,"MULTIPOLYGON (((-121.80033 38.30892, -121.7543..."


In [3]:
# Stack the "before" and "after" non-bloom time series into a single frame
# with a fresh 0..n-1 index. Site coordinates are attached in a later cell.
non_bloom_df = pd.concat(
    objs=[non_bloom_before_df, non_bloom_after_df],
    ignore_index=True,
)
non_bloom_df

Unnamed: 0,id,Site,Type,date,year,NDVI,NDWI,cloud_cover
0,20,San Carlos Reservation,Wild,2020-01-15,2020,0.111064,-0.135954,56.70
1,20,San Carlos Reservation,Wild,2020-01-31,2020,0.127861,-0.149734,1.78
2,20,San Carlos Reservation,Wild,2020-02-16,2020,0.137321,-0.160113,6.13
3,18,Picacho Peak State Park,Wild,2020-01-06,2020,0.130126,-0.155960,1.47
4,20,San Carlos Reservation,Wild,2020-01-06,2020,0.108433,-0.136692,1.47
...,...,...,...,...,...,...,...,...
6632,3,Bear Valley Wildflower,Wild,2025-07-21,2025,0.170254,-0.223696,62.03
6633,3,Bear Valley Wildflower,Wild,2025-08-06,2025,0.134574,-0.175650,50.22
6634,3,Bear Valley Wildflower,Wild,2025-08-22,2025,0.149585,-0.207549,41.37
6635,3,Bear Valley Wildflower,Wild,2025-09-07,2025,0.218527,-0.325255,66.17


In [4]:
def get_centroid_from_wkt(wkt_string):
    """Return the centroid of a WKT geometry as an ``(x, y)`` tuple.

    Args:
        wkt_string: Geometry in WKT text form.

    Returns:
        ``(centroid.x, centroid.y)`` of the parsed geometry, or
        ``(np.nan, np.nan)`` if parsing or the centroid lookup fails.
    """
    try:
        centroid = wkt.loads(wkt_string).centroid
        return centroid.x, centroid.y
    except Exception:
        # Best-effort: one bad geometry yields NaNs instead of aborting the
        # whole column. Catching Exception rather than using a bare except
        # lets KeyboardInterrupt / SystemExit propagate.
        return np.nan, np.nan
    
# Serialize each site geometry to WKT so it can be passed to
# get_centroid_from_wkt.
original_df['geometry_wkt'] = original_df.geometry.to_wkt()

# Parse every geometry exactly once; the same (x, y) tuples feed both the
# new columns and the lookup dict below. x/y are stored as longitude/latitude.
site_centroids = [
    get_centroid_from_wkt(wkt_str) for wkt_str in original_df['geometry_wkt']
]
original_df['longitude'] = [centroid[0] for centroid in site_centroids]
original_df['latitude'] = [centroid[1] for centroid in site_centroids]

# Site name -> (longitude, latitude). If a site name repeats, the last row wins.
site_coordinate_map = dict(zip(original_df['Site'], site_centroids))

print("Site coordinate mapping:")
print(site_coordinate_map)

Site coordinate mapping:
{'Chino Hills': (-117.77741722644762, 33.95166982313173), 'Carrizo Plain National Monument': (-119.84188601188094, 35.14694248226192), 'Antelope Valley California Poppy Reserve': (-118.38375469959021, 34.71508861411871), 'Laura and Jack Dangermond Reserve': (-120.4499101041053, 34.509933096785765), 'Sedgwick Reserve': (-120.02672551703671, 34.71413309520678), 'Anza-Borrego Desert State Park': (-116.17218600715876, 32.9601360505044), 'Montaña de Oro State Park': (-120.81383400040781, 35.20683027405567), 'Figueroa Mountain': (-119.75271879837472, 34.69766407453087), 'Red Hills Recreational Managment Area': (-120.47149701124106, 37.847251852339554), 'Jepson Prairie Preserve': (-121.82979211875876, 38.26395459392311), 'Bear Valley Wildflower': (-122.42961255408602, 39.122353957117646), 'North Table Mountain Ecological Reserve': (-121.55597836570155, 39.5934372299187), 'Channel Islands National Park': (-119.97285014044256, 33.998418819119095), 'Black Canyon City': (

In [5]:
# Split the (longitude, latitude) tuples into one lookup per coordinate so
# each can be used directly with Series.map.
site_longitude = {site: coords[0] for site, coords in site_coordinate_map.items()}
site_latitude = {site: coords[1] for site, coords in site_coordinate_map.items()}

# Attach coordinates with a vectorized lookup instead of a per-row
# iterrows()/.at loop. Only rows whose Site is in the map are written, so
# rows for unknown sites keep whatever value they had (NaN for a new column).
for frame in (bloom_df, non_bloom_df):
    known_site = frame["Site"].isin(list(site_coordinate_map))
    if not known_site.any():
        continue  # nothing to write; do not create empty coordinate columns
    matched_sites = frame.loc[known_site, "Site"]
    frame.loc[known_site, "longitude"] = matched_sites.map(site_longitude)
    frame.loc[known_site, "latitude"] = matched_sites.map(site_latitude)



In [6]:
# Remove the Type / id (and, for non_site_df, location_type) columns from each
# frame. drop() raises KeyError for a missing label, so re-running this cell
# without reloading the frames will fail.
bloom_df = bloom_df.drop(columns=["Type", "id"])
non_bloom_df = non_bloom_df.drop(columns=["Type", "id"])
non_site_df = non_site_df.drop(columns=["Type", "location_type", "id"])

In [16]:
# Target label: 1 for bloom observations, 0 for both non-bloom sources.
for labelled_frame, bloom_label in ((bloom_df, 1), (non_bloom_df, 0), (non_site_df, 0)):
    labelled_frame['bloom'] = bloom_label

# Stack the three sources into one table with a fresh index.
training_dataset = pd.concat(
    objs=[bloom_df, non_bloom_df, non_site_df],
    ignore_index=True,
)

# Calendar features derived from the parsed observation date. The tuple order
# is the assignment order, which fixes where new columns land in the frame.
training_dataset['date'] = pd.to_datetime(training_dataset['date'])
date_parts = training_dataset['date'].dt
for feature_name, feature_values in (
    ('month', date_parts.month),
    ('day_of_year', date_parts.dayofyear),
    ('year', date_parts.year),
    ('day_of_month', date_parts.day),
):
    training_dataset[feature_name] = feature_values

# One 0/1 flag per season, keyed by the month numbers it covers.
season_months = {
    'is_spring': [3, 4, 5],     # March-May
    'is_summer': [6, 7, 8],     # June-August
    'is_fall': [9, 10, 11],     # September-November
    'is_winter': [12, 1, 2],    # December-February
}
for flag_name, months in season_months.items():
    training_dataset[flag_name] = training_dataset['month'].isin(months).astype(int)

# Summary of the assembled dataset.
print("Enhanced dataset ready!")
print(f"Dataset shape: {training_dataset.shape}")
print(f"Columns: {training_dataset.columns.tolist()}")
print(f"Bloom distribution: {training_dataset['bloom'].value_counts()}")
print("\nSample data:")
preview_columns = ['Site', 'date', 'month', 'day_of_year', 'NDVI', 'bloom']
print(training_dataset[preview_columns].head(10))

# Export everything except the raw datetime column.
training_dataset_ml = training_dataset.drop(columns='date')
print(f"ML-ready dataset shape: {training_dataset_ml.shape}")
training_dataset_ml.to_csv("training_dataset_ml.csv", index=False)

Enhanced dataset ready!
Dataset shape: (11401, 16)
Columns: ['Site', 'date', 'year', 'NDVI', 'NDWI', 'cloud_cover', 'longitude', 'latitude', 'bloom', 'month', 'day_of_year', 'day_of_month', 'is_spring', 'is_summer', 'is_fall', 'is_winter']
Bloom distribution: bloom
0    9055
1    2346
Name: count, dtype: int64

Sample data:
                               Site       date  month  day_of_year      NDVI  \
0            San Carlos Reservation 2020-03-03      3           63  0.150269   
1            San Carlos Reservation 2020-03-19      3           79  0.225451   
2            San Carlos Reservation 2020-04-04      4           95  0.167870   
3            San Carlos Reservation 2020-04-20      4          111  0.155614   
4            San Carlos Reservation 2020-05-06      5          127  0.143641   
5            San Carlos Reservation 2020-05-22      5          143  0.135595   
6           Picacho Peak State Park 2020-04-27      4          118  0.096502   
7            San Carlos Reservatio