In [1]:
import pandas as pd
import os

In [2]:
merged = pd.read_csv('../data/processed/training_data.csv')

# A checklist is one distinct (date, rounded latitude, rounded longitude) combination.
unique_checklists = merged.groupby(['date', 'lat_rounded', 'lon_rounded']).ngroups

# The expanded dataset holds one row per checklist per species, so the estimate
# is checklists x species. Count the species from the data (same .unique() call
# that later defines the species list) instead of hard-coding the number.
species_count = len(merged['COMMON NAME'].unique())

print(f"Unique checklists: {unique_checklists}")
print(f"Full dataset would be: {unique_checklists * species_count:,} rows")

Unique checklists: 21752
Full dataset would be: 10,614,976 rows


In [11]:
# Every distinct species name that appears in the observations.
unique_species = merged['COMMON NAME'].unique()

# Weather columns carried over per checklist via the 'first' aggregation.
weather_columns = [
    'temperature_2m_mean',
    'relative_humidity_2m_mean',
    'cloud_cover_mean',
    'precipitation_sum',
    'rain_sum',
    'wind_gusts_10m_mean',
    'wind_speed_10m_mean',
]

# Species names are gathered into a list (duplicates kept); the species column
# comes first so the resulting column order is unchanged.
checklist_aggregations = {'COMMON NAME': list}
checklist_aggregations.update({column: 'first' for column in weather_columns})

# One row per checklist, i.e. per (date, rounded latitude, rounded longitude).
checklists = (
    merged
    .groupby(['date', 'lat_rounded', 'lon_rounded'])
    .agg(checklist_aggregations)
    .reset_index()
)

In [12]:
# Preview the first checklists, then report how many there are.
print(checklists.head(), len(checklists), sep='\n')

         date  lat_rounded  lon_rounded  \
0  2025-01-01         36.1         -5.7   
1  2025-01-01         36.1         -5.6   
2  2025-01-01         36.1         -5.5   
3  2025-01-01         36.2         -6.0   
4  2025-01-01         36.2         -5.9   

                                         COMMON NAME  temperature_2m_mean  \
0  [Common Chiffchaff, Common Sandpiper, Common R...            11.896000   
1  [Eurasian Blackcap, Black Redstart, Common Chi...            12.761582   
2  [Eurasian Blackcap, Eurasian Blackcap, Eurasia...            13.400002   
3                  [Black-winged Kite, Corn Bunting]            11.109501   
4  [Audouin's Gull, Barn Swallow, Black-bellied P...            11.885417   

   relative_humidity_2m_mean  cloud_cover_mean  precipitation_sum  rain_sum  \
0                   71.84383         14.666667                0.0       0.0   
1                   74.26594         25.791666                0.0       0.0   
2                   70.52885         32.6

In [13]:
# Expand every checklist into one row per species, labelled seen = 1 when the
# species appears in that checklist's species list and 0 otherwise.

# Columns copied unchanged from the checklist onto each of its species rows,
# listed in the order they should appear in the output frame.
checklist_columns = [
    'date',
    'lat_rounded',
    'lon_rounded',
    'temperature_2m_mean',
    'relative_humidity_2m_mean',
    'cloud_cover_mean',
    'precipitation_sum',
    'rain_sum',
    'wind_gusts_10m_mean',
    'wind_speed_10m_mean',
]

rows = []

for _, checklist in checklists.iterrows():

    # The aggregated species list can repeat names; a set gives constant-time
    # membership checks in the inner loop instead of scanning the list per species.
    species_seen = set(checklist['COMMON NAME'])

    # Look the checklist-level values up once rather than once per species.
    shared_fields = {column: checklist[column] for column in checklist_columns}

    for species in unique_species:
        rows.append({
            **shared_fields,
            'species': species,
            'seen': 1 if species in species_seen else 0,
        })

# Build the frame in one go from the accumulated records.
training_df = pd.DataFrame(rows)

In [14]:
# Report the row count, a preview of the first rows, and the seen / not-seen balance.
for summary in (
    len(training_df),
    training_df.head(),
    training_df['seen'].value_counts(),
):
    print(summary)

10614976
         date  lat_rounded  lon_rounded  temperature_2m_mean  \
0  2025-01-01         36.1         -5.7               11.896   
1  2025-01-01         36.1         -5.7               11.896   
2  2025-01-01         36.1         -5.7               11.896   
3  2025-01-01         36.1         -5.7               11.896   
4  2025-01-01         36.1         -5.7               11.896   

   relative_humidity_2m_mean  cloud_cover_mean  precipitation_sum  rain_sum  \
0                   71.84383         14.666667                0.0       0.0   
1                   71.84383         14.666667                0.0       0.0   
2                   71.84383         14.666667                0.0       0.0   
3                   71.84383         14.666667                0.0       0.0   
4                   71.84383         14.666667                0.0       0.0   

   wind_gusts_10m_mean  wind_speed_10m_mean                    species  seen  
0                33.48            15.844834         

In [15]:
# Persist the expanded frame without the index column.
# NOTE(review): this is the same path the notebook loads with pd.read_csv at the
# top, so the write replaces that input file; a fresh Restart & Run All would then
# read this expanded frame instead of the original observations. Confirm intended.
training_output_path = '../data/processed/training_data.csv'
training_df.to_csv(training_output_path, index=False)
print("Saved!")

Saved!


In [16]:
# Parse the date column once, then derive the calendar features from the parsed values.
parsed_dates = pd.to_datetime(training_df['date'])

training_df['date'] = parsed_dates
training_df['day_of_year'] = parsed_dates.dt.dayofyear
training_df['month'] = parsed_dates.dt.month

In [17]:
# Spot-check the parsed date alongside the two derived calendar columns.
date_feature_columns = ['date', 'day_of_year', 'month']
print(training_df[date_feature_columns].head())

        date  day_of_year  month
0 2025-01-01            1      1
1 2025-01-01            1      1
2 2025-01-01            1      1
3 2025-01-01            1      1
4 2025-01-01            1      1


In [18]:
# Write the frame (now including the date-derived columns) without the index column.
# NOTE(review): the target is the same file the notebook reads with pd.read_csv at
# the top, so this replaces the notebook's own input. Confirm that is intended.
final_output_path = '../data/processed/training_data.csv'
training_df.to_csv(final_output_path, index=False)

saved_row_count = len(training_df)
print(f"Saved {saved_row_count} rows")

Saved 10614976 rows
