## Loading the Dataset sample.

In [60]:
import pandas as pd

In [61]:
bird_df = pd.read_csv("../data/raw/sample.csv", delimiter="\t")  # eBird uses tabs
print(bird_df.shape)
print(bird_df.columns.tolist())
bird_df.head()

(4863, 53)
['GLOBAL UNIQUE IDENTIFIER', 'LAST EDITED DATE', 'TAXONOMIC ORDER', 'CATEGORY', 'TAXON CONCEPT ID', 'COMMON NAME', 'SCIENTIFIC NAME', 'SUBSPECIES COMMON NAME', 'SUBSPECIES SCIENTIFIC NAME', 'EXOTIC CODE', 'OBSERVATION COUNT', 'BREEDING CODE', 'BREEDING CATEGORY', 'BEHAVIOR CODE', 'AGE/SEX', 'COUNTRY', 'COUNTRY CODE', 'STATE', 'STATE CODE', 'COUNTY', 'COUNTY CODE', 'IBA CODE', 'BCR CODE', 'USFWS CODE', 'ATLAS BLOCK', 'LOCALITY', 'LOCALITY ID', 'LOCALITY TYPE', 'LATITUDE', 'LONGITUDE', 'OBSERVATION DATE', 'TIME OBSERVATIONS STARTED', 'OBSERVER ID', 'OBSERVER ORCID ID', 'SAMPLING EVENT IDENTIFIER', 'OBSERVATION TYPE', 'PROTOCOL NAME', 'PROTOCOL CODE', 'PROJECT NAMES', 'PROJECT IDENTIFIERS', 'DURATION MINUTES', 'EFFORT DISTANCE KM', 'EFFORT AREA HA', 'NUMBER OBSERVERS', 'ALL SPECIES REPORTED', 'GROUP IDENTIFIER', 'HAS MEDIA', 'APPROVED', 'REVIEWED', 'REASON', 'CHECKLIST COMMENTS', 'SPECIES COMMENTS', 'Unnamed: 52']


Unnamed: 0,GLOBAL UNIQUE IDENTIFIER,LAST EDITED DATE,TAXONOMIC ORDER,CATEGORY,TAXON CONCEPT ID,COMMON NAME,SCIENTIFIC NAME,SUBSPECIES COMMON NAME,SUBSPECIES SCIENTIFIC NAME,EXOTIC CODE,...,NUMBER OBSERVERS,ALL SPECIES REPORTED,GROUP IDENTIFIER,HAS MEDIA,APPROVED,REVIEWED,REASON,CHECKLIST COMMENTS,SPECIES COMMENTS,Unnamed: 52
0,URN:CornellLabOfOrnithology:EBIRD:OBS2919749158,2025-03-01 23:25:39.781016,21333,species,avibase-69544B59,American Crow,Corvus brachyrhynchos,,,,...,2.0,1,G14142220,0,1,0,,,,
1,URN:CornellLabOfOrnithology:EBIRD:OBS2933256785,2025-03-05 07:43:54.669907,21333,species,avibase-69544B59,American Crow,Corvus brachyrhynchos,,,,...,1.0,1,,0,1,0,,,,
2,URN:CornellLabOfOrnithology:EBIRD:OBS3009418896,2025-03-30 17:15:29.683739,21333,species,avibase-69544B59,American Crow,Corvus brachyrhynchos,,,,...,2.0,1,G14355008,0,1,0,,,,
3,URN:CornellLabOfOrnithology:EBIRD:OBS2930040857,2025-03-04 07:42:07.176854,21333,species,avibase-69544B59,American Crow,Corvus brachyrhynchos,,,,...,1.0,1,,0,1,0,,,,
4,URN:CornellLabOfOrnithology:EBIRD:OBS3005325331,2025-03-29 12:31:48.25322,21333,species,avibase-69544B59,American Crow,Corvus brachyrhynchos,,,,...,1.0,1,,0,1,0,,,,


## Narrowing down to potentially useful columns.


In [62]:
bird_df[["COMMON NAME", "SCIENTIFIC NAME", "OBSERVATION COUNT", "LATITUDE", "LONGITUDE", "TIME OBSERVATIONS STARTED", "OBSERVATION TYPE", "DURATION MINUTES", "EFFORT DISTANCE KM"]]

Unnamed: 0,COMMON NAME,SCIENTIFIC NAME,OBSERVATION COUNT,LATITUDE,LONGITUDE,TIME OBSERVATIONS STARTED,OBSERVATION TYPE,DURATION MINUTES,EFFORT DISTANCE KM
0,American Crow,Corvus brachyrhynchos,2,33.155957,-87.633699,06:21:00,Traveling,24.0,1.092
1,American Crow,Corvus brachyrhynchos,1,33.188274,-87.539672,06:27:00,Stationary,15.0,
2,American Crow,Corvus brachyrhynchos,3,33.133521,-87.653418,15:11:00,Traveling,60.0,13.532
3,American Crow,Corvus brachyrhynchos,2,33.188274,-87.539672,06:24:00,Stationary,16.0,
4,American Crow,Corvus brachyrhynchos,4,33.425151,-87.605487,10:14:00,Stationary,48.0,
...,...,...,...,...,...,...,...,...,...
4858,Yellow-throated Warbler,Setophaga dominica,1,33.189768,-87.324228,11:26:00,Stationary,6.0,
4859,Yellow-throated Warbler,Setophaga dominica,2,33.222166,-87.599165,09:34:00,Traveling,56.0,1.207
4860,Yellow-throated Warbler,Setophaga dominica,2,33.108221,-87.650768,08:44:00,Traveling,43.0,7.854
4861,Yellow-throated Warbler,Setophaga dominica,25,33.018162,-87.368288,06:52:00,Traveling,130.0,10.578


## Exploring nulls
Found nulls in DURATION MINUTES (514) and EFFORT DISTANCE KM (1562).
These are expected—stationary protocols don't have distance, incidental don't have either.


In [63]:
bird_df[["COMMON NAME", "SCIENTIFIC NAME", "OBSERVATION COUNT", "LATITUDE", "LONGITUDE", "TIME OBSERVATIONS STARTED", "OBSERVATION TYPE", "DURATION MINUTES", "EFFORT DISTANCE KM"]].isnull().sum()

COMMON NAME                     0
SCIENTIFIC NAME                 0
OBSERVATION COUNT               0
LATITUDE                        0
LONGITUDE                       0
TIME OBSERVATIONS STARTED       0
OBSERVATION TYPE                0
DURATION MINUTES              514
EFFORT DISTANCE KM           1562
dtype: int64

## Is it useful to keep Observation Type/Count ?

Type can matter to rank the difficulty of observation.
The goal of the project is to know whether a bird can be observed or not, its count is not that important for a first version.

In [64]:
bird_df["OBSERVATION TYPE"].unique()

array(['Traveling', 'Stationary', 'Incidental'], dtype=object)

In [65]:
bird_df["OBSERVATION COUNT"].value_counts()

OBSERVATION COUNT
1      1898
2       965
3       475
4       354
5       232
6       172
8       127
10       80
7        72
15       50
12       48
20       44
50       35
9        33
13       32
X        29
25       28
30       21
11       17
40       15
35       15
75       13
14       11
16       11
18       11
100      10
60        9
21        7
17        6
22        6
120       5
300       3
27        3
200       3
150       2
32        2
55        2
65        2
51        2
19        2
80        2
42        2
58        1
52        1
26        1
45        1
500       1
23        1
24        1
Name: count, dtype: int64

## Negative observation

To train a model, we need to know if a Bird is also not there, as the absence means something in our training.
For "ALL SPECIES REPORTED" we can see that around 90% of the observation are equal to 1. We'll need to mark the absences as well.

In [66]:
bird_df["ALL SPECIES REPORTED"].value_counts()

ALL SPECIES REPORTED
1    4322
0     541
Name: count, dtype: int64

## Exploration of the main dataset that will be used for the model training.

We have to make sure the actual dataset has the same structure as the sample used in the first place.

In [67]:
bird_df = pd.read_csv("../data/raw/ebird_spain_2020-2025.txt", delimiter="\t", nrows=100)
print(bird_df.shape)
print(bird_df.columns.tolist())

(100, 53)
['GLOBAL UNIQUE IDENTIFIER', 'LAST EDITED DATE', 'TAXONOMIC ORDER', 'CATEGORY', 'TAXON CONCEPT ID', 'COMMON NAME', 'SCIENTIFIC NAME', 'SUBSPECIES COMMON NAME', 'SUBSPECIES SCIENTIFIC NAME', 'EXOTIC CODE', 'OBSERVATION COUNT', 'BREEDING CODE', 'BREEDING CATEGORY', 'BEHAVIOR CODE', 'AGE/SEX', 'COUNTRY', 'COUNTRY CODE', 'STATE', 'STATE CODE', 'COUNTY', 'COUNTY CODE', 'IBA CODE', 'BCR CODE', 'USFWS CODE', 'ATLAS BLOCK', 'LOCALITY', 'LOCALITY ID', 'LOCALITY TYPE', 'LATITUDE', 'LONGITUDE', 'OBSERVATION DATE', 'TIME OBSERVATIONS STARTED', 'OBSERVER ID', 'OBSERVER ORCID ID', 'SAMPLING EVENT IDENTIFIER', 'OBSERVATION TYPE', 'PROTOCOL NAME', 'PROTOCOL CODE', 'PROJECT NAMES', 'PROJECT IDENTIFIERS', 'DURATION MINUTES', 'EFFORT DISTANCE KM', 'EFFORT AREA HA', 'NUMBER OBSERVERS', 'ALL SPECIES REPORTED', 'GROUP IDENTIFIER', 'HAS MEDIA', 'APPROVED', 'REVIEWED', 'REASON', 'CHECKLIST COMMENTS', 'SPECIES COMMENTS', 'Unnamed: 52']


In [68]:
with open("../data/raw/ebird_spain_2020-2025.txt", "r") as f:
    row_count = sum(1 for _ in f)
print(f"Total rows: {row_count:,}")

Total rows: 31,863,328


In [69]:
cols_to_keep = [
    "COMMON NAME",
    "SCIENTIFIC NAME",
    "LATITUDE",
    "LONGITUDE",
    "OBSERVATION DATE",
    "TIME OBSERVATIONS STARTED",
    "OBSERVATION TYPE",
    "DURATION MINUTES",
    "EFFORT DISTANCE KM",
    "ALL SPECIES REPORTED",
    "SAMPLING EVENT IDENTIFIER"
]

bird_df = pd.read_csv(
    "../data/raw/ebird_spain_2020-2025.txt",
    delimiter="\t",
    usecols=cols_to_keep,
    nrows=1000000
)
print(bird_df.shape)
bird_df.head()

(1000000, 11)


Unnamed: 0,COMMON NAME,SCIENTIFIC NAME,LATITUDE,LONGITUDE,OBSERVATION DATE,TIME OBSERVATIONS STARTED,SAMPLING EVENT IDENTIFIER,OBSERVATION TYPE,DURATION MINUTES,EFFORT DISTANCE KM,ALL SPECIES REPORTED
0,Alpine Accentor,Prunella collaris,37.221562,-2.548854,2020-01-02,11:10:00,S62913350,Stationary,80.0,,1
1,Alpine Accentor,Prunella collaris,37.221562,-2.548854,2020-01-02,11:10:00,S109498522,Stationary,80.0,,1
2,Alpine Accentor,Prunella collaris,37.221562,-2.548854,2020-01-02,11:10:00,S65735544,Stationary,80.0,,1
3,Audouin's Gull,Ichthyaetus audouinii,36.823035,-2.295552,2020-01-14,09:07:00,S63418164,Traveling,100.0,6.0,1
4,Audouin's Gull,Ichthyaetus audouinii,36.762976,-2.224369,2020-01-06,,S73544507,Incidental,,,0


## We need to check how many combos we'll have to call with the API, as the limit is 10k per day.


In [70]:
bird_df["date"] = pd.to_datetime(bird_df["OBSERVATION DATE"])
bird_df["lat_rounded"] = bird_df["LATITUDE"].round(1)
bird_df["lon_rounded"] = bird_df["LONGITUDE"].round(1)

unique_combos = bird_df.groupby(["date", "lat_rounded", "lon_rounded"]).size()
print(f"Unique location-date combos: {len(unique_combos):,}")

Unique location-date combos: 33,981


Fortunately, we can do bulk calls of 1k different locations, making the number of calls we have to do much more manageable.

In [86]:
unique_coords = bird_df[["lat_rounded", "lon_rounded"]].drop_duplicates()
lats = unique_coords["lat_rounded"].tolist()
lons = unique_coords["lon_rounded"].tolist()

print(lats[:4], lons[:4])

[37.2, 36.8, 36.8, 37.2] [-2.5, -2.3, -2.2, -1.8]


## Testing the fetch weather tool.

In [None]:
from src.data.fetch_weather import fetch_weather

# Test with a few coordinates from your bird data
lat = [37.2, 36.8, 36.8, 37.2]
lon = [-2.5, -2.3, -2.2, -1.8]

# Now returns daily data directly (no need to aggregate hourly to daily)
weather_df = fetch_weather("2020-01-01", "2020-04-30", lat, lon)
print(weather_df.columns.tolist())
weather_df.head()

In [None]:
# Daily aggregation no longer needed - fetch_weather now returns daily data directly

In [None]:
# Check date format for joining with bird observations
print(weather_df["date"].head())
print(bird_df["OBSERVATION DATE"].head())

In [83]:
print(bird_df["OBSERVATION DATE"].min())
print(bird_df["OBSERVATION DATE"].max())

2020-01-01
2020-04-30


## Trying to make the API calls to get all the needed coordinates.

We need to make them batched and smaller and delayed otherwise we get time outed.

In [None]:
import time

dataframes = []

for i in range(0, len(lats), 100):
    print(f"Fetching batch {i//100 + 1} of {len(lats)//100 + 1}")
    chunk_lats = lats[i:i+100]
    chunk_lons = lons[i:i+100]
    dataframes.append(fetch_weather("2020-01-01", "2020-04-30", chunk_lats, chunk_lons))
    time.sleep(30)

In [109]:
print(len(dataframes))

10


In [110]:
# Concatenate the new batches and append to the existing file
weather_df = pd.concat(dataframes)
weather_df.to_csv("../data/processed/weather_partial.csv", mode='a', header=False, index=False)
print(f"Appended {len(weather_df):,} rows to weather_partial.csv")

Appended 2,904,000 rows to weather_partial.csv


In [102]:
print(len(bird_df))

1000000
