# Data Profiling of 2024 and 2025 saved JPGs from Bird Feeder

In [1]:
import pandas as pd
import gcs_inventory as gcsi

no module auth.py found with key google_json_key for gcs, assuming this is running from within GCP project


In [5]:
df_file_name = 'raw_archive-jpg-list.csv'

# Prefer the cached CSV inventory: listing the bucket is billed
# (.01 per 1000 operations, so about $0.80 per run).
try:
    df_raw = pd.read_csv(df_file_name)  # reload existing file
    print(f'Loading existing file {df_file_name}')
except FileNotFoundError:
    # Only a missing cache file triggers the rebuild. Any other failure
    # (malformed CSV, interrupt, typo) now surfaces instead of silently
    # starting a paid GCS listing.
    print('Reading images from GCS....')  # announce before the billed call starts
    df_raw = gcsi.get_archived_jpg_images(df_file_name)  # create from scratch

# structure raw data
# errors='raise' makes an unparseable timestamp fail here rather than become NaT.
df_raw['DateTime'] = pd.to_datetime(df_raw['DateTime'], errors='raise')
df_raw.to_csv(df_file_name, index=False)  # index=False keeps a stray index column out of the cache
print(f'Saved jpg list as {df_file_name}....')


Loading existing file raw_archive-jpg-list.csv
Saved jpg list as raw_archive-jpg-list.csv....


In [9]:
# profile df: dimensions, column labels, then the first five rows
n_rows_cols = df_raw.shape
print(f'df shape {n_rows_cols}')
print(df_raw.columns)
first_rows = df_raw.head()  # head() returns 5 rows by default
print(first_rows)

df shape (82371, 8)
Index(['Number', 'Name', 'Year', 'Month', 'Day', 'Hour', 'DateTime',
       'Image Name'],
      dtype='object')
   Number           Name  Year  Month  Day  Hour            DateTime  \
0       0  Mourning Dove  2023     12   17     8 2023-12-17 08:45:02   
1       1  House Sparrow  2023     12   17     8 2023-12-17 08:48:04   
2       2    House Finch  2023     12   17     8 2023-12-17 08:52:00   
3       3    House Finch  2023     12   17     8 2023-12-17 08:52:02   
4       4    House Finch  2023     12   17     8 2023-12-17 08:54:02   

                                Image Name  
0  2023-12-17-08-45-2231(MourningDove).jpg  
1  2023-12-17-08-48-4137(HouseSparrow).jpg  
2    2023-12-17-08-52-0844(HouseFinch).jpg  
3    2023-12-17-08-52-2750(HouseFinch).jpg  
4    2023-12-17-08-54-2356(HouseFinch).jpg  


In [17]:
# filter
def filter_by_year(df_raw, year: int):
    """Return an independent copy of the rows whose 'DateTime' falls in `year`.

    Prints the earliest and latest 'DateTime' of the selection and the
    number of rows kept.

    Args:
        df_raw: frame with a datetime-typed 'DateTime' column (the `.dt`
            accessor is used on it).
        year: calendar year to keep.

    Returns:
        A new DataFrame with the matching rows; `df_raw` is not modified.
    """
    print(f'Limiting list to {year} only....')
    in_year = df_raw['DateTime'].dt.year == year
    # Copy so later column assignments on the result do not raise
    # setting-on-a-slice warnings.
    selected = df_raw.loc[in_year].copy()
    stamps = selected['DateTime']
    print('')
    print(f'Starting date: {stamps.min()}')
    print(f'Ending date: {stamps.max()}')
    print(f'Number of Images: \t{len(selected)}\n')
    return selected

In [18]:
# profile 2024
df = filter_by_year(df_raw=df_raw, year=2024)
name_counts = df['Name'].value_counts()
# Labels seen 150 times or fewer in the year are listed as possible false positives.
is_rare = name_counts <= 150
rare_report = name_counts[is_rare]
common_report = name_counts[~is_rare]  # counts are integers, so ~(<= 150) is exactly (> 150)
print(f'Possible False Positives: \n{rare_report}')
print('')
print(f'Remaining Species: \n{common_report}')

Limiting list to 2024 only....

Starting date: 2024-01-01 08:35:04
Ending date: 2024-12-31 15:18:02
Number of Images: 	74849

Possible False Positives: 
Name
White-crowned Sparrow        123
Common GroundDove             99
White-throated Sparrow        88
Chipping Sparrow              80
Cedar Waxwing                 44
Indigo Bunting                36
Harris's Sparrow              35
Lark Sparrow                  29
Song Sparrow                  27
Black Phoebe                  19
Evening Grosbeak              15
Field Sparrow                 14
Vesper Sparrow                12
Chestnut-backed Chickadee     11
American Tree Sparrow         11
Lincoln's Sparrow              8
Snow Bunting                   5
Red-bellied Woodpecker         5
Carolina Wren                  4
White-breasted Nuthatch        4
Bewick's Wren                  3
California Towhee              3
Grasshopper Sparrow            2
Plain Chachalaca               2
Gray-headed Junco              2
Eurasian Collared

In [20]:
# profile 2025
df = filter_by_year(df_raw=df_raw, year=2025)
name_counts = df['Name'].value_counts()
# Labels seen 10 times or fewer in the year are listed as possible false positives.
is_rare = name_counts <= 10
rare_report = name_counts[is_rare]
common_report = name_counts[~is_rare]  # counts are integers, so ~(<= 10) is exactly (> 10)
print(f'Possible False Positives: \n{rare_report}')
print('')
print(f'Remaining Species: \n{common_report}')

Limiting list to 2025 only....

Starting date: 2025-01-01 08:35:05
Ending date: 2025-01-27 14:47:02
Number of Images: 	4575

Possible False Positives: 
Name
American Tree Sparrow     6
Purple Finch              6
5, 325, 404, 635          2
White-throated Sparrow    1
95, 276, 341, 438         1
                         ..
1, 304, 243, 442          1
1, 289, 236, 462          1
1, 308, 241, 498          1
1, 323, 228, 485          1
109, 291, 312, 443        1
Name: count, Length: 201, dtype: int64

Remaining Species: 
Name
House Finch               2163
House Sparrow             1490
Northern Cardinal          333
Mourning Dove              159
Black-capped Chickadee     101
Dark-eyed Junco             84
American Goldfinch          33
Name: count, dtype: int64


Unnamed: 0,Number,Name,Year,Month,Day,Hour,DateTime,Image Name
77796,77796,Black-capped Chickadee,2025,1,1,8,2025-01-01 08:35:05,2025-01-01-08-35-522(Black-cappedChickadee).jpg
77797,77797,House Sparrow,2025,1,1,8,2025-01-01 08:45:01,2025-01-01-08-45-153(HouseSparrow).jpg
77798,77798,House Finch,2025,1,1,8,2025-01-01 08:47:05,2025-01-01-08-47-555(HouseFinch).jpg
77799,77799,House Finch,2025,1,1,8,2025-01-01 08:48:00,2025-01-01-08-48-027(HouseFinch).jpg
77800,77800,House Finch,2025,1,1,8,2025-01-01 08:48:04,2025-01-01-08-48-4811(HouseFinch).jpg
77801,77801,House Finch,2025,1,1,8,2025-01-01 08:49:00,2025-01-01-08-49-0113(HouseFinch).jpg
77802,77802,House Finch,2025,1,1,8,2025-01-01 08:49:01,2025-01-01-08-49-1719(HouseFinch).jpg
77803,77803,House Finch,2025,1,1,8,2025-01-01 08:54:05,2025-01-01-08-54-5421(HouseFinch).jpg
77804,77804,Dark-eyed Junco,2025,1,1,9,2025-01-01 09:30:05,2025-01-01-09-30-5937(Dark-eyedJunco).jpg
77805,77805,House Finch,2025,1,1,9,2025-01-01 09:38:01,2025-01-01-09-38-1143(HouseFinch).jpg


In [24]:
import static_functions

# Spot-check common_name() on the two filename layouts present in the archive:
# one with a bracketed box after the species, one with the species alone.
# NOTE(review): the recorded output for the first name is '237, 286, 480, 526',
# i.e. the box coordinates rather than the species; the 2025 profile contains
# 'Name' values of the same shape. Confirm how common_name() treats this layout.
sample_names = (
    'raw_2025-01-16-08-56-014(HouseSparrow [(237, 286, 480, 526)]).jpg',
    '2023-12-17-08-45-2231(MourningDove).jpg',
)
for birdn in sample_names:
    print(static_functions.common_name(birdn))

237, 286, 480, 526
MourningDove
