# Data Profiling of 2024 and 2025 saved JPGs from Bird Feeder

In [8]:
import pandas as pd
import gcs_inventory as gcsi

In [9]:
# Load the inventory of archived JPGs: reuse the local CSV cache when it exists,
# otherwise rebuild the list from GCS (billed per operation, so avoid when possible).
df_file_name = 'raw_archive-jpg-list.csv'
try:
    df_raw = pd.read_csv(df_file_name)  # reload existing file
    print(f'Loading existing file {df_file_name}')
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty cache file should trigger the paid GCS listing.
    # Any other failure (malformed CSV, Ctrl-C, ...) now propagates instead of
    # being swallowed by a bare `except:` and silently costing a GCS run.
    print('Reading images from GCS....')  # announce before the slow call, not after it
    df_raw = gcsi.get_archived_jpg_images(df_file_name)  # .01 per 1000 operations so about $0.80 per run, create from scratch

# clean the data: drop rows whose species name contains no letters (digits only) or is missing.
# .copy() makes the filtered frame independent of the original so the column
# assignment below cannot raise SettingWithCopyWarning.
df_raw = df_raw[df_raw['Name'].str.contains(r'[a-zA-Z]', na=False)].copy()  # na=False handles NaN values

# structure raw data: parse timestamps, failing loudly on any malformed value
df_raw['DateTime'] = pd.to_datetime(df_raw['DateTime'], errors='raise')
df_raw.to_csv(df_file_name, index=False)
print(f'Saved jpg list as {df_file_name}....')


Reading images from GCS....
Saved jpg list as raw_archive-jpg-list.csv....


In [10]:
# profile df: dimensions, column labels, then a five-row preview (one item per line)
print(
    f'df shape {df_raw.shape}',
    df_raw.columns,
    df_raw.head(5),
    sep='\n',
)

df shape (84017, 8)
Index(['Number', 'Name', 'Year', 'Month', 'Day', 'Hour', 'DateTime',
       'Image Name'],
      dtype='object')
   Number           Name  Year  Month  Day  Hour            DateTime  \
0       0  Mourning Dove  2023     12   17     8 2023-12-17 08:45:02   
1       1  House Sparrow  2023     12   17     8 2023-12-17 08:48:04   
2       2    House Finch  2023     12   17     8 2023-12-17 08:52:00   
3       3    House Finch  2023     12   17     8 2023-12-17 08:52:02   
4       4    House Finch  2023     12   17     8 2023-12-17 08:54:02   

                                Image Name  
0  2023-12-17-08-45-2231(MourningDove).jpg  
1  2023-12-17-08-48-4137(HouseSparrow).jpg  
2    2023-12-17-08-52-0844(HouseFinch).jpg  
3    2023-12-17-08-52-2750(HouseFinch).jpg  
4    2023-12-17-08-54-2356(HouseFinch).jpg  


In [11]:
# filter
def filter_by_year(df_raw, year: int):
    """Return an independent copy of the rows whose 'DateTime' falls in `year`.

    Also prints the earliest and latest timestamp and the row count of the
    selection.

    Args:
        df_raw: DataFrame with a datetime-typed 'DateTime' column (the `.dt`
            accessor is used on it).
        year: calendar year to keep.

    Returns:
        A new DataFrame (original index labels preserved) holding only the
        matching rows; modifying it does not affect `df_raw`.
    """
    print(f'Limiting list to {year} only....')
    in_year = df_raw['DateTime'].dt.year.eq(year)
    # copy so callers can assign into the result without slice warnings
    df = df_raw.loc[in_year].copy()
    timestamps = df['DateTime']
    print('')
    print(f'Starting date: {timestamps.min()}')
    print(f'Ending date: {timestamps.max()}')
    print(f'Number of Images: \t{len(df)}\n')
    return df

In [12]:
# profile 2024: species counted 150 times or fewer are listed as possible
# false positives; everything above that threshold is listed separately
df = filter_by_year(df_raw=df_raw, year=2024)
name_counts = df['Name'].value_counts()
print(
    f'Possible False Positives: \n{name_counts.loc[name_counts.le(150)]}',
    '',
    f'Remaining Species: \n{name_counts.loc[name_counts.gt(150)]}',
    sep='\n',
)

Limiting list to 2024 only....

Starting date: 2024-01-01 08:35:04
Ending date: 2024-12-31 15:18:02
Number of Images: 	74849

Possible False Positives: 
Name
White-crowned Sparrow        123
Common GroundDove             99
White-throated Sparrow        88
Chipping Sparrow              80
Cedar Waxwing                 44
Indigo Bunting                36
Harris's Sparrow              35
Lark Sparrow                  29
Song Sparrow                  27
Black Phoebe                  19
Evening Grosbeak              15
Field Sparrow                 14
Vesper Sparrow                12
Chestnut-backed Chickadee     11
American Tree Sparrow         11
Lincoln's Sparrow              8
Snow Bunting                   5
Red-bellied Woodpecker         5
Carolina Wren                  4
White-breasted Nuthatch        4
Bewick's Wren                  3
California Towhee              3
Grasshopper Sparrow            2
Plain Chachalaca               2
Gray-headed Junco              2
Eurasian Collared

In [15]:
# profile 2025: species counted 5 times or fewer are listed as possible
# false positives; everything above that threshold is listed separately
df = filter_by_year(df_raw=df_raw, year=2025)
name_counts = df['Name'].value_counts()
print(
    f'Possible False Positives: \n{name_counts.loc[name_counts.le(5)]}',
    '',
    f'Remaining Species: \n{name_counts.loc[name_counts.gt(5)]}',
    sep='\n',
)

Limiting list to 2025 only....

Starting date: 2025-01-01 08:35:05
Ending date: 2025-01-27 14:47:02
Number of Images: 	6221

Possible False Positives: 
Name
White-throated Sparrow    1
Name: count, dtype: int64

Remaining Species: 
Name
House Finch               2988
House Sparrow             2198
Northern Cardinal          445
Mourning Dove              246
Dark-eyed Junco            161
Black-capped Chickadee     128
American Goldfinch          34
American Tree Sparrow       12
Purple Finch                 8
Name: count, dtype: int64


In [14]:
# Disabled scratch check: would print static_functions.common_name() for two
# sample image file names (one with a 'raw_' prefix, one without).
# NOTE(review): this cell is commented-out code only — consider deleting it or
# moving the check into a test for static_functions.
# import static_functions
# birdn = 'raw_2025-01-16-14-13-37824(HouseSparrow).jpg'
# print(static_functions.common_name(birdn))

# birdn = '2023-12-17-08-45-2231(MourningDove).jpg'
# print(static_functions.common_name(birdn))