In [None]:
#%pip install vit-pytorch

Collecting vit-pytorch
  Downloading vit_pytorch-1.10.1-py3-none-any.whl.metadata (69 kB)
     ---------------------------------------- 0.0/69.7 kB ? eta -:--:--
     ----- ---------------------------------- 10.2/69.7 kB ? eta -:--:--
     ---------------- --------------------- 30.7/69.7 kB 435.7 kB/s eta 0:00:01
     -------------------------------------- 69.7/69.7 kB 540.4 kB/s eta 0:00:00
Collecting einops>=0.7.0 (from vit-pytorch)
  Downloading einops-0.8.1-py3-none-any.whl.metadata (13 kB)
Downloading vit_pytorch-1.10.1-py3-none-any.whl (140 kB)
   ---------------------------------------- 0.0/140.8 kB ? eta -:--:--
   ---------------------------------------- 140.8/140.8 kB 2.8 MB/s eta 0:00:00
Downloading einops-0.8.1-py3-none-any.whl (64 kB)
   ---------------------------------------- 0.0/64.4 kB ? eta -:--:--
   ---------------------------------------- 64.4/64.4 kB 3.4 MB/s eta 0:00:00
Installing collected packages: einops, vit-pytorch
Successfully installed einops-0.8.1 vit-pyt


[notice] A new release of pip is available: 23.3.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### Data Cleaning

To be determined as needed

In [1]:
##########################################################################################
#                               Data Cleaning                                            #
##########################################################################################

import pandas as pd
import os
import shutil

base_path = r"C:\Users\admin\OneDrive\Documents\results"
output_path = r"C:\Users\admin\OneDrive\Documents\filtered_species"
os.makedirs(output_path, exist_ok=True)

species_list = [
    "Anas_platyrhynchos", "Ardea_herodias", "Bombycilla_cedrorum", "Branta_canadensis",
    "Bubo_virginianus", "Buteo_jamaicensis", "Calidris_alba", "Cardinalis_cardinalis",
    "Cathartes_aura", "Coragyps_atratus", "Corvus_cornix", "Dryocopus_pileatus",
    "Falco_sparverius", "Icterus_galbula", "Parus_major", "Passer_domesticus",
    "Quiscalus_mexicanus", "Sialia_sialis", "Sturnus_vulgaris", "Turdus_merula",
    "Turdus_migratorius", "Zenaida_macroura"
]

for species in species_list:
    species_name = species.replace("_", " ")
    metadata_file = os.path.join(base_path, f"{species}_metadata.csv")
    image_folder = os.path.join(base_path, f"{species}_images")
    output_image_folder = os.path.join(output_path, f"{species}_filtered_images")
    output_csv_file = os.path.join(output_path, f"{species}_filtered_metadata.csv")
    os.makedirs(output_image_folder, exist_ok=True)

    # CSV recreate
    df = pd.read_csv(metadata_file, encoding="latin1")
    df_filtered = df[df["species_name"] == species_name]
    df_filtered[["species_name", "observation_id"]].to_csv(output_csv_file, index=False)

    # Image copy
    observation_ids = df_filtered["observation_id"].astype(str).tolist()
    image_files = os.listdir(image_folder)
    copied = 0

    for obs_id in observation_ids:
        match_suffix = f"_{obs_id}_0.jpeg"
        matched_files = [f for f in image_files if f.endswith(match_suffix)]

        for file in matched_files:
            src = os.path.join(image_folder, file)
            dst = os.path.join(output_image_folder, file)
            try:
                shutil.copyfile(src, dst)
                copied += 1
            except Exception as e:
                print(f"Error copying {file}: {e}")

    print(f"Copied {copied} image(s) for {species}, CSV rows: {len(df_filtered)}")



Copied 4081 image(s) for Anas_platyrhynchos, CSV rows: 4081
Copied 4929 image(s) for Ardea_herodias, CSV rows: 4932
Copied 5000 image(s) for Bombycilla_cedrorum, CSV rows: 5000
Copied 4821 image(s) for Branta_canadensis, CSV rows: 4821
Copied 4823 image(s) for Bubo_virginianus, CSV rows: 4823
Copied 4952 image(s) for Buteo_jamaicensis, CSV rows: 4952
Copied 5000 image(s) for Calidris_alba, CSV rows: 5000
Copied 3990 image(s) for Cardinalis_cardinalis, CSV rows: 3990
Copied 4998 image(s) for Cathartes_aura, CSV rows: 4998
Copied 4929 image(s) for Coragyps_atratus, CSV rows: 4929
Copied 4821 image(s) for Corvus_cornix, CSV rows: 4821
Copied 4989 image(s) for Dryocopus_pileatus, CSV rows: 4989
Copied 4967 image(s) for Falco_sparverius, CSV rows: 4967
Copied 4993 image(s) for Icterus_galbula, CSV rows: 4993
Copied 4890 image(s) for Parus_major, CSV rows: 4890
Copied 4973 image(s) for Passer_domesticus, CSV rows: 4974
Copied 4999 image(s) for Quiscalus_mexicanus, CSV rows: 4999
Copied 4995 

In [2]:
##########################################################################################
#                               Data Verifying                                           #
##########################################################################################

for species in species_list:
    csv_path = os.path.join(output_path, f"{species}_filtered_metadata.csv")
    img_path = os.path.join(output_path, f"{species}_filtered_images")

    df = pd.read_csv(csv_path)
    csv_obs_ids = set(df["observation_id"].astype(str))
    image_obs_ids = set()

    for filename in os.listdir(img_path):
        if filename.endswith("_0.jpeg"):
            parts = filename.split("_")
            try:
                obs_id = parts[-2]
                image_obs_ids.add(obs_id)
            except IndexError:
                print(f"Invalid filename: {filename}")

    missing = csv_obs_ids - image_obs_ids
    extra = image_obs_ids - csv_obs_ids

    if missing:
        print(f"{species} — CSV: {len(csv_obs_ids)}, Images: {len(image_obs_ids)}, Missing: {len(missing)} → {list(missing)[:5]}{'...' if len(missing) > 5 else ''}")



Ardea_herodias — CSV: 4932, Images: 4929, Missing: 3 → ['2651283', '3284364', '97898']
Passer_domesticus — CSV: 4974, Images: 4973, Missing: 1 → ['2966935']
Turdus_migratorius — CSV: 4992, Images: 4991, Missing: 1 → ['3629725']


In [3]:
##########################################################################################
#                               Handle inaccurate part                                   #
##########################################################################################
missing_data = {
    "Ardea_herodias": ["97898", "2651283", "3284364"],
    "Passer_domesticus": ["2966935"],
    "Turdus_migratorius": ["3629725"]
}

for species, missing_ids in missing_data.items():
    csv_path = f"{output_path}/{species}_filtered_metadata.csv"
    df = pd.read_csv(csv_path)
    df_cleaned = df[~df["observation_id"].astype(str).isin(missing_ids)]
    df_cleaned.to_csv(csv_path, index=False)
    print(f"Cleaned: {species} — removed {len(missing_ids)} rows")

# Run previous block to verify there is no output

Cleaned: Ardea_herodias — removed 3 rows
Cleaned: Passer_domesticus — removed 1 rows
Cleaned: Turdus_migratorius — removed 1 rows


# After Running Cleaning Code, Should be Good to Start From Here

### Data Upload

In [2]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import random_split, DataLoader

batch_size = 32 

output_path = r"C:\Users\admin\OneDrive\Documents\filtered_species"

transform = transforms.Compose([
    transforms.CenterCrop((256, 256)),  #Resize to minimum of all sizes - Will update size in cnn architecture
    #transforms.Normalize(),
    transforms.RandomHorizontalFlip(p = 0.25),
    transforms.RandomRotation(degrees = 30),
    transforms.ToTensor(),
])

data = datasets.ImageFolder(root=output_path, transform=transform)

train_size = int(0.8 * len(data))
test_size = len(data) - train_size

training, testing = random_split(data, [train_size, test_size], generator=torch.Generator().manual_seed(1111))

training_dataset = DataLoader(training, batch_size=batch_size, shuffle=True)
testing_dataset = DataLoader(testing, batch_size=batch_size, shuffle=True)


#For testing purposes
#print("Class names:", data.classes)

In [4]:
next(iter(training_dataset))

[tensor([[[[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           ...,
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
 
          [[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           ...,
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
 
          [[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.00

### Classifier Architecture

In [20]:
import torch
import torch.nn as nn
from vit_pytorch import ViT
from vit_pytorch.vit import Transformer, Attention


class BirdClassifier(nn.Module):
    #donot adjust the patch size, thats what the cnn is for
    def __init__(self, 
                 cnn_state=False, 
                 image_size=256, 
                 patch_size = 1, 
                 num_class = 22, 
                 dim = 1056, 
                 layer_count = 1,
                 head_count = 1,
                 transformer_ff_neurons = 1056,
                 transformer_dropout = 0.1
                 ):
        super().__init__()
        self.cnn_state = cnn_state

        dummy_input = torch.zeros(1, 3, image_size, image_size)

        if self.cnn_state:
            #the following architecture is alexnet -> For test one cnn_state should be flase
            self.cnn = nn.Sequential(
                nn.ReLU(nn.Conv2d(in_channels=3, out_channels=96, kernel_size=11, stride=4, padding=2)),
                nn.ReLU(nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, padding=2)),
                nn.ReLU(nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, padding=1)),
                nn.ReLU(nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, padding=1)),
                nn.ReLU(nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, padding=1)),
            )
            with torch.no_grad():
                cnn_out = self.cnn(dummy_input)
                _, out_channels, out_height, out_width = cnn_out.shape
                vit_input_size = max(out_height, out_width)
                vit_input_channels = out_channels  
                          
        else:
            # No CNN — raw input goes to ViT
            vit_input_size = image_size
            vit_input_channels = 3

        self.vision_transformer = ViT(
            image_size = vit_input_size,
            patch_size = 1,
            num_classes = num_class,
            dim = dim, #embedding in attention
            depth = layer_count, #layers in transformer _> testing this in test 1 in proposal
            heads = head_count, #testing thos in test 1 in proposal
            mlp_dim = transformer_ff_neurons, #size of mlp try to change to see if wecan make this deeper, will likely need to imple,ent transformer by hand if so - 1056 or 512 could work
            dropout = 0.1,
            emb_dropout = 0.1,
            channels = vit_input_channels
        )

    def forward(self, x):
        if self.cnn_state:
            x = self.cnn(x)
        x = self.vision_transformer(x)

        return x
            

### Forward Pass

In [11]:
#%pip install torchmetrics
%pip install tqdm

Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.7 kB ? eta -:--:--
     ------- -------------------------------- 10.2/57.7 kB ? eta -:--:--
     -------------------- ----------------- 30.7/57.7 kB 325.1 kB/s eta 0:00:01
     --------------------------------- ---- 51.2/57.7 kB 435.7 kB/s eta 0:00:01
     -------------------------------------- 57.7/57.7 kB 431.2 kB/s eta 0:00:00
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
   ---------------------------------------- 0.0/78.5 kB ? eta -:--:--
   ------------------------------------ --- 71.7/78.5 kB 2.0 MB/s eta 0:00:01
   ---------------------------------------- 78.5/78.5 kB 2.2 MB/s eta 0:00:00
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
from torchmetrics.classification import Accuracy
from tqdm import tqdm
import numpy as np


def train_model(model, dataloader, criterion, optimizer_metric, accuracy_metric, device):
    model.train()
    net_loss = 0
    for images, labels in tqdm(dataloader, desc = "TRAINIGN"):
        image, label = images.to(device), labels.to(device)
        y_hat = model(image)
        loss = criterion(y_hat, label)
        optimizer_metric.zero_grad()
        loss.backward()
        optimizer_metric.step()
        accuracy_metric.update(y_hat, label)
        net_loss += loss.item()
    
    epoch_accuracy = accuracy_metric.compute().item()
    epoch_loss = net_loss/(len(dataloader))

    return epoch_accuracy, epoch_loss

def test_model(model, dataloader, criterion, accuracy_metric, device):
    model.eval()
    net_loss = 0
    for images, labels in tqdm(dataloader, desc = "TESTING"):
        image, label = images.to(device), labels.to(device)
        y_hat = model(image)
        loss = criterion(y_hat, label)
        accuracy_metric.update(y_hat, label)
        net_loss += loss.item()
    
    epoch_accuracy = accuracy_metric.compute().item()
    epoch_loss = net_loss/(len(dataloader))

    return epoch_accuracy, epoch_loss

In [22]:
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BirdClassifier(cnn_state=False).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
accuracy_score = Accuracy(task = 'multiclass', num_classes = 22)

In [None]:
loss_scores_train = []
accuracy_scores_train = []

for epoch in range(1):
    accuracy_score.reset()
    trained = train_model(model, dataloader = training_dataset, criterion = criterion, optimizer_metric = optimizer, accuracy_metric=accuracy_score, device = device)
    accuracy_scores_train.append(trained[0])
    loss_scores_train.append(trained[1])
    


TRAINIGN:   0%|          | 0/2671 [00:00<?, ?it/s]

### Testing ViT

set cnn_state to False

In [None]:
loss_scores_test = []
accuracy_scores_test = []

for epoch in range(1):
    accuracy_score.reset()
    trained = train_model(model, dataloader = training_dataset, criterion = criterion, optimizer_metric = optimizer, accuracy_metric=accuracy_score, device = device)
    accuracy_scores_test.append(trained[0])
    loss_scores_test.append(trained[1])
    
