In [None]:
from datasets import load_dataset

# Load the birds dataset by its repository id and print the returned object
# to see which splits it provides (later cells rely on a 'train' split).
ds = load_dataset("yashikota/birds-525-species-image-classification")
print(ds)

In [None]:
import pandas as pd
from PIL import Image
# Materialise the training split as a DataFrame and preview its first rows.
# NOTE(review): pd.DataFrame(...) has to iterate every row of the split, which
# presumably decodes every image into memory — confirm this fits in RAM, or
# build the frame from a small `.select(...)` slice if only a preview is needed.
df = pd.DataFrame(ds['train'])
df.head()

In [None]:
from IPython.display import display
# Show five training examples drawn with a fixed shuffle seed (reproducible),
# each followed by its human-readable label.
train_split = ds['train']
label_feature = train_split.features['label']
preview_rows = train_split.shuffle(seed=231).select(range(5))
for example in preview_rows:
    display(example["image"])
    print("Label: ", label_feature.int2str(example['label']))

In [None]:
# Spot check: show the image and the label name stored at training index 3924
# (expected to be an American Wigeon).
sample = ds["train"][3924]
display(sample["image"])
display(ds['train'].features["label"].int2str(sample['label']))

In [None]:
# Display the training split's feature schema; the 'label' feature is the one
# used elsewhere in this notebook to map label ids to class names.
display(ds['train'].features)

In [None]:
# Scan the training split for unusable images and collect their row indices.
# A row is recorded as bad when either
#   * its 'image' value is None, or
#   * reading the row fails with an image-decoding error.
# Per the `datasets` Image feature docs the image is decoded when the row is
# accessed, so a damaged file shows up as an exception here, not as None.
bad_images = []

train_split = ds['train']
for i in range(len(train_split)):
    try:
        image = train_split[i]['image']
    except (OSError, SyntaxError, ValueError, EOFError):
        # Unreadable / truncated / unidentifiable file: record it and keep
        # scanning instead of aborting the whole check on the first bad row.
        bad_images.append(i)
        continue
    if image is None:
        bad_images.append(i)

print(f"Total number of corrupt/null images: {len(bad_images)}")
print("First Bad Ones: ", bad_images[:25])

In [None]:
# Collect the indices of training images whose dimensions differ from the
# expected 224x224.
EXPECTED_SIZE = (224, 224)  # (width, height), the order PIL's Image.size uses

size_unmatched = []
train_split = ds['train']
for i in range(len(train_split)):
    image = train_split[i]['image']
    if image is None:
        # A missing image has no size to compare; reading `.size` on it would
        # raise AttributeError and abort the scan, so skip it here.
        continue
    if image.size != EXPECTED_SIZE:
        size_unmatched.append(i)
print(f"Amount of images not 224x224:{len(size_unmatched)}")
print(size_unmatched[:20])

In [None]:
# Look at up to ten of the images flagged as not having the default size, to
# get a feel for how they differ.
mismatch_preview = size_unmatched[:10]
for row_idx in mismatch_preview:
    display(ds['train'][row_idx]['image'])

In [None]:
import torch
import torchvision.models as models
from torchvision.models import ResNet18_Weights
from PIL import Image

In [None]:
# Inspect the first training image: render it, then report its dimensions,
# file format and colour mode as exposed by the image object.
first_image = ds['train'][0]['image']
image_size, image_format, image_mode = (
    first_image.size,
    first_image.format,
    first_image.mode,
)

display(first_image)
print(f"Image size (width, height): {image_size}")
print(f"Image format: {image_format}")
print(f"Image mode: {image_mode}")

In [None]:
# Set up a ResNet-18 feature extractor from the default pretrained weights.
weights = ResNet18_Weights.DEFAULT
transform = weights.transforms()  # preprocessing that ships with these weights

model = models.resnet18(weights=weights)
model.fc = torch.nn.Identity()  # replace the final layer so forward() returns features
model.eval()  # evaluation mode: we only run inference with this model

def get_embedding(img):
    """Return the feature vector the module-level ``model`` produces for one image.

    Args:
        img: The image to embed. PIL-style images (objects exposing ``mode``
            and ``convert``) that are not already in "RGB" mode are converted
            to RGB first; any other input accepted by ``transform`` (for
            example a tensor) is passed through untouched.

    Returns:
        A NumPy array holding the model output for a batch of one, with all
        singleton dimensions squeezed away.
    """
    # Grayscale, palette, RGBA or CMYK images do not have the three channels
    # the preprocessing/model pair is set up for, so normalise the mode first.
    convert = getattr(img, "convert", None)
    if callable(convert) and img.mode != "RGB":
        img = convert("RGB")

    x = transform(img).unsqueeze(0)  # add the batch dimension
    with torch.no_grad():  # inference only, no autograd bookkeeping
        emb = model(x).squeeze().numpy()
    return emb

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from PIL import Image

# Near-duplicate demo: embed two training images with get_embedding and compare
# the embeddings by cosine similarity.
IDX_A, IDX_B = 0, 3          # rows of the training split being compared
DUPLICATE_THRESHOLD = 0.90   # heuristic cut-off; tune before using it at scale

image1 = ds['train'][IDX_A]['image']
image2 = ds['train'][IDX_B]['image']

print("Image 1:")
display(image1)
print("\nImage 2:")
display(image2)

# cosine_similarity works on 2-D (n_samples, n_features) inputs, so each
# embedding is reshaped into a single-row matrix.
emb1 = get_embedding(image1).reshape(1, -1)
emb2 = get_embedding(image2).reshape(1, -1)

similarity = cosine_similarity(emb1, emb2)[0][0]

# Report the indices actually compared (they are not the first two rows).
print(f"\nSimilarity between training images {IDX_A} and {IDX_B}: {similarity}")

if similarity > DUPLICATE_THRESHOLD:
    print("duplicates")
else:
    print("not duplicates.")



In [None]:
# Class imbalance: count how many training images each species has, so the
# counts can be filtered or weighted later.
# Only the training split is counted, because it is the split the model
# learns from.
from collections import Counter

values = ds["train"]["label"]
counts = Counter(values)  # label id -> number of training images
labels = ds["train"].features["label"].names  # species names, indexed by label id

# Build the mapping by walking the label names in id order. A Counter yields
# its items in first-seen order and omits ids that never occur, so iterating
# `counts.items()` gives neither a stable order nor an entry for empty classes.
# Indexing the Counter returns 0 for an id with no images.
countsSpecies = {name: counts[i] for i, name in enumerate(labels)}

In [None]:
# Class weights for a weighted cross-entropy loss, so that species with fewer
# training images count more. This suits moderately imbalanced datasets;
# undersampling is avoided because it would throw training data away.

# for PyTorch
import torch

# Number of training images per class, in label-id order.
classCounts = torch.tensor([counts[i] for i in range(len(labels))], dtype=torch.float)

# Balanced weighting: weight_k = n / (c * count_k), where n is the total number
# of images and c the number of classes.
n = classCounts.sum()
c = len(classCounts)
# Clamp the denominator so a class with zero training images gets a finite
# weight instead of inf (which would turn the loss into inf/nan). Classes with
# at least one image are unaffected.
classWeights = n / (c * classCounts.clamp(min=1))

In [None]:
# Define the training loss: cross-entropy weighted by classWeights, so that
# mistakes on species with fewer pictures are penalised more heavily than
# mistakes on species with more pictures.
import torch.nn as nn
criterion=nn.CrossEntropyLoss(weight=classWeights)
# Everything above is meant to run before the classifier is created and trained.
# NOTE(review): the weight tensor presumably has to be on the same device as the
# model outputs — confirm, and move classWeights if training runs on a GPU.