In [1]:
# General Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Image Processing
from PIL import Image
import cv2
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0

# Text Processing
import re
import nltk
from nltk.corpus import stopwords
from transformers import BertTokenizer, TFBertModel

# Machine Learning and Evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Data Augmentation
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from albumentations import Compose, RandomCrop, HorizontalFlip, Normalize

# Miscellaneous
import os
from tqdm import tqdm
import glob
import random
from collections import defaultdict

from keras.applications.densenet import preprocess_input, DenseNet121
from keras.models import Model
from keras.layers import GlobalAveragePooling2D, Input


1.1. **Завантаження даних**

- Ознайомтесь із структурою датасету (зображення, текстові описи, цільова змінна).
- Визначте кількість пропущених даних, типи змінних (категоріальні, числові, текстові).

In [2]:
text_train_data = pd.read_csv('data/train.csv')
text_test_data = pd.read_csv('data/test.csv')

print(f"Number of missing TRAIN descriptions: {text_train_data['Description'].isnull().sum()}")
print(f"Number of missing TEST descriptions: {text_test_data['Description'].isnull().sum()}")

print(f"Number of unique TRAIN PetIDs: {text_train_data['PetID'].nunique()}")
print(f"Number of unique TEST PetIDs: {text_test_data['PetID'].nunique()}")

Number of missing TRAIN descriptions: 5
Number of missing TEST descriptions: 1
Number of unique TRAIN PetIDs: 6431
Number of unique TEST PetIDs: 1891


In [3]:
image_test_dir = 'data/images/images/test'
image_train_dir = 'data/images/images/train'

image_train_files = os.listdir(image_train_dir)
image_test_files = os.listdir(image_test_dir)

In [4]:
image_train_ids = [filename.split('-')[0] for filename in image_train_files]
image_test_ids = [filename.split('-')[0] for filename in image_test_files]

unique_train_image_pet_ids = set(image_train_ids)
print(f"Number of unique train PetIDs in images: {len(unique_train_image_pet_ids)}")

unique_test_image_pet_ids = set(image_test_ids)
print(f"Number of unique train PetIDs in images: {len(unique_test_image_pet_ids)}")

Number of unique train PetIDs in images: 6431
Number of unique train PetIDs in images: 1899


In [5]:
text_train_pet_ids = set(text_train_data['PetID'])
text_test_pet_ids = set(text_test_data['PetID'])

# Find PetIDs with images but no descriptions
pet_train_ids_with_images_only = unique_train_image_pet_ids - text_train_pet_ids
print(f"PetIDs TRAIN with images but no descriptions: {len(pet_train_ids_with_images_only)}")

# Find PetIDs with images but no descriptions
pet_test_ids_with_images_only = unique_test_image_pet_ids - text_test_pet_ids
print(f"PetIDs TEST with images but no descriptions: {len(pet_test_ids_with_images_only)}")

# Find PetIDs with descriptions but no images
pet_train_ids_with_descriptions_only = text_train_pet_ids - text_train_pet_ids
print(f"PetIDs TRAIN with descriptions but no images: {len(pet_train_ids_with_descriptions_only)}")

# Find PetIDs with descriptions but no images
pet_test_ids_with_descriptions_only = text_test_pet_ids - text_test_pet_ids
print(f"PetIDs TEST with descriptions but no images: {len(pet_test_ids_with_descriptions_only)}")

PetIDs TRAIN with images but no descriptions: 0
PetIDs TEST with images but no descriptions: 12
PetIDs TRAIN with descriptions but no images: 0
PetIDs TEST with descriptions but no images: 0


In [6]:
# Convert text PetIDs to sets
text_train_pet_ids = set(text_train_data['PetID'])
text_test_pet_ids = set(text_test_data['PetID'])

# Find PetIDs with images but no descriptions
train_image_ids_no_description = unique_train_image_pet_ids - text_train_pet_ids
test_image_ids_no_description = unique_test_image_pet_ids - text_test_pet_ids

# Output the results
print(f"Train images with no descriptions: {len(train_image_ids_no_description)}")
print(f"Test images with no descriptions: {len(test_image_ids_no_description)}")
print(f"IDs of Train images with no descriptions: {train_image_ids_no_description}")
print(f"IDs of Test images with no descriptions: {test_image_ids_no_description}")

Train images with no descriptions: 0
Test images with no descriptions: 12
IDs of Train images with no descriptions: set()
IDs of Test images with no descriptions: {'2689341e7', '2514503e7', '063521459', '515462e67', '035992662', '7759517e2', '670535e94', '554965e66', '081301773', '02126e289', '867057e77', '095314294'}


## Imputation

In [7]:
# Use the recommended approach to fill missing values
text_train_data['Description'] = text_train_data['Description'].fillna('No description provided')
text_test_data['Description'] = text_test_data['Description'].fillna('No description provided')


In [8]:
# List of missing PetIDs (already inspected as relevant)
missing_test_ids = list(test_image_ids_no_description)

# Create a DataFrame with placeholder descriptions
missing_test_data = pd.DataFrame({
    'PetID': missing_test_ids,
    'Description': ['No description provided'] * len(missing_test_ids)
})

# Append the new data to the test dataset
text_test_data = pd.concat([text_test_data, missing_test_data], ignore_index=True)

# Verify the updated dataset
print(f"Updated test data: {len(text_test_data)} rows")
print(f"Number of missing descriptions: {text_test_data['Description'].isnull().sum()}")


Updated test data: 1903 rows
Number of missing descriptions: 0


1.2. **Дослідження зображень**

- Перегляньте приклади фотографій тварин, їх розмір, роздільну здатність.
- Визначте, чи є декілька зображень для одного об’єкта (агрегація може знадобитися).

In [9]:
# List all image files
image_files = os.listdir(image_test_dir)

# Randomly select 5 images for inspection
sample_images = random.sample(image_files, 5)

# Display the images with their details
for image_file in sample_images:
    image_path = os.path.join(image_test_dir, image_file)
    with Image.open(image_path) as img:
        print(f"Image: {image_file}")
        print(f"Size: {img.size}, Format: {img.format}, Mode: {img.mode}")
        img.show()  # This opens the image in the default viewer

Image: 640dbb3b6-4.jpg
Size: (640, 428), Format: JPEG, Mode: RGB
Image: 9cda44cdf-7.jpg
Size: (400, 300), Format: JPEG, Mode: RGB
Image: 33f89cb83-2.jpg
Size: (640, 478), Format: JPEG, Mode: RGB
Image: a06a2ee4e-5.jpg
Size: (400, 343), Format: JPEG, Mode: RGB
Image: 737413132-1.jpg
Size: (300, 400), Format: JPEG, Mode: RGB


In [10]:
# Initialize lists to store image dimensions
widths, heights = [], []

for image_file in image_files:
    image_path = os.path.join(image_test_dir, image_file)
    with Image.open(image_path) as img:
        widths.append(img.size[0])
        heights.append(img.size[1])

# Calculate statistics
print(f"Number of images: {len(image_files)}")
print(f"Average width: {sum(widths) / len(widths):.2f}")
print(f"Average height: {sum(heights) / len(heights):.2f}")
print(f"Minimum resolution: {min(widths)}x{min(heights)}")
print(f"Maximum resolution: {max(widths)}x{max(heights)}")

Number of images: 9448
Average width: 402.79
Average height: 390.01
Minimum resolution: 72x35
Maximum resolution: 1792x3184


In [11]:
# Create a dictionary to count images per PetID
image_count_per_pet = defaultdict(int)

for image_file in image_train_files:
    pet_id = image_file.split('-')[0]  # Extract the PetID
    image_count_per_pet[pet_id] += 1

# Count the number of PetIDs with multiple images
multiple_images = {pet_id: count for pet_id, count in image_count_per_pet.items() if count > 1}

print(f"Total PetIDs: {len(image_count_per_pet)}")
print(f"Number of PetIDs with multiple images: {len(multiple_images)}")

# Display some examples of multiple images per PetID
for pet_id, count in list(multiple_images.items())[:5]:
    print(f"PetID: {pet_id}, Number of images: {count}")


Total PetIDs: 6431
Number of PetIDs with multiple images: 5307
PetID: 2d725d001, Number of images: 3
PetID: a63364c39, Number of images: 6
PetID: ea055de86, Number of images: 26
PetID: 0db65104a, Number of images: 5
PetID: 76024f2ed, Number of images: 5


In [12]:
# Create a dictionary to count images per PetID
image_count_per_pet = defaultdict(int)

for image_file in image_test_files:
    pet_id = image_file.split('-')[0]  # Extract the PetID
    image_count_per_pet[pet_id] += 1

# Count the number of PetIDs with multiple images
multiple_images = {pet_id: count for pet_id, count in image_count_per_pet.items() if count > 1}

print(f"Total PetIDs: {len(image_count_per_pet)}")
print(f"Number of PetIDs with multiple images: {len(multiple_images)}")

# Display some examples of multiple images per PetID
for pet_id, count in list(multiple_images.items())[:5]:
    print(f"PetID: {pet_id}, Number of images: {count}")

Total PetIDs: 1899
Number of PetIDs with multiple images: 1687
PetID: bf9bd91e1, Number of images: 8
PetID: 16ffedcf8, Number of images: 7
PetID: 7a12a494f, Number of images: 9
PetID: d1870f34b, Number of images: 3
PetID: 856005eae, Number of images: 5


## Агрегація

In [14]:
# Initialize DenseNet121 model for feature extraction
inp = Input((256, 256, 3))
backbone = DenseNet121(input_tensor=inp, include_top=False)
x = backbone.output
x = GlobalAveragePooling2D()(x)
m = Model(inp, x)

# Process Train Images
train_pet_ids = text_train_data['PetID'].unique()
train_aggregated_features = extract_and_aggregate_features(image_train_files, train_pet_ids)

# Save Train Features
train_feats_df = pd.DataFrame.from_dict(train_aggregated_features, orient='index')
train_feats_df.to_csv('train_img_features_aggregated.csv', index_label='PetID')

# Process Test Images
test_pet_ids = text_test_data['PetID'].unique()
test_aggregated_features = extract_and_aggregate_features(image_test_files, test_pet_ids)

# Save Test Features
test_feats_df = pd.DataFrame.from_dict(test_aggregated_features, orient='index')
test_feats_df.to_csv('test_img_features_aggregated.csv', index_label='PetID')

Processing images:   2%|▏         | 108/6431 [00:00<00:11, 538.61it/s]IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

Processing images:   3%|▎         | 215/6431 [00:00<00:16, 368.29it/s]IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

Processing images:   5%|▌         | 349/6431 [00:01<00:18, 325.83it/s]IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit,

In [15]:
# Load train and test embeddings
train_embeddings = pd.read_csv('train_img_features_aggregated.csv', index_col='PetID')
test_embeddings = pd.read_csv('test_img_features_aggregated.csv', index_col='PetID')

train_embeddings.shape
test_embeddings.shape

(0, 0)