In [1]:
# General Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Image Processing
from PIL import Image
import cv2
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0

# Text Processing
import re
import nltk
from nltk.corpus import stopwords
from transformers import BertTokenizer, TFBertModel

# Machine Learning and Evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Data Augmentation
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from albumentations import Compose, RandomCrop, HorizontalFlip, Normalize

# Miscellaneous
import os
from tqdm import tqdm
import glob
import random
from collections import defaultdict

from keras.applications.densenet import preprocess_input, DenseNet121
from keras.models import Model
from keras.layers import GlobalAveragePooling2D, Input
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


1.1. **Завантаження даних**

- Ознайомтесь із структурою датасету (зображення, текстові описи, цільова змінна).
- Визначте кількість пропущених даних, типи змінних (категоріальні, числові, текстові).

In [2]:
text_train_data = pd.read_csv('data/train.csv')
text_test_data = pd.read_csv('data/test.csv')

print(f"Number of missing TRAIN descriptions: {text_train_data['Description'].isnull().sum()}")
print(f"Number of missing TEST descriptions: {text_test_data['Description'].isnull().sum()}")

print(f"Number of unique TRAIN PetIDs: {text_train_data['PetID'].nunique()}")
print(f"Number of unique TEST PetIDs: {text_test_data['PetID'].nunique()}")

Number of missing TRAIN descriptions: 5
Number of missing TEST descriptions: 1
Number of unique TRAIN PetIDs: 6431
Number of unique TEST PetIDs: 1891


In [3]:
image_train_dir = 'data/images/images/train'
image_test_dir = 'data/images/images/test'

image_train_files = os.listdir(image_train_dir)
image_test_files = os.listdir(image_test_dir)

In [4]:
image_train_ids = [filename.split('-')[0] for filename in image_train_files]
image_test_ids = [filename.split('-')[0] for filename in image_test_files]

unique_train_image_pet_ids = set(image_train_ids)
print(f"Number of unique train PetIDs in images: {len(unique_train_image_pet_ids)}")

unique_test_image_pet_ids = set(image_test_ids)
print(f"Number of unique train PetIDs in images: {len(unique_test_image_pet_ids)}")

Number of unique train PetIDs in images: 6431
Number of unique train PetIDs in images: 1899


In [5]:
text_train_pet_ids = set(text_train_data['PetID'])
text_test_pet_ids = set(text_test_data['PetID'])

# Find PetIDs with images but no descriptions
pet_train_ids_with_images_only = unique_train_image_pet_ids - text_train_pet_ids
print(f"PetIDs TRAIN with images but no descriptions: {len(pet_train_ids_with_images_only)}")

# Find PetIDs with images but no descriptions
pet_test_ids_with_images_only = unique_test_image_pet_ids - text_test_pet_ids
print(f"PetIDs TEST with images but no descriptions: {len(pet_test_ids_with_images_only)}")

# Find PetIDs with descriptions but no images
pet_train_ids_with_descriptions_only = text_train_pet_ids - text_train_pet_ids
print(f"PetIDs TRAIN with descriptions but no images: {len(pet_train_ids_with_descriptions_only)}")

# Find PetIDs with descriptions but no images
pet_test_ids_with_descriptions_only = text_test_pet_ids - text_test_pet_ids
print(f"PetIDs TEST with descriptions but no images: {len(pet_test_ids_with_descriptions_only)}")

PetIDs TRAIN with images but no descriptions: 0
PetIDs TEST with images but no descriptions: 12
PetIDs TRAIN with descriptions but no images: 0
PetIDs TEST with descriptions but no images: 0


In [6]:
# Convert text PetIDs to sets
text_train_pet_ids = set(text_train_data['PetID'])
text_test_pet_ids = set(text_test_data['PetID'])

# Find PetIDs with images but no descriptions
train_image_ids_no_description = unique_train_image_pet_ids - text_train_pet_ids
test_image_ids_no_description = unique_test_image_pet_ids - text_test_pet_ids

# Output the results
print(f"Train images with no descriptions: {len(train_image_ids_no_description)}")
print(f"Test images with no descriptions: {len(test_image_ids_no_description)}")
print(f"IDs of Train images with no descriptions: {train_image_ids_no_description}")
print(f"IDs of Test images with no descriptions: {test_image_ids_no_description}")

Train images with no descriptions: 0
Test images with no descriptions: 12
IDs of Train images with no descriptions: set()
IDs of Test images with no descriptions: {'035992662', '2689341e7', '081301773', '515462e67', '02126e289', '2514503e7', '554965e66', '095314294', '670535e94', '867057e77', '7759517e2', '063521459'}


## Imputation

In [7]:
# Use the recommended approach to fill missing values
text_train_data['Description'] = text_train_data['Description'].fillna('No description provided')
text_test_data['Description'] = text_test_data['Description'].fillna('No description provided')


In [8]:
# List of missing PetIDs (already inspected as relevant)
missing_test_ids = list(test_image_ids_no_description)

# Create a DataFrame with placeholder descriptions
missing_test_data = pd.DataFrame({
    'PetID': missing_test_ids,
    'Description': ['No description provided'] * len(missing_test_ids)
})

# Append the new data to the test dataset
text_test_data = pd.concat([text_test_data, missing_test_data], ignore_index=True)

# Verify the updated dataset
print(f"Updated test data: {len(text_test_data)} rows")
print(f"Number of missing descriptions: {text_test_data['Description'].isnull().sum()}")


Updated test data: 1903 rows
Number of missing descriptions: 0


1.2. **Дослідження зображень**

- Перегляньте приклади фотографій тварин, їх розмір, роздільну здатність.
- Визначте, чи є декілька зображень для одного об’єкта (агрегація може знадобитися).

In [9]:
# List all image files
image_files = os.listdir(image_test_dir)

# Randomly select 5 images for inspection
sample_images = random.sample(image_files, 5)

# Display the images with their details
for image_file in sample_images:
    image_path = os.path.join(image_test_dir, image_file)
    with Image.open(image_path) as img:
        print(f"Image: {image_file}")
        print(f"Size: {img.size}, Format: {img.format}, Mode: {img.mode}")
        img.show()  # This opens the image in the default viewer

Image: 2d72ef0c4-2.jpg
Size: (360, 480), Format: JPEG, Mode: RGB
Image: 47ad7a82a-1.jpg
Size: (384, 480), Format: JPEG, Mode: RGB
Image: 7fe0a86f8-2.jpg
Size: (300, 400), Format: JPEG, Mode: RGB
Image: cc25fb529-1.jpg
Size: (266, 400), Format: JPEG, Mode: RGB
Image: 7d835cf7c-5.jpg
Size: (399, 263), Format: JPEG, Mode: RGB


In [10]:
# Initialize lists to store image dimensions
widths, heights = [], []

for image_file in image_files:
    image_path = os.path.join(image_test_dir, image_file)
    with Image.open(image_path) as img:
        widths.append(img.size[0])
        heights.append(img.size[1])

# Calculate statistics
print(f"Number of images: {len(image_files)}")
print(f"Average width: {sum(widths) / len(widths):.2f}")
print(f"Average height: {sum(heights) / len(heights):.2f}")
print(f"Minimum resolution: {min(widths)}x{min(heights)}")
print(f"Maximum resolution: {max(widths)}x{max(heights)}")

Number of images: 9448
Average width: 402.79
Average height: 390.01
Minimum resolution: 72x35
Maximum resolution: 1792x3184


In [11]:
# Create a dictionary to count images per PetID
image_count_per_pet = defaultdict(int)

for image_file in image_train_files:
    pet_id = image_file.split('-')[0]  # Extract the PetID
    image_count_per_pet[pet_id] += 1

# Count the number of PetIDs with multiple images
multiple_images = {pet_id: count for pet_id, count in image_count_per_pet.items() if count > 1}

print(f"Total PetIDs: {len(image_count_per_pet)}")
print(f"Number of PetIDs with multiple images: {len(multiple_images)}")

# Display some examples of multiple images per PetID
for pet_id, count in list(multiple_images.items())[:5]:
    print(f"PetID: {pet_id}, Number of images: {count}")


Total PetIDs: 6431
Number of PetIDs with multiple images: 5307
PetID: 2d725d001, Number of images: 3
PetID: a63364c39, Number of images: 6
PetID: ea055de86, Number of images: 26
PetID: 0db65104a, Number of images: 5
PetID: 76024f2ed, Number of images: 5


In [12]:
# Create a dictionary to count images per PetID
image_count_per_pet = defaultdict(int)

for image_file in image_test_files:
    pet_id = image_file.split('-')[0]  # Extract the PetID
    image_count_per_pet[pet_id] += 1

# Count the number of PetIDs with multiple images
multiple_images = {pet_id: count for pet_id, count in image_count_per_pet.items() if count > 1}

print(f"Total PetIDs: {len(image_count_per_pet)}")
print(f"Number of PetIDs with multiple images: {len(multiple_images)}")

# Display some examples of multiple images per PetID
for pet_id, count in list(multiple_images.items())[:5]:
    print(f"PetID: {pet_id}, Number of images: {count}")

Total PetIDs: 1899
Number of PetIDs with multiple images: 1687
PetID: bf9bd91e1, Number of images: 8
PetID: 16ffedcf8, Number of images: 7
PetID: 7a12a494f, Number of images: 9
PetID: d1870f34b, Number of images: 3
PetID: 856005eae, Number of images: 5


## Агрегація

In [13]:
class ImageDataset(Dataset):
    def __init__(self, pet_ids, image_dir, img_size=256, transform=None):
        self.pet_ids = pet_ids
        self.image_dir = image_dir
        self.img_size = img_size
        self.transform = transform

    def __len__(self):
        return len(self.pet_ids)

    def __getitem__(self, idx):
        pet_id = self.pet_ids[idx]
        file_path = f"{self.image_dir}/{pet_id}-1.jpg"

        if not os.path.exists(file_path):
            return pet_id, None  # Ensure exactly two elements are returned
        
        image = cv2.imread(file_path)
        if image is None:
            return pet_id, None  # Ensure exactly two elements are returned

        # Resize and pad the image
        image = resize_to_square(image, self.img_size)
        
        # Apply optional transforms
        if self.transform:
            image = self.transform(image)

        return pet_id, image  # Return a tuple of pet_id and image



In [14]:
def resize_to_square(im, img_size=256):
    old_size = im.shape[:2]
    ratio = float(img_size) / max(old_size)
    new_size = tuple([int(x * ratio) for x in old_size])
    im = cv2.resize(im, (new_size[1], new_size[0]))
    delta_w = img_size - new_size[1]
    delta_h = img_size - new_size[0]
    top, bottom = delta_h // 2, delta_h - (delta_h // 2)
    left, right = delta_w // 2, delta_w - (delta_w // 2)
    color = [0, 0, 0]
    return cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)

def extract_features(data_loader, model, output_file):
    features = {}  # Dictionary to store features

    for batch in tqdm(data_loader, desc="Processing batches"):
        valid_pet_ids, valid_images = [], []

        # Process each tuple in the batch
        for pet_id, image in batch:
            if image is not None:  # Filter out invalid images
                valid_pet_ids.append(pet_id)
                valid_images.append(image)

        if not valid_images:  # Skip if no valid images in the batch
            continue

        # Stack images into a batch and preprocess
        batch_images = np.array(valid_images)  # Convert to NumPy array
        batch_images = preprocess_input(batch_images)  # Preprocess for DenseNet

        # Extract embeddings
        batch_embeddings = model.predict(batch_images)

        # Save embeddings in the dictionary
        for pet_id, embedding in zip(valid_pet_ids, batch_embeddings):
            features[pet_id] = embedding

    # Convert features dictionary to DataFrame and save
    features_df = pd.DataFrame.from_dict(features, orient="index")
    features_df.to_csv(output_file, index_label="PetID")
    print("Extracting features has been finished")
    

# Custom collate function to return individual pairs
def collate_fn(batch):
    return [(item[0], item[1]) for item in batch]

In [15]:
# Train DataLoader
train_pet_ids = text_train_data['PetID'].unique()
train_dataset = ImageDataset(pet_ids=train_pet_ids, image_dir=image_train_dir)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=False, num_workers=0, collate_fn=collate_fn)

# Test DataLoader
test_pet_ids = text_test_data['PetID'].unique()
test_dataset = ImageDataset(pet_ids=test_pet_ids, image_dir=image_test_dir)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=0, collate_fn=collate_fn)


In [16]:
# Create DenseNet model
inp = Input((256, 256, 3))
backbone = DenseNet121(input_tensor=inp, include_top=False)
x = GlobalAveragePooling2D()(backbone.output)
m = Model(inp, x)

In [17]:
# Extract and save train embeddings
extract_features(train_loader, m, "train_img_features.csv")

# Extract and save test embeddings
extract_features(test_loader, m, "test_img_features.csv")

Processing batches:   0%|          | 0/402 [00:00<?, ?it/s]2024-12-15 18:33:47.938156: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




Processing batches:   0%|          | 1/402 [00:01<07:20,  1.10s/it]



Processing batches:   0%|          | 2/402 [00:01<04:32,  1.47it/s]



Processing batches:   1%|          | 3/402 [00:01<03:36,  1.84it/s]



Processing batches:   1%|          | 4/402 [00:02<03:11,  2.08it/s]



Processing batches:   1%|          | 5/402 [00:02<02:58,  2.23it/s]



Processing batches:   1%|▏         | 6/402 [00:03<02:48,  2.34it/s]



Processing batches:   2%|▏         | 7/402 [00:03<02:42,  2.43it/s]



Processing batches:   2%|▏         | 8/402 [00:03<02:39,  2.47it/s]



Processing batches:   2%|▏         | 9/402 [00:04<02:37,  2.50it/s]



Processing batches:   2%|▏         | 10/402 [00:04<02:39,  2.46it/s]



Processing batches:   3%|▎         | 11/402 [00:05<02:40,  2.43it/s]



Processing batches:   3%|▎         | 12/402 [00:05<02:42,  2.39it/s]



Processing batches:   3%|▎         | 13/402 [00:05<02:40,  2.43it/s]



Processing batches:   3%|▎         | 14/402 [00:06<02:40,  2.41it/s]



Processing batches:   4%|▎         | 15/402 [00:06<02:41,  2.40it/s]



Processing batches:   4%|▍         | 16/402 [00:07<02:38,  2.44it/s]



Processing batches:   4%|▍         | 17/402 [00:07<02:35,  2.48it/s]



Processing batches:   4%|▍         | 18/402 [00:07<02:33,  2.49it/s]



Processing batches:   5%|▍         | 19/402 [00:08<02:34,  2.47it/s]



Processing batches:   5%|▍         | 20/402 [00:08<02:37,  2.43it/s]



Processing batches:   5%|▌         | 21/402 [00:09<02:38,  2.40it/s]



Processing batches:   5%|▌         | 22/402 [00:09<02:37,  2.42it/s]



Processing batches:   6%|▌         | 23/402 [00:09<02:36,  2.41it/s]



Processing batches:   6%|▌         | 24/402 [00:10<02:37,  2.39it/s]



Processing batches:   6%|▌         | 25/402 [00:10<02:38,  2.38it/s]



Processing batches:   6%|▋         | 26/402 [00:11<02:35,  2.41it/s]



Processing batches:   7%|▋         | 27/402 [00:11<02:37,  2.38it/s]



Processing batches:   7%|▋         | 28/402 [00:12<02:38,  2.36it/s]



Processing batches:   7%|▋         | 29/402 [00:12<02:36,  2.39it/s]



Processing batches:   7%|▋         | 30/402 [00:12<02:35,  2.39it/s]



Processing batches:   8%|▊         | 31/402 [00:13<02:33,  2.41it/s]



Processing batches:   8%|▊         | 32/402 [00:13<02:31,  2.44it/s]



Processing batches:   8%|▊         | 33/402 [00:14<02:28,  2.48it/s]



Processing batches:   8%|▊         | 34/402 [00:14<02:28,  2.48it/s]



Processing batches:   9%|▊         | 35/402 [00:14<02:27,  2.48it/s]



Processing batches:   9%|▉         | 36/402 [00:15<02:26,  2.49it/s]



Processing batches:   9%|▉         | 37/402 [00:15<02:26,  2.49it/s]



Processing batches:   9%|▉         | 38/402 [00:16<02:26,  2.48it/s]



Processing batches:  10%|▉         | 39/402 [00:16<02:25,  2.49it/s]



Processing batches:  10%|▉         | 40/402 [00:16<02:25,  2.49it/s]



Processing batches:  10%|█         | 41/402 [00:17<02:24,  2.50it/s]



Processing batches:  10%|█         | 42/402 [00:17<02:27,  2.45it/s]



Processing batches:  11%|█         | 43/402 [00:18<02:27,  2.43it/s]



Processing batches:  11%|█         | 44/402 [00:18<02:29,  2.39it/s]



Processing batches:  11%|█         | 45/402 [00:19<02:32,  2.34it/s]



Processing batches:  11%|█▏        | 46/402 [00:19<02:33,  2.32it/s]



Processing batches:  12%|█▏        | 47/402 [00:19<02:35,  2.29it/s]



Processing batches:  12%|█▏        | 48/402 [00:20<02:36,  2.27it/s]



Processing batches:  12%|█▏        | 49/402 [00:20<02:35,  2.26it/s]



Processing batches:  12%|█▏        | 50/402 [00:21<02:35,  2.27it/s]



Processing batches:  13%|█▎        | 51/402 [00:21<02:33,  2.29it/s]



Processing batches:  13%|█▎        | 52/402 [00:22<02:31,  2.31it/s]



Processing batches:  13%|█▎        | 53/402 [00:22<02:27,  2.36it/s]



Processing batches:  13%|█▎        | 54/402 [00:22<02:25,  2.39it/s]



Processing batches:  14%|█▎        | 55/402 [00:23<02:23,  2.42it/s]



Processing batches:  14%|█▍        | 56/402 [00:23<02:23,  2.41it/s]



Processing batches:  14%|█▍        | 57/402 [00:24<02:23,  2.41it/s]



Processing batches:  14%|█▍        | 58/402 [00:24<02:22,  2.41it/s]



Processing batches:  15%|█▍        | 59/402 [00:24<02:21,  2.42it/s]



Processing batches:  15%|█▍        | 60/402 [00:25<02:22,  2.40it/s]



Processing batches:  15%|█▌        | 61/402 [00:25<02:21,  2.41it/s]



Processing batches:  15%|█▌        | 62/402 [00:26<02:20,  2.41it/s]



Processing batches:  16%|█▌        | 63/402 [00:26<02:20,  2.42it/s]



Processing batches:  16%|█▌        | 64/402 [00:27<02:18,  2.44it/s]



Processing batches:  16%|█▌        | 65/402 [00:27<02:17,  2.45it/s]



Processing batches:  16%|█▋        | 66/402 [00:27<02:16,  2.46it/s]



Processing batches:  17%|█▋        | 67/402 [00:28<02:16,  2.45it/s]



Processing batches:  17%|█▋        | 68/402 [00:28<02:16,  2.45it/s]



Processing batches:  17%|█▋        | 69/402 [00:29<02:15,  2.45it/s]



Processing batches:  17%|█▋        | 70/402 [00:29<02:15,  2.46it/s]



Processing batches:  18%|█▊        | 71/402 [00:29<02:14,  2.46it/s]



Processing batches:  18%|█▊        | 72/402 [00:30<02:15,  2.44it/s]



Processing batches:  18%|█▊        | 73/402 [00:30<02:15,  2.43it/s]



Processing batches:  18%|█▊        | 74/402 [00:31<02:15,  2.43it/s]



Processing batches:  19%|█▊        | 75/402 [00:31<02:13,  2.45it/s]



Processing batches:  19%|█▉        | 76/402 [00:31<02:11,  2.48it/s]



Processing batches:  19%|█▉        | 77/402 [00:32<02:09,  2.50it/s]



Processing batches:  19%|█▉        | 78/402 [00:32<02:10,  2.49it/s]



Processing batches:  20%|█▉        | 79/402 [00:33<02:10,  2.47it/s]



Processing batches:  20%|█▉        | 80/402 [00:33<02:10,  2.46it/s]



Processing batches:  20%|██        | 81/402 [00:33<02:10,  2.46it/s]



Processing batches:  20%|██        | 82/402 [00:34<02:11,  2.44it/s]



Processing batches:  21%|██        | 83/402 [00:34<02:10,  2.44it/s]



Processing batches:  21%|██        | 84/402 [00:35<02:10,  2.45it/s]



Processing batches:  21%|██        | 85/402 [00:35<02:09,  2.45it/s]



Processing batches:  21%|██▏       | 86/402 [00:36<02:09,  2.44it/s]



Processing batches:  22%|██▏       | 87/402 [00:36<02:08,  2.46it/s]



Processing batches:  22%|██▏       | 88/402 [00:36<02:06,  2.48it/s]



Processing batches:  22%|██▏       | 89/402 [00:37<02:07,  2.45it/s]



Processing batches:  22%|██▏       | 90/402 [00:37<02:07,  2.44it/s]



Processing batches:  23%|██▎       | 91/402 [00:38<02:07,  2.44it/s]



Processing batches:  23%|██▎       | 92/402 [00:38<02:06,  2.45it/s]



Processing batches:  23%|██▎       | 93/402 [00:38<02:05,  2.47it/s]



Processing batches:  23%|██▎       | 94/402 [00:39<02:04,  2.47it/s]



Processing batches:  24%|██▎       | 95/402 [00:39<02:05,  2.45it/s]



Processing batches:  24%|██▍       | 96/402 [00:40<02:04,  2.47it/s]



Processing batches:  24%|██▍       | 97/402 [00:40<02:03,  2.47it/s]



Processing batches:  24%|██▍       | 98/402 [00:40<02:01,  2.49it/s]



Processing batches:  25%|██▍       | 99/402 [00:41<02:01,  2.50it/s]



Processing batches:  25%|██▍       | 100/402 [00:41<02:01,  2.49it/s]



Processing batches:  25%|██▌       | 101/402 [00:42<02:00,  2.50it/s]



Processing batches:  25%|██▌       | 102/402 [00:42<02:00,  2.50it/s]



Processing batches:  26%|██▌       | 103/402 [00:42<01:59,  2.50it/s]



Processing batches:  26%|██▌       | 104/402 [00:43<02:00,  2.48it/s]



Processing batches:  26%|██▌       | 105/402 [00:43<01:59,  2.49it/s]



Processing batches:  26%|██▋       | 106/402 [00:44<01:59,  2.47it/s]



Processing batches:  27%|██▋       | 107/402 [00:44<01:58,  2.48it/s]



Processing batches:  27%|██▋       | 108/402 [00:44<01:57,  2.50it/s]



Processing batches:  27%|██▋       | 109/402 [00:45<01:57,  2.50it/s]



Processing batches:  27%|██▋       | 110/402 [00:45<01:55,  2.52it/s]



Processing batches:  28%|██▊       | 111/402 [00:46<01:56,  2.50it/s]



Processing batches:  28%|██▊       | 112/402 [00:46<01:56,  2.48it/s]



Processing batches:  28%|██▊       | 113/402 [00:46<01:56,  2.48it/s]



Processing batches:  28%|██▊       | 114/402 [00:47<01:56,  2.48it/s]



Processing batches:  29%|██▊       | 115/402 [00:47<01:55,  2.48it/s]



Processing batches:  29%|██▉       | 116/402 [00:48<01:55,  2.47it/s]



Processing batches:  29%|██▉       | 117/402 [00:48<01:54,  2.48it/s]



Processing batches:  29%|██▉       | 118/402 [00:48<01:53,  2.51it/s]



Processing batches:  30%|██▉       | 119/402 [00:49<01:52,  2.52it/s]



Processing batches:  30%|██▉       | 120/402 [00:49<01:51,  2.53it/s]



Processing batches:  30%|███       | 121/402 [00:50<01:50,  2.53it/s]



Processing batches:  30%|███       | 122/402 [00:50<01:51,  2.50it/s]



Processing batches:  31%|███       | 123/402 [00:50<01:52,  2.49it/s]



Processing batches:  31%|███       | 124/402 [00:51<01:52,  2.48it/s]



Processing batches:  31%|███       | 125/402 [00:51<01:51,  2.48it/s]



Processing batches:  31%|███▏      | 126/402 [00:52<01:52,  2.46it/s]



Processing batches:  32%|███▏      | 127/402 [00:52<01:52,  2.45it/s]



Processing batches:  32%|███▏      | 128/402 [00:52<01:51,  2.47it/s]



Processing batches:  32%|███▏      | 129/402 [00:53<01:50,  2.48it/s]



Processing batches:  32%|███▏      | 130/402 [00:53<01:49,  2.48it/s]



Processing batches:  33%|███▎      | 131/402 [00:54<01:49,  2.46it/s]



Processing batches:  33%|███▎      | 132/402 [00:54<01:48,  2.48it/s]



Processing batches:  33%|███▎      | 133/402 [00:54<01:49,  2.45it/s]



Processing batches:  33%|███▎      | 134/402 [00:55<01:49,  2.46it/s]



Processing batches:  34%|███▎      | 135/402 [00:55<01:48,  2.46it/s]



Processing batches:  34%|███▍      | 136/402 [00:56<01:48,  2.45it/s]



Processing batches:  34%|███▍      | 137/402 [00:56<01:48,  2.43it/s]



Processing batches:  34%|███▍      | 138/402 [00:57<01:48,  2.44it/s]



Processing batches:  35%|███▍      | 139/402 [00:57<01:47,  2.44it/s]



Processing batches:  35%|███▍      | 140/402 [00:57<01:46,  2.46it/s]



Processing batches:  35%|███▌      | 141/402 [00:58<01:46,  2.46it/s]



Processing batches:  35%|███▌      | 142/402 [00:58<01:44,  2.48it/s]



Processing batches:  36%|███▌      | 143/402 [00:59<01:44,  2.49it/s]



Processing batches:  36%|███▌      | 144/402 [00:59<01:45,  2.45it/s]



Processing batches:  36%|███▌      | 145/402 [00:59<01:45,  2.44it/s]



Processing batches:  36%|███▋      | 146/402 [01:00<01:45,  2.43it/s]



Processing batches:  37%|███▋      | 147/402 [01:00<01:45,  2.42it/s]



Processing batches:  37%|███▋      | 148/402 [01:01<01:45,  2.41it/s]



Processing batches:  37%|███▋      | 149/402 [01:01<01:44,  2.43it/s]



Processing batches:  37%|███▋      | 150/402 [01:01<01:44,  2.41it/s]



Processing batches:  38%|███▊      | 151/402 [01:02<01:43,  2.42it/s]



Processing batches:  38%|███▊      | 152/402 [01:02<01:42,  2.43it/s]



Processing batches:  38%|███▊      | 153/402 [01:03<01:41,  2.45it/s]



Processing batches:  38%|███▊      | 154/402 [01:03<01:40,  2.47it/s]



Processing batches:  39%|███▊      | 155/402 [01:03<01:41,  2.44it/s]



Processing batches:  39%|███▉      | 156/402 [01:04<01:42,  2.41it/s]



Processing batches:  39%|███▉      | 157/402 [01:04<01:41,  2.42it/s]



Processing batches:  39%|███▉      | 158/402 [01:05<01:40,  2.42it/s]



Processing batches:  40%|███▉      | 159/402 [01:05<01:39,  2.44it/s]



Processing batches:  40%|███▉      | 160/402 [01:06<01:38,  2.46it/s]



Processing batches:  40%|████      | 161/402 [01:06<01:37,  2.47it/s]



Processing batches:  40%|████      | 162/402 [01:06<01:36,  2.48it/s]



Processing batches:  41%|████      | 163/402 [01:07<01:35,  2.49it/s]



Processing batches:  41%|████      | 164/402 [01:07<01:35,  2.50it/s]



Processing batches:  41%|████      | 165/402 [01:08<01:34,  2.51it/s]



Processing batches:  41%|████▏     | 166/402 [01:08<01:34,  2.49it/s]



Processing batches:  42%|████▏     | 167/402 [01:08<01:34,  2.48it/s]



Processing batches:  42%|████▏     | 168/402 [01:09<01:34,  2.48it/s]



Processing batches:  42%|████▏     | 169/402 [01:09<01:33,  2.48it/s]



Processing batches:  42%|████▏     | 170/402 [01:10<01:33,  2.48it/s]



Processing batches:  43%|████▎     | 171/402 [01:10<01:33,  2.47it/s]



Processing batches:  43%|████▎     | 172/402 [01:10<01:33,  2.46it/s]



Processing batches:  43%|████▎     | 173/402 [01:11<01:33,  2.46it/s]



Processing batches:  43%|████▎     | 174/402 [01:11<01:31,  2.48it/s]



Processing batches:  44%|████▎     | 175/402 [01:12<01:31,  2.48it/s]



Processing batches:  44%|████▍     | 176/402 [01:12<01:31,  2.48it/s]



Processing batches:  44%|████▍     | 177/402 [01:12<01:31,  2.45it/s]



Processing batches:  44%|████▍     | 178/402 [01:13<01:31,  2.44it/s]



Processing batches:  45%|████▍     | 179/402 [01:13<01:32,  2.42it/s]



Processing batches:  45%|████▍     | 180/402 [01:14<01:32,  2.41it/s]



Processing batches:  45%|████▌     | 181/402 [01:14<01:32,  2.40it/s]



Processing batches:  45%|████▌     | 182/402 [01:14<01:30,  2.42it/s]



Processing batches:  46%|████▌     | 183/402 [01:15<01:30,  2.43it/s]



Processing batches:  46%|████▌     | 184/402 [01:15<01:29,  2.44it/s]



Processing batches:  46%|████▌     | 185/402 [01:16<01:30,  2.40it/s]



Processing batches:  46%|████▋     | 186/402 [01:16<01:30,  2.40it/s]



Processing batches:  47%|████▋     | 187/402 [01:17<01:28,  2.42it/s]



Processing batches:  47%|████▋     | 188/402 [01:17<01:28,  2.42it/s]



Processing batches:  47%|████▋     | 189/402 [01:17<01:27,  2.43it/s]



Processing batches:  47%|████▋     | 190/402 [01:18<01:27,  2.42it/s]



Processing batches:  48%|████▊     | 191/402 [01:18<01:26,  2.43it/s]



Processing batches:  48%|████▊     | 192/402 [01:19<01:26,  2.43it/s]



Processing batches:  48%|████▊     | 193/402 [01:19<01:25,  2.43it/s]



Processing batches:  48%|████▊     | 194/402 [01:19<01:25,  2.45it/s]



Processing batches:  49%|████▊     | 195/402 [01:20<01:25,  2.43it/s]



Processing batches:  49%|████▉     | 196/402 [01:20<01:24,  2.43it/s]



Processing batches:  49%|████▉     | 197/402 [01:21<01:23,  2.45it/s]



Processing batches:  49%|████▉     | 198/402 [01:21<01:24,  2.42it/s]



Processing batches:  50%|████▉     | 199/402 [01:22<01:26,  2.35it/s]



Processing batches:  50%|████▉     | 200/402 [01:22<01:25,  2.36it/s]



Processing batches:  50%|█████     | 201/402 [01:22<01:24,  2.38it/s]



Processing batches:  50%|█████     | 202/402 [01:23<01:24,  2.37it/s]



Processing batches:  50%|█████     | 203/402 [01:23<01:23,  2.39it/s]



Processing batches:  51%|█████     | 204/402 [01:24<01:22,  2.41it/s]



Processing batches:  51%|█████     | 205/402 [01:24<01:22,  2.37it/s]



Processing batches:  51%|█████     | 206/402 [01:24<01:23,  2.36it/s]



Processing batches:  51%|█████▏    | 207/402 [01:25<01:21,  2.38it/s]



Processing batches:  52%|█████▏    | 208/402 [01:25<01:21,  2.38it/s]



Processing batches:  52%|█████▏    | 209/402 [01:26<01:20,  2.38it/s]



Processing batches:  52%|█████▏    | 210/402 [01:26<01:21,  2.37it/s]



Processing batches:  52%|█████▏    | 211/402 [01:27<01:21,  2.35it/s]



Processing batches:  53%|█████▎    | 212/402 [01:27<01:20,  2.37it/s]



Processing batches:  53%|█████▎    | 213/402 [01:27<01:19,  2.36it/s]



Processing batches:  53%|█████▎    | 214/402 [01:28<01:20,  2.34it/s]



Processing batches:  53%|█████▎    | 215/402 [01:28<01:19,  2.35it/s]



Processing batches:  54%|█████▎    | 216/402 [01:29<01:18,  2.37it/s]



Processing batches:  54%|█████▍    | 217/402 [01:29<01:17,  2.39it/s]



Processing batches:  54%|█████▍    | 218/402 [01:29<01:16,  2.42it/s]



Processing batches:  54%|█████▍    | 219/402 [01:30<01:14,  2.45it/s]



Processing batches:  55%|█████▍    | 220/402 [01:30<01:13,  2.48it/s]



Processing batches:  55%|█████▍    | 221/402 [01:31<01:13,  2.46it/s]



Processing batches:  55%|█████▌    | 222/402 [01:31<01:13,  2.45it/s]



Processing batches:  55%|█████▌    | 223/402 [01:32<01:13,  2.45it/s]



Processing batches:  56%|█████▌    | 224/402 [01:32<01:12,  2.45it/s]



Processing batches:  56%|█████▌    | 225/402 [01:32<01:12,  2.44it/s]



Processing batches:  56%|█████▌    | 226/402 [01:33<01:11,  2.46it/s]



Processing batches:  56%|█████▋    | 227/402 [01:33<01:10,  2.47it/s]



Processing batches:  57%|█████▋    | 228/402 [01:34<01:10,  2.47it/s]



Processing batches:  57%|█████▋    | 229/402 [01:34<01:09,  2.47it/s]



Processing batches:  57%|█████▋    | 230/402 [01:34<01:09,  2.46it/s]



Processing batches:  57%|█████▋    | 231/402 [01:35<01:09,  2.46it/s]



Processing batches:  58%|█████▊    | 232/402 [01:35<01:09,  2.44it/s]



Processing batches:  58%|█████▊    | 233/402 [01:36<01:09,  2.44it/s]



Processing batches:  58%|█████▊    | 234/402 [01:36<01:08,  2.44it/s]



Processing batches:  58%|█████▊    | 235/402 [01:36<01:07,  2.46it/s]



Processing batches:  59%|█████▊    | 236/402 [01:37<01:07,  2.46it/s]



Processing batches:  59%|█████▉    | 237/402 [01:37<01:07,  2.45it/s]



Processing batches:  59%|█████▉    | 238/402 [01:38<01:07,  2.44it/s]



Processing batches:  59%|█████▉    | 239/402 [01:38<01:06,  2.45it/s]



Processing batches:  60%|█████▉    | 240/402 [01:38<01:06,  2.42it/s]



Processing batches:  60%|█████▉    | 241/402 [01:39<01:06,  2.42it/s]



Processing batches:  60%|██████    | 242/402 [01:39<01:05,  2.44it/s]



Processing batches:  60%|██████    | 243/402 [01:40<01:05,  2.43it/s]



Processing batches:  61%|██████    | 244/402 [01:40<01:06,  2.38it/s]



Processing batches:  61%|██████    | 245/402 [01:41<01:06,  2.35it/s]



Processing batches:  61%|██████    | 246/402 [01:41<01:06,  2.36it/s]



Processing batches:  61%|██████▏   | 247/402 [01:41<01:05,  2.36it/s]



Processing batches:  62%|██████▏   | 248/402 [01:42<01:05,  2.36it/s]



Processing batches:  62%|██████▏   | 249/402 [01:42<01:04,  2.39it/s]



Processing batches:  62%|██████▏   | 250/402 [01:43<01:03,  2.41it/s]



Processing batches:  62%|██████▏   | 251/402 [01:43<01:02,  2.44it/s]



Processing batches:  63%|██████▎   | 252/402 [01:43<01:01,  2.45it/s]



Processing batches:  63%|██████▎   | 253/402 [01:44<01:00,  2.45it/s]



Processing batches:  63%|██████▎   | 254/402 [01:44<01:01,  2.43it/s]



Processing batches:  63%|██████▎   | 255/402 [01:45<01:01,  2.40it/s]



Processing batches:  64%|██████▎   | 256/402 [01:45<01:01,  2.39it/s]



Processing batches:  64%|██████▍   | 257/402 [01:46<01:00,  2.38it/s]



Processing batches:  64%|██████▍   | 258/402 [01:46<01:00,  2.39it/s]



Processing batches:  64%|██████▍   | 259/402 [01:46<01:00,  2.38it/s]



Processing batches:  65%|██████▍   | 260/402 [01:47<01:00,  2.36it/s]



Processing batches:  65%|██████▍   | 261/402 [01:47<00:59,  2.36it/s]



Processing batches:  65%|██████▌   | 262/402 [01:48<00:58,  2.39it/s]



Processing batches:  65%|██████▌   | 263/402 [01:48<00:58,  2.38it/s]



Processing batches:  66%|██████▌   | 264/402 [01:48<00:57,  2.40it/s]



Processing batches:  66%|██████▌   | 265/402 [01:49<00:57,  2.37it/s]



Processing batches:  66%|██████▌   | 266/402 [01:49<00:57,  2.35it/s]



Processing batches:  66%|██████▋   | 267/402 [01:50<00:57,  2.36it/s]



Processing batches:  67%|██████▋   | 268/402 [01:50<00:56,  2.38it/s]



Processing batches:  67%|██████▋   | 269/402 [01:51<00:55,  2.38it/s]



Processing batches:  67%|██████▋   | 270/402 [01:51<00:55,  2.38it/s]



Processing batches:  67%|██████▋   | 271/402 [01:51<00:54,  2.39it/s]



Processing batches:  68%|██████▊   | 272/402 [01:52<00:54,  2.39it/s]



Processing batches:  68%|██████▊   | 273/402 [01:52<00:53,  2.39it/s]



Processing batches:  68%|██████▊   | 274/402 [01:53<00:53,  2.41it/s]



Processing batches:  68%|██████▊   | 275/402 [01:53<00:53,  2.38it/s]



Processing batches:  69%|██████▊   | 276/402 [01:54<00:53,  2.37it/s]



Processing batches:  69%|██████▉   | 277/402 [01:54<00:52,  2.37it/s]



Processing batches:  69%|██████▉   | 278/402 [01:54<00:52,  2.37it/s]



Processing batches:  69%|██████▉   | 279/402 [01:55<00:51,  2.37it/s]



Processing batches:  70%|██████▉   | 280/402 [01:55<00:51,  2.37it/s]



Processing batches:  70%|██████▉   | 281/402 [01:56<00:50,  2.37it/s]



Processing batches:  70%|███████   | 282/402 [01:56<00:50,  2.39it/s]



Processing batches:  70%|███████   | 283/402 [01:56<00:49,  2.42it/s]



Processing batches:  71%|███████   | 284/402 [01:57<00:48,  2.44it/s]



Processing batches:  71%|███████   | 285/402 [01:57<00:47,  2.46it/s]



Processing batches:  71%|███████   | 286/402 [01:58<00:46,  2.48it/s]



Processing batches:  71%|███████▏  | 287/402 [01:58<00:46,  2.45it/s]



Processing batches:  72%|███████▏  | 288/402 [01:58<00:46,  2.45it/s]



Processing batches:  72%|███████▏  | 289/402 [01:59<00:46,  2.45it/s]



Processing batches:  72%|███████▏  | 290/402 [01:59<00:45,  2.44it/s]



Processing batches:  72%|███████▏  | 291/402 [02:00<00:45,  2.43it/s]



Processing batches:  73%|███████▎  | 292/402 [02:00<00:45,  2.44it/s]



Processing batches:  73%|███████▎  | 293/402 [02:01<00:44,  2.46it/s]



Processing batches:  73%|███████▎  | 294/402 [02:01<00:43,  2.46it/s]



Processing batches:  73%|███████▎  | 295/402 [02:01<00:43,  2.45it/s]



Processing batches:  74%|███████▎  | 296/402 [02:02<00:43,  2.45it/s]



Processing batches:  74%|███████▍  | 297/402 [02:02<00:42,  2.46it/s]



Processing batches:  74%|███████▍  | 298/402 [02:03<00:42,  2.44it/s]



Processing batches:  74%|███████▍  | 299/402 [02:03<00:42,  2.43it/s]



Processing batches:  75%|███████▍  | 300/402 [02:03<00:42,  2.43it/s]



Processing batches:  75%|███████▍  | 301/402 [02:04<00:41,  2.43it/s]



Processing batches:  75%|███████▌  | 302/402 [02:04<00:41,  2.43it/s]



Processing batches:  75%|███████▌  | 303/402 [02:05<00:40,  2.44it/s]



Processing batches:  76%|███████▌  | 304/402 [02:05<00:39,  2.46it/s]



Processing batches:  76%|███████▌  | 305/402 [02:05<00:39,  2.47it/s]



Processing batches:  76%|███████▌  | 306/402 [02:06<00:38,  2.47it/s]



Processing batches:  76%|███████▋  | 307/402 [02:06<00:38,  2.46it/s]



Processing batches:  77%|███████▋  | 308/402 [02:07<00:38,  2.45it/s]



Processing batches:  77%|███████▋  | 309/402 [02:07<00:38,  2.42it/s]



Processing batches:  77%|███████▋  | 310/402 [02:08<00:37,  2.43it/s]



Processing batches:  77%|███████▋  | 311/402 [02:08<00:37,  2.42it/s]



Processing batches:  78%|███████▊  | 312/402 [02:08<00:37,  2.42it/s]



Processing batches:  78%|███████▊  | 313/402 [02:09<00:36,  2.42it/s]



Processing batches:  78%|███████▊  | 314/402 [02:09<00:36,  2.44it/s]



Processing batches:  78%|███████▊  | 315/402 [02:10<00:35,  2.44it/s]



Processing batches:  79%|███████▊  | 316/402 [02:10<00:35,  2.44it/s]



Processing batches:  79%|███████▉  | 317/402 [02:10<00:34,  2.46it/s]



Processing batches:  79%|███████▉  | 318/402 [02:11<00:33,  2.47it/s]



Processing batches:  79%|███████▉  | 319/402 [02:11<00:33,  2.49it/s]



Processing batches:  80%|███████▉  | 320/402 [02:12<00:33,  2.46it/s]



Processing batches:  80%|███████▉  | 321/402 [02:12<00:33,  2.44it/s]



Processing batches:  80%|████████  | 322/402 [02:12<00:32,  2.44it/s]



Processing batches:  80%|████████  | 323/402 [02:13<00:32,  2.42it/s]



Processing batches:  81%|████████  | 324/402 [02:13<00:32,  2.42it/s]



Processing batches:  81%|████████  | 325/402 [02:14<00:31,  2.41it/s]



Processing batches:  81%|████████  | 326/402 [02:14<00:31,  2.43it/s]



Processing batches:  81%|████████▏ | 327/402 [02:14<00:30,  2.45it/s]



Processing batches:  82%|████████▏ | 328/402 [02:15<00:30,  2.45it/s]



Processing batches:  82%|████████▏ | 329/402 [02:15<00:29,  2.45it/s]



Processing batches:  82%|████████▏ | 330/402 [02:16<00:29,  2.48it/s]



Processing batches:  82%|████████▏ | 331/402 [02:16<00:28,  2.45it/s]



Processing batches:  83%|████████▎ | 332/402 [02:17<00:28,  2.44it/s]



Processing batches:  83%|████████▎ | 333/402 [02:17<00:28,  2.44it/s]



Processing batches:  83%|████████▎ | 334/402 [02:17<00:27,  2.43it/s]



Processing batches:  83%|████████▎ | 335/402 [02:18<00:27,  2.43it/s]



Processing batches:  84%|████████▎ | 336/402 [02:18<00:27,  2.41it/s]



Processing batches:  84%|████████▍ | 337/402 [02:19<00:27,  2.39it/s]



Processing batches:  84%|████████▍ | 338/402 [02:19<00:26,  2.41it/s]



Processing batches:  84%|████████▍ | 339/402 [02:19<00:25,  2.43it/s]



Processing batches:  85%|████████▍ | 340/402 [02:20<00:25,  2.44it/s]



Processing batches:  85%|████████▍ | 341/402 [02:20<00:24,  2.44it/s]



Processing batches:  85%|████████▌ | 342/402 [02:21<00:24,  2.44it/s]



Processing batches:  85%|████████▌ | 343/402 [02:21<00:24,  2.44it/s]



Processing batches:  86%|████████▌ | 344/402 [02:21<00:23,  2.43it/s]



Processing batches:  86%|████████▌ | 345/402 [02:22<00:23,  2.44it/s]



Processing batches:  86%|████████▌ | 346/402 [02:22<00:22,  2.44it/s]



Processing batches:  86%|████████▋ | 347/402 [02:23<00:22,  2.44it/s]



Processing batches:  87%|████████▋ | 348/402 [02:23<00:22,  2.43it/s]



Processing batches:  87%|████████▋ | 349/402 [02:24<00:21,  2.41it/s]



Processing batches:  87%|████████▋ | 350/402 [02:24<00:21,  2.41it/s]



Processing batches:  87%|████████▋ | 351/402 [02:24<00:20,  2.43it/s]



Processing batches:  88%|████████▊ | 352/402 [02:25<00:20,  2.45it/s]



Processing batches:  88%|████████▊ | 353/402 [02:25<00:20,  2.43it/s]



Processing batches:  88%|████████▊ | 354/402 [02:26<00:19,  2.43it/s]



Processing batches:  88%|████████▊ | 355/402 [02:26<00:19,  2.43it/s]



Processing batches:  89%|████████▊ | 356/402 [02:26<00:19,  2.42it/s]



Processing batches:  89%|████████▉ | 357/402 [02:27<00:18,  2.41it/s]



Processing batches:  89%|████████▉ | 358/402 [02:27<00:18,  2.41it/s]



Processing batches:  89%|████████▉ | 359/402 [02:28<00:17,  2.42it/s]



Processing batches:  90%|████████▉ | 360/402 [02:28<00:17,  2.42it/s]



Processing batches:  90%|████████▉ | 361/402 [02:28<00:16,  2.45it/s]



Processing batches:  90%|█████████ | 362/402 [02:29<00:16,  2.47it/s]



Processing batches:  90%|█████████ | 363/402 [02:29<00:15,  2.47it/s]



Processing batches:  91%|█████████ | 364/402 [02:30<00:15,  2.44it/s]



Processing batches:  91%|█████████ | 365/402 [02:30<00:15,  2.44it/s]



Processing batches:  91%|█████████ | 366/402 [02:30<00:14,  2.43it/s]



Processing batches:  91%|█████████▏| 367/402 [02:31<00:14,  2.43it/s]



Processing batches:  92%|█████████▏| 368/402 [02:31<00:13,  2.43it/s]



Processing batches:  92%|█████████▏| 369/402 [02:32<00:13,  2.44it/s]



Processing batches:  92%|█████████▏| 370/402 [02:32<00:13,  2.41it/s]



Processing batches:  92%|█████████▏| 371/402 [02:33<00:12,  2.42it/s]



Processing batches:  93%|█████████▎| 372/402 [02:33<00:12,  2.45it/s]



Processing batches:  93%|█████████▎| 373/402 [02:33<00:11,  2.45it/s]



Processing batches:  93%|█████████▎| 374/402 [02:34<00:11,  2.47it/s]



Processing batches:  93%|█████████▎| 375/402 [02:34<00:11,  2.45it/s]



Processing batches:  94%|█████████▎| 376/402 [02:35<00:10,  2.43it/s]



Processing batches:  94%|█████████▍| 377/402 [02:35<00:10,  2.44it/s]



Processing batches:  94%|█████████▍| 378/402 [02:35<00:09,  2.44it/s]



Processing batches:  94%|█████████▍| 379/402 [02:36<00:09,  2.43it/s]



Processing batches:  95%|█████████▍| 380/402 [02:36<00:09,  2.43it/s]



Processing batches:  95%|█████████▍| 381/402 [02:37<00:08,  2.42it/s]



Processing batches:  95%|█████████▌| 382/402 [02:37<00:08,  2.41it/s]



Processing batches:  95%|█████████▌| 383/402 [02:37<00:07,  2.42it/s]



Processing batches:  96%|█████████▌| 384/402 [02:38<00:07,  2.43it/s]



Processing batches:  96%|█████████▌| 385/402 [02:38<00:06,  2.44it/s]



Processing batches:  96%|█████████▌| 386/402 [02:39<00:06,  2.41it/s]



Processing batches:  96%|█████████▋| 387/402 [02:39<00:06,  2.41it/s]



Processing batches:  97%|█████████▋| 388/402 [02:40<00:05,  2.41it/s]



Processing batches:  97%|█████████▋| 389/402 [02:40<00:05,  2.41it/s]



Processing batches:  97%|█████████▋| 390/402 [02:40<00:04,  2.42it/s]



Processing batches:  97%|█████████▋| 391/402 [02:41<00:04,  2.42it/s]



Processing batches:  98%|█████████▊| 392/402 [02:41<00:04,  2.43it/s]



Processing batches:  98%|█████████▊| 393/402 [02:42<00:03,  2.43it/s]



Processing batches:  98%|█████████▊| 394/402 [02:42<00:03,  2.43it/s]



Processing batches:  98%|█████████▊| 395/402 [02:42<00:02,  2.45it/s]



Processing batches:  99%|█████████▊| 396/402 [02:43<00:02,  2.45it/s]



Processing batches:  99%|█████████▉| 397/402 [02:43<00:02,  2.43it/s]



Processing batches:  99%|█████████▉| 398/402 [02:44<00:01,  2.43it/s]



Processing batches:  99%|█████████▉| 399/402 [02:44<00:01,  2.42it/s]



Processing batches: 100%|█████████▉| 400/402 [02:45<00:00,  2.40it/s]



Processing batches: 100%|█████████▉| 401/402 [02:45<00:00,  2.37it/s]



Processing batches: 100%|██████████| 402/402 [02:45<00:00,  2.42it/s]
Processing batches:   0%|          | 0/119 [00:00<?, ?it/s]



Processing batches:   1%|          | 1/119 [00:00<00:48,  2.43it/s]



Processing batches:   2%|▏         | 2/119 [00:00<00:48,  2.44it/s]



Processing batches:   3%|▎         | 3/119 [00:01<00:47,  2.42it/s]



Processing batches:   3%|▎         | 4/119 [00:01<00:47,  2.40it/s]



Processing batches:   4%|▍         | 5/119 [00:02<00:47,  2.39it/s]



Processing batches:   5%|▌         | 6/119 [00:02<00:47,  2.36it/s]



Processing batches:   6%|▌         | 7/119 [00:02<00:47,  2.36it/s]



Processing batches:   7%|▋         | 8/119 [00:03<00:47,  2.36it/s]



Processing batches:   8%|▊         | 9/119 [00:03<00:46,  2.37it/s]



Processing batches:   8%|▊         | 10/119 [00:04<00:46,  2.36it/s]



Processing batches:   9%|▉         | 11/119 [00:04<00:45,  2.36it/s]



Processing batches:  10%|█         | 12/119 [00:05<00:45,  2.35it/s]



Processing batches:  11%|█         | 13/119 [00:05<00:45,  2.35it/s]



Processing batches:  12%|█▏        | 14/119 [00:05<00:44,  2.37it/s]



Processing batches:  13%|█▎        | 15/119 [00:06<00:43,  2.40it/s]



Processing batches:  13%|█▎        | 16/119 [00:06<00:42,  2.43it/s]



Processing batches:  14%|█▍        | 17/119 [00:07<00:41,  2.45it/s]



Processing batches:  15%|█▌        | 18/119 [00:07<00:40,  2.47it/s]



Processing batches:  16%|█▌        | 19/119 [00:07<00:41,  2.44it/s]



Processing batches:  17%|█▋        | 20/119 [00:08<00:40,  2.43it/s]



Processing batches:  18%|█▊        | 21/119 [00:08<00:40,  2.43it/s]



Processing batches:  18%|█▊        | 22/119 [00:09<00:39,  2.46it/s]



Processing batches:  19%|█▉        | 23/119 [00:09<00:39,  2.44it/s]



Processing batches:  20%|██        | 24/119 [00:09<00:38,  2.45it/s]



Processing batches:  21%|██        | 25/119 [00:10<00:38,  2.44it/s]



Processing batches:  22%|██▏       | 26/119 [00:10<00:38,  2.44it/s]



Processing batches:  23%|██▎       | 27/119 [00:11<00:37,  2.43it/s]



Processing batches:  24%|██▎       | 28/119 [00:11<00:37,  2.44it/s]



Processing batches:  24%|██▍       | 29/119 [00:12<00:37,  2.42it/s]



Processing batches:  25%|██▌       | 30/119 [00:12<00:37,  2.39it/s]



Processing batches:  26%|██▌       | 31/119 [00:12<00:36,  2.40it/s]



Processing batches:  27%|██▋       | 32/119 [00:13<00:36,  2.40it/s]



Processing batches:  28%|██▊       | 33/119 [00:13<00:35,  2.40it/s]



Processing batches:  29%|██▊       | 34/119 [00:14<00:35,  2.40it/s]



Processing batches:  29%|██▉       | 35/119 [00:14<00:35,  2.40it/s]



Processing batches:  30%|███       | 36/119 [00:14<00:34,  2.39it/s]



Processing batches:  31%|███       | 37/119 [00:15<00:34,  2.39it/s]



Processing batches:  32%|███▏      | 38/119 [00:15<00:33,  2.40it/s]



Processing batches:  33%|███▎      | 39/119 [00:16<00:32,  2.43it/s]



Processing batches:  34%|███▎      | 40/119 [00:16<00:32,  2.45it/s]



Processing batches:  34%|███▍      | 41/119 [00:17<00:32,  2.43it/s]



Processing batches:  35%|███▌      | 42/119 [00:17<00:31,  2.42it/s]



Processing batches:  36%|███▌      | 43/119 [00:17<00:31,  2.42it/s]



Processing batches:  37%|███▋      | 44/119 [00:18<00:31,  2.42it/s]



Processing batches:  38%|███▊      | 45/119 [00:18<00:30,  2.41it/s]



Processing batches:  39%|███▊      | 46/119 [00:19<00:30,  2.41it/s]



Processing batches:  39%|███▉      | 47/119 [00:19<00:29,  2.45it/s]



Processing batches:  40%|████      | 48/119 [00:19<00:28,  2.46it/s]



Processing batches:  41%|████      | 49/119 [00:20<00:28,  2.47it/s]



Processing batches:  42%|████▏     | 50/119 [00:20<00:27,  2.49it/s]



Processing batches:  43%|████▎     | 51/119 [00:21<00:27,  2.50it/s]



Processing batches:  44%|████▎     | 52/119 [00:21<00:27,  2.45it/s]



Processing batches:  45%|████▍     | 53/119 [00:22<00:29,  2.22it/s]



Processing batches:  45%|████▌     | 54/119 [00:22<00:28,  2.27it/s]



Processing batches:  46%|████▌     | 55/119 [00:22<00:27,  2.29it/s]



Processing batches:  47%|████▋     | 56/119 [00:23<00:27,  2.32it/s]



Processing batches:  48%|████▊     | 57/119 [00:23<00:26,  2.34it/s]



Processing batches:  49%|████▊     | 58/119 [00:24<00:25,  2.36it/s]



Processing batches:  50%|████▉     | 59/119 [00:24<00:25,  2.38it/s]



Processing batches:  50%|█████     | 60/119 [00:24<00:24,  2.39it/s]



Processing batches:  51%|█████▏    | 61/119 [00:25<00:24,  2.37it/s]



Processing batches:  52%|█████▏    | 62/119 [00:25<00:24,  2.37it/s]



Processing batches:  53%|█████▎    | 63/119 [00:26<00:23,  2.35it/s]



Processing batches:  54%|█████▍    | 64/119 [00:26<00:23,  2.34it/s]



Processing batches:  55%|█████▍    | 65/119 [00:27<00:22,  2.36it/s]



Processing batches:  55%|█████▌    | 66/119 [00:27<00:22,  2.36it/s]



Processing batches:  56%|█████▋    | 67/119 [00:27<00:22,  2.36it/s]



Processing batches:  57%|█████▋    | 68/119 [00:28<00:21,  2.35it/s]



Processing batches:  58%|█████▊    | 69/119 [00:28<00:21,  2.35it/s]



Processing batches:  59%|█████▉    | 70/119 [00:29<00:20,  2.37it/s]



Processing batches:  60%|█████▉    | 71/119 [00:29<00:20,  2.36it/s]



Processing batches:  61%|██████    | 72/119 [00:30<00:19,  2.37it/s]



Processing batches:  61%|██████▏   | 73/119 [00:30<00:19,  2.37it/s]



Processing batches:  62%|██████▏   | 74/119 [00:30<00:18,  2.38it/s]



Processing batches:  63%|██████▎   | 75/119 [00:31<00:18,  2.38it/s]



Processing batches:  64%|██████▍   | 76/119 [00:31<00:18,  2.37it/s]



Processing batches:  65%|██████▍   | 77/119 [00:32<00:18,  2.32it/s]



Processing batches:  66%|██████▌   | 78/119 [00:32<00:18,  2.28it/s]



Processing batches:  66%|██████▋   | 79/119 [00:33<00:17,  2.30it/s]



Processing batches:  67%|██████▋   | 80/119 [00:33<00:16,  2.33it/s]



Processing batches:  68%|██████▊   | 81/119 [00:33<00:16,  2.35it/s]



Processing batches:  69%|██████▉   | 82/119 [00:34<00:15,  2.31it/s]



Processing batches:  70%|██████▉   | 83/119 [00:34<00:15,  2.31it/s]



Processing batches:  71%|███████   | 84/119 [00:35<00:14,  2.34it/s]



Processing batches:  71%|███████▏  | 85/119 [00:35<00:14,  2.35it/s]



Processing batches:  72%|███████▏  | 86/119 [00:36<00:14,  2.32it/s]



Processing batches:  73%|███████▎  | 87/119 [00:36<00:13,  2.30it/s]



Processing batches:  74%|███████▍  | 88/119 [00:36<00:13,  2.27it/s]



Processing batches:  75%|███████▍  | 89/119 [00:37<00:13,  2.24it/s]



Processing batches:  76%|███████▌  | 90/119 [00:37<00:12,  2.23it/s]



Processing batches:  76%|███████▋  | 91/119 [00:38<00:12,  2.23it/s]



Processing batches:  77%|███████▋  | 92/119 [00:38<00:12,  2.23it/s]



Processing batches:  78%|███████▊  | 93/119 [00:39<00:11,  2.23it/s]



Processing batches:  79%|███████▉  | 94/119 [00:39<00:11,  2.24it/s]



Processing batches:  80%|███████▉  | 95/119 [00:40<00:10,  2.21it/s]



Processing batches:  81%|████████  | 96/119 [00:40<00:10,  2.20it/s]



Processing batches:  82%|████████▏ | 97/119 [00:41<00:09,  2.21it/s]



Processing batches:  82%|████████▏ | 98/119 [00:41<00:09,  2.21it/s]



Processing batches:  83%|████████▎ | 99/119 [00:41<00:09,  2.21it/s]



Processing batches:  84%|████████▍ | 100/119 [00:42<00:08,  2.20it/s]



Processing batches:  85%|████████▍ | 101/119 [00:42<00:08,  2.18it/s]



Processing batches:  86%|████████▌ | 102/119 [00:43<00:07,  2.16it/s]



Processing batches:  87%|████████▋ | 103/119 [00:43<00:07,  2.18it/s]



Processing batches:  87%|████████▋ | 104/119 [00:44<00:06,  2.17it/s]



Processing batches:  88%|████████▊ | 105/119 [00:44<00:06,  2.20it/s]



Processing batches:  89%|████████▉ | 106/119 [00:45<00:05,  2.24it/s]



Processing batches:  90%|████████▉ | 107/119 [00:45<00:05,  2.32it/s]



Processing batches:  91%|█████████ | 108/119 [00:45<00:04,  2.30it/s]



Processing batches:  92%|█████████▏| 109/119 [00:46<00:04,  2.27it/s]



Processing batches:  92%|█████████▏| 110/119 [00:46<00:03,  2.29it/s]



Processing batches:  93%|█████████▎| 111/119 [00:47<00:03,  2.29it/s]



Processing batches:  94%|█████████▍| 112/119 [00:47<00:03,  2.33it/s]



Processing batches:  95%|█████████▍| 113/119 [00:48<00:02,  2.35it/s]



Processing batches:  96%|█████████▌| 114/119 [00:48<00:02,  2.31it/s]



Processing batches:  97%|█████████▋| 115/119 [00:49<00:01,  2.34it/s]



Processing batches:  97%|█████████▋| 116/119 [00:49<00:01,  2.36it/s]



Processing batches:  98%|█████████▊| 117/119 [00:49<00:00,  2.36it/s]



Processing batches:  99%|█████████▉| 118/119 [00:50<00:00,  2.35it/s]



Processing batches: 100%|██████████| 119/119 [00:50<00:00,  2.35it/s]


In [21]:
img_train_features = pd.read_csv("train_img_features.csv")
img_test_features = pd.read_csv("test_img_features.csv")

In [23]:
# Merge with image features
merged_train_data = text_train_data.merge(img_train_features, on="PetID", how="left")
print(merged_train_data.head(1))

merged_test_data = text_test_data.merge(img_test_features, on="PetID", how="left")
print(merged_test_data.head(1))

       PetID                                        Description  \
0  d3b4f29f8  Mayleen and Flo are two lovely adorable sister...   

   AdoptionSpeed         0         1         2        3         4         5  \
0              2  0.000423  0.004769  0.003115  0.00386  0.083004  0.374298   

          6  ...      1014      1015      1016      1017      1018      1019  \
0  0.000869  ...  0.445891  0.227626  1.420346  0.654387  0.177218  1.166738   

       1020      1021      1022      1023  
0  0.971381  0.659021  0.245677  1.545983  

[1 rows x 1027 columns]
       PetID                                        Description         0  \
0  6697a7f62  This cute little puppy is looking for a loving...  0.000532   

          1         2         3         4         5         6         7  ...  \
0  0.004337  0.002847  0.002838  0.074986  0.262885  0.000559  0.001634  ...   

       1014      1015      1016      1017      1018      1019      1020  \
0  0.109452  0.107509  2.047832  0.653996