# Полный запуск решения

## Импорт библиотек

In [None]:
import os
import pickle
from pathlib import Path
from typing import List, Optional

import numpy as np
import pandas as pd
import yaml
from loguru import logger
from PIL import Image
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

# Path of the YAML configuration file opened by the pipeline cells below.
config_path = 'config.yaml'

## Получение эмбеддингов

In [None]:
def load_images_from_folder(output_folder: str) -> List[np.ndarray]:
    """Load every image file found directly inside a folder as a NumPy array.

    Entries are visited in sorted filename order, so the position of an array
    in the result follows the sorted order of the images that could be opened.

    Args:
        output_folder: Directory to scan (not recursive).

    Returns:
        One array per successfully opened image. Directories, files with a
        non-image extension and images that fail to open are skipped and
        logged, so the result may be shorter than the directory listing.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
    frames = []
    for frame_file in sorted(os.listdir(output_folder)):
        frame_path = os.path.join(output_folder, frame_file)
        if not (os.path.isfile(frame_path) and frame_file.lower().endswith(image_extensions)):
            # Skipping an unrelated entry is expected, hence a warning only.
            logger.warning(f"Skipping directory or non-image file: {frame_path}")
            continue
        try:
            # The context manager closes the file handle once the pixel data
            # has been copied into the array.
            with Image.open(frame_path) as img:
                frames.append(np.array(img))
        except Exception as e:
            # Deliberately best-effort: one unreadable image must not abort
            # the whole run.
            logger.error(f"Error opening {frame_path}: {e}")

    return frames

def save_embeddings(embeddings, filename, output_folder):
    """Pickle ``embeddings`` to ``<output_folder>/<filename>.pkl``.

    Args:
        embeddings: Any picklable object holding the embeddings.
        filename: File name without extension; ``.pkl`` is appended.
        output_folder: Destination directory; created (with parents) when it
            does not exist yet.
    """
    output_dir = Path(output_folder)
    # Create the folder up front so a missing directory does not discard
    # embeddings that were expensive to compute.
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{filename}.pkl"
    with open(output_path, 'wb') as f:
        pickle.dump(embeddings, f)
    logger.info(f"Saved embeddings to {output_path}")

def vectorize_images(images: List[np.ndarray], model: SentenceTransformer) -> List[np.ndarray]:
    """Encode each image array with ``model``, one ``encode`` call per image.

    Args:
        images: Image arrays; each is converted to a PIL image before encoding.
        model: Model whose ``encode`` method is applied to every image.

    Returns:
        The ``model.encode`` results, in the same order as ``images``.
    """
    vectors = []
    # tqdm only wraps the iterable to display progress.
    for pixels in tqdm(images):
        pil_image = Image.fromarray(pixels)
        vectors.append(model.encode(pil_image))
    return vectors

In [None]:
# Read the run configuration. The file is decoded as UTF-8 explicitly so the
# result does not depend on the platform's default locale encoding.
with open(config_path, 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)
logger.info("Loaded configuration from {}", config_path)

# Load both image sets fully into memory (sorted by filename).
test_images = load_images_from_folder(config['test_images_folder'])
logger.info("Loaded test: {}", config['test_images_folder'])
train_images = load_images_from_folder(config['train_images_folder'])
logger.info("Loaded train: {}", config['train_images_folder'])

model = SentenceTransformer(config['model_name'])
logger.info("Loaded model: {}", config['model_name'])

# One embedding per image, in the same order as the loaded images.
test_embeddings = vectorize_images(test_images, model)
train_embeddings = vectorize_images(train_images, model)

# The 'test'/'train' substrings in these file names are what the search step
# uses to tell the two pickles apart when reading them back.
save_embeddings(test_embeddings, 'test_emb', config['emb_output_folder'])
save_embeddings(train_embeddings, 'train_emb', config['emb_output_folder'])

logger.info("Saved embeddings for test and train images.")


## Поиск похожих

In [None]:
def load_embeddings_from_folder(folder: str) -> tuple[List[np.ndarray], List[np.ndarray]]:
    """Load pickled embeddings from a folder, split into test and train.

    A file is classified by its (case-insensitive) name: names containing
    ``test`` go to the test list, otherwise names containing ``train`` go to
    the train list. Files matching neither, and directories, are ignored
    without being opened. Files are visited in sorted name order so the
    result is deterministic.

    Args:
        folder: Directory holding the pickle files.

    Returns:
        ``(test_embeddings, train_embeddings)``; each list holds one
        unpickled object per matching file.
    """
    test_embeddings = []
    train_embeddings = []
    for filename in sorted(os.listdir(folder)):
        emb_path = os.path.join(folder, filename)
        if not os.path.isfile(emb_path):
            continue

        # Decide the destination before unpickling so unrelated files in the
        # folder are never fed to pickle.
        lowered = filename.lower()
        if 'test' in lowered:
            target = test_embeddings
        elif 'train' in lowered:
            target = train_embeddings
        else:
            continue

        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at folders whose contents were produced by this pipeline.
        with open(emb_path, 'rb') as f:
            target.append(pickle.load(f))

    return test_embeddings, train_embeddings


def load_image_filenames(images_folder: str) -> List[str]:
    """Return the sorted names of the image files inside a folder.

    The filter (regular file, extension in png/jpg/jpeg/gif/bmp, compared
    case-insensitively) and the sorted order are the same ones
    ``load_images_from_folder`` applies. Callers index this list with
    positions derived from those images, so label files such as ``.txt`` or
    ``.bbox`` must not be included.

    NOTE(review): an image that fails to open is dropped by
    ``load_images_from_folder`` but still listed here, which would shift the
    positions — confirm all images are readable.

    Args:
        images_folder: Directory to scan (not recursive).

    Returns:
        Image file names (without the directory part) in sorted order.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
    return [
        filename
        for filename in sorted(os.listdir(images_folder))
        if filename.lower().endswith(image_extensions)
        and os.path.isfile(os.path.join(images_folder, filename))
    ]

def find_nearest_neighbors(test_embeddings: List[np.ndarray], 
                           train_embeddings: List[np.ndarray], 
                           n_neighbors: int, 
                           threshold: float) -> List[Optional[int]]:
    """Find, for every test embedding, the closest train embedding within a threshold.

    Args:
        test_embeddings: Sequence whose FIRST element is the collection of
            test vectors; further elements are ignored.
        train_embeddings: Sequence whose FIRST element is the collection of
            train vectors; further elements are ignored.
        n_neighbors: Number of neighbours to query per test vector; capped at
            the number of train vectors.
        threshold: A neighbour is accepted only if its distance is strictly
            below this value.

    Returns:
        One entry per test vector: the index (plain ``int``) of the first
        queried neighbour closer than ``threshold``, or ``None`` when no
        queried neighbour is close enough. Index ``0`` is a valid match, so
        callers must compare against ``None`` rather than test truthiness.
    """
    # Only element 0 is used: presumably each argument is the list returned by
    # load_embeddings_from_folder with a single pickle file per split — verify
    # against the caller.
    test_matrix = np.array(test_embeddings)[0]
    train_matrix = np.array(train_embeddings)[0]

    # Asking for more neighbours than there are train samples makes
    # kneighbors raise, so cap the request.
    k = min(n_neighbors, len(train_matrix))
    nn = NearestNeighbors(n_neighbors=k, algorithm='ball_tree')
    nn.fit(train_matrix)

    if len(test_matrix) == 0:
        return []

    # A single batched query replaces one kneighbors call per test vector.
    distances, indices = nn.kneighbors(test_matrix)

    neighbors_indices = []
    for row_distances, row_indices in zip(distances, indices):
        match = next(
            (int(idx) for dist, idx in zip(row_distances, row_indices) if dist < threshold),
            None,
        )
        neighbors_indices.append(match)

    return neighbors_indices


def load_labels(labels_folder: str, file_extension: str, train_filenames: List[str]) -> List[str]:
    """Load label files that have the given extension and a matching train file.

    A label file is selected when the text after its last dot equals
    ``file_extension`` (a leading dot is optional) and the text before its
    first dot equals the text before the first dot of some entry in
    ``train_filenames``. Directories are skipped.

    NOTE(review): labels come back in sorted label-filename order and only for
    files that exist, so positions line up with ``train_filenames`` only if
    every train file has exactly one label — confirm against the data.

    Args:
        labels_folder: Directory containing the label files.
        file_extension: Extension to keep, e.g. ``'.txt'`` or ``'txt'``.
        train_filenames: File names whose base names select the labels.

    Returns:
        The content of every selected file, with a trailing newline added to
        the last line when the file does not end with one.
    """
    wanted_extension = file_extension.lstrip('.')
    # A set gives constant-time lookups for each directory entry.
    train_basenames = {filename.split('.')[0] for filename in train_filenames}

    labels = []
    for filename in sorted(os.listdir(labels_folder)):
        parts = filename.split('.')
        if parts[-1] != wanted_extension or parts[0] not in train_basenames:
            continue

        label_path = os.path.join(labels_folder, filename)
        if not os.path.isfile(label_path):
            # A directory whose name looks like a label file cannot be read.
            continue

        with open(label_path, 'r') as file:
            # Keep each line and append '\n' where it is missing.
            content = ''.join(line if line.endswith('\n') else line + '\n' for line in file)
        labels.append(content)

    return labels

In [None]:
# Read the run configuration. The file is decoded as UTF-8 explicitly so the
# result does not depend on the platform's default locale encoding.
with open(config_path, 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)
logger.info("Loaded configuration from {}", config_path)

test_embeddings, train_embeddings = load_embeddings_from_folder(config['emb_output_folder'])
logger.info("Embeddings were read")

test_filenames = load_image_filenames(config['test_images_folder'])
train_filenames = load_image_filenames(config['train_images_folder'])

train_labels = load_labels(config['train_labels_folder'], '.txt', train_filenames)
train_labels_with_text = load_labels(config['train_labels_with_text_folder'], '.bbox', train_filenames)
logger.info("train_labels and train_labels_with_text were read")

logger.info("Test image filenames were read")

n_neighbors = config['n_neighbors']
threshold = config['threshold']

nearest_neighbors = find_nearest_neighbors(test_embeddings, train_embeddings, n_neighbors, threshold)
logger.info(f"Neighbours were found - {nearest_neighbors}")

# NOTE(review): the lookups below assume test_filenames, train_labels and
# train_labels_with_text are position-aligned with the embeddings — confirm.
results = []
for test_idx, neighbor_idx in enumerate(nearest_neighbors):
    # Compare with None explicitly: train index 0 is a valid match but falsy.
    if neighbor_idx is not None:
        results.append([
            test_filenames[test_idx],
            train_labels[neighbor_idx],
            train_labels_with_text[neighbor_idx]
        ])
    else:
        results.append([
            test_filenames[test_idx],
            None,
            None
        ])

# Build the DataFrame and save it to Excel.
df = pd.DataFrame(results, columns=['Test_Embedding', 'Label', 'Label_With_Text'])
df.to_excel(config['output_excel'], index=False)
logger.info("Saved results to Excel: {}", config['output_excel'])
