# Imports & configuration

In [None]:
import json
import os
import logging
import time
import random
import re
import gc
import shutil
import warnings
from typing import Dict, List, Any, Callable, Set, Tuple
from collections import defaultdict
from functools import lru_cache
import pandas as pd
import numpy as np
import requests
import openml
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_fixed
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import HDBSCAN, KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sentence_transformers import SentenceTransformer
from gensim.models import Word2Vec
import chardet
import ipywidgets as widgets
from IPython.display import display, clear_output

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# NOTE(review): presumably meant to cap OpenMP threading in the numeric
# libraries; it is set after numpy/sklearn were imported above, so confirm
# that it still takes effect.
os.environ["OMP_NUM_THREADS"] = "1"

In [None]:
# Logging: INFO-level records are sent both to 'dataset_processing.log'
# and to a stream handler, using a "time - level - message" layout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('dataset_processing.log'),
        logging.StreamHandler()
    ]
)
# Module-level logger shared by the functions below.
logger = logging.getLogger(__name__)

# Loading datasets from OpenML

In [None]:
# Root directory for downloaded datasets (also used as the OpenML cache dir).
OUTPUT_DIR = "all_datasets"
# JSON file that receives the sorted list of unique tags.
TAG_FILE = "unique_tags.json"
# Maximum number of tagged datasets to collect and download.
MAX_DATASETS = 100
# Maximum number of OpenML listing entries inspected while looking for tags.
MAX_CHECK_LIMIT = 1000
# NOTE(review): this assigns the literal string "OPENML_API_KEY", not the value
# of an environment variable — presumably a redacted placeholder; confirm.
openml.config.apikey = "OPENML_API_KEY"
openml.config.cache_directory = OUTPUT_DIR

In [None]:
@retry(stop=stop_after_attempt(3), wait=wait_fixed(20))
def get_dataset_ids_with_tags():
    """Collect IDs of OpenML datasets that carry at least one tag.

    The dataset listing is paged in batches of 100 until either
    ``MAX_CHECK_LIMIT`` listing entries have been inspected or
    ``MAX_DATASETS`` tagged datasets have been found.

    Returns:
        A list with at most ``MAX_DATASETS`` dataset IDs. An empty list is
        returned when the listing itself fails.

    NOTE(review): every exception is caught inside the function, so the
    ``@retry`` decorator never sees a failure and never retries.
    """
    try:
        batch_size = 100
        tagged_ids = []
        total_pages = (MAX_CHECK_LIMIT + batch_size - 1) // batch_size

        for page in range(total_pages):
            offset = page * batch_size
            # The last page may be shorter than a full batch.
            current_limit = min(batch_size, MAX_CHECK_LIMIT - offset)

            datasets = openml.datasets.list_datasets(
                output_format="dataframe",
                offset=offset,
                limit=current_limit
            )

            if datasets.empty:
                break

            for did in tqdm(datasets['did'], desc=f"Проверка партии {page+1}/{total_pages}"):
                try:
                    dataset = openml.datasets.get_dataset(did, download_data=False)
                    tags = getattr(dataset, 'tag', None) or []
                    if tags:
                        tagged_ids.append(did)
                        if len(tagged_ids) >= MAX_DATASETS:
                            return tagged_ids
                    # Small pause between metadata requests.
                    time.sleep(0.3)
                except Exception as e:
                    # Best effort: one broken dataset must not stop the scan,
                    # but the failure is recorded instead of silently dropped.
                    logger.warning(f"Skipping dataset {did}: {e}")
                    continue

            if len(tagged_ids) >= MAX_DATASETS:
                break

        return tagged_ids[:MAX_DATASETS]

    except Exception as e:
        logger.error(f"Ошибка получения ID: {str(e)}")
        return []

@retry(stop=stop_after_attempt(3), wait=wait_fixed(20))
def process_dataset(dataset_id):
    """Download one OpenML dataset and store it with its metadata.

    Writes ``<OUTPUT_DIR>/<dataset_id>/full_dataset.csv`` and
    ``<OUTPUT_DIR>/<dataset_id>/metadata.json``.

    Args:
        dataset_id: OpenML dataset identifier.

    Returns:
        ``(True, tags)`` on success, ``(False, [])`` on any failure.

    NOTE(review): every exception is caught inside the function, so the
    ``@retry`` decorator never sees a failure and never retries.
    """
    try:
        dataset = openml.datasets.get_dataset(dataset_id)
        data = dataset.get_data(dataset_format="dataframe")[0]

        # Tags: a missing or empty attribute becomes an empty list. A single
        # tag delivered as a bare string is wrapped, because callers iterate
        # the result and would otherwise process it character by character.
        tags = getattr(dataset, 'tag', None) or []
        if isinstance(tags, str):
            tags = [tags]

        # Per-dataset directory
        dataset_dir = os.path.join(OUTPUT_DIR, str(dataset_id))
        os.makedirs(dataset_dir, exist_ok=True)

        # Full data set
        data.to_csv(os.path.join(dataset_dir, "full_dataset.csv"), index=False)

        # Metadata
        metadata = {
            "id": dataset_id,
            "name": dataset.name,
            "tags": tags,
            "features": list(data.columns),
            "rows": len(data)
        }

        # Written as UTF-8 to match the code that later reads and rewrites
        # metadata.json with encoding="utf-8" and ensure_ascii=False.
        with open(os.path.join(dataset_dir, "metadata.json"), 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)

        return True, tags
    except Exception as e:
        logger.error(f"Ошибка обработки {dataset_id}: {str(e)}")
        return False, []

def download_datasets():
    """Download up to ``MAX_DATASETS`` tagged datasets and record their tags.

    Creates ``OUTPUT_DIR``, looks up tagged dataset IDs, processes each one
    with ``process_dataset`` (pausing one second between datasets) and writes
    the sorted set of all collected tags to ``TAG_FILE``. Returns ``None``;
    when no tagged dataset is found it logs an error and stops early.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    tagged_ids = get_dataset_ids_with_tags()
    if not tagged_ids:
        logger.error("Не найдено датасетов с тегами!")
        return

    if len(tagged_ids) < MAX_DATASETS:
        logger.warning(f"Найдено только {len(tagged_ids)} датасетов с тегами")

    batch = tagged_ids[:MAX_DATASETS]
    collected_tags = set()
    ok_count = 0

    for ds_id in tqdm(batch, desc="Обработка датасетов"):
        succeeded, ds_tags = process_dataset(ds_id)
        if succeeded:
            ok_count += 1
            collected_tags.update(ds_tags)
        time.sleep(1)

    with open(TAG_FILE, 'w') as tag_out:
        json.dump(sorted(collected_tags), tag_out, indent=2)

    logger.info(f"Успешно обработано: {ok_count}/{len(batch)}")
    logger.info(f"Уникальных тегов собрано: {len(collected_tags)}")

# Making dataset samples

In [None]:
# Directory that holds one sub-directory per downloaded dataset.
OUTPUT_DIR = "all_datasets"
# Only the first N columns of a CSV are read and used for row selection.
MAX_PROCESSING_COLUMNS = 50
# Frames larger than this are down-sampled before diverse-row selection.
MAX_CLUSTERING_ROWS = 1000  
# Number of rows stored in each sample file.
SAMPLE_ROWS = 5
# Upper bound on the number of rows kept when reading a CSV.
MAX_DATASET_SIZE = 100000  

In [None]:
def detect_encoding(file_path: str) -> str:
    """Guess the text encoding of a file from its first 50000 bytes.

    Returns the encoding reported by ``chardet`` when its confidence is above
    0.7; otherwise, or when detection raises, returns ``'utf-8'``.
    """
    try:
        with open(file_path, 'rb') as handle:
            guess = chardet.detect(handle.read(50000))
        if guess['confidence'] > 0.7:
            return guess['encoding']
        return 'utf-8'
    except Exception as e:
        logger.warning(f"Encoding detection error: {e}, using utf-8")
        return 'utf-8'

def read_csv_with_memory_limit(file_path: str, dataset_id: str = None) -> pd.DataFrame:
    """Read a CSV file with bounded width and length.

    Only the first ``MAX_PROCESSING_COLUMNS`` columns are loaded (as
    ``object`` dtype), and reading stops once more than ``MAX_DATASET_SIZE``
    rows have been collected; the result is then randomly sampled down to
    ``MAX_DATASET_SIZE`` rows.

    Args:
        file_path: Path to the CSV file.
        dataset_id: Accepted for API compatibility; not used by this function.

    Returns:
        The loaded frame, or an empty ``DataFrame`` when nothing could be
        read or any error occurred (the error is logged).
    """
    try:
        encoding = detect_encoding(file_path)
        logger.info(f"Detected encoding: {encoding} for {os.path.basename(file_path)}")

        # Let pandas parse the header so quoted names, embedded commas and a
        # BOM are handled; columns are then selected by position, not by name.
        header = pd.read_csv(file_path, nrows=0, encoding=encoding)
        n_columns = min(len(header.columns), MAX_PROCESSING_COLUMNS)
        columns_to_use = list(range(n_columns))

        chunks = []
        chunk_size = 10000
        total_rows = 0

        # The context manager closes the underlying file handle even when the
        # loop stops early.
        with pd.read_csv(
            file_path,
            usecols=columns_to_use,
            encoding=encoding,
            chunksize=chunk_size,
            dtype='object',
        ) as reader:
            for chunk in reader:
                chunks.append(chunk)
                total_rows += len(chunk)
                if total_rows > MAX_DATASET_SIZE:
                    logger.warning(f"Reached max dataset size ({MAX_DATASET_SIZE} rows)")
                    break

        if not chunks:
            logger.warning("No data read, returning empty DataFrame")
            return pd.DataFrame()

        df = pd.concat(chunks, ignore_index=True)
        if len(df) > MAX_DATASET_SIZE:
            df = df.sample(n=MAX_DATASET_SIZE, random_state=42)
            logger.info(f"Sampled {MAX_DATASET_SIZE} rows from large dataset")
        logger.info(f"Read dataset with {len(df)} rows and {len(df.columns)} columns")
        return df

    except Exception as e:
        logger.error(f"Error reading CSV: {e}")
        return pd.DataFrame()

## Sample of diverse rows

In [None]:
def calculate_entropy(column: pd.Series) -> float:
    """Return the Shannon entropy (base 2) of the values in *column*.

    Each distinct value contributes ``-p * log2(p)``, where ``p`` is its
    relative frequency. An empty input yields ``0.0``.
    """
    from collections import Counter
    from math import log2

    size = len(column)
    result = 0.0
    for occurrences in Counter(column).values():
        share = occurrences / size
        if share > 0:
            result -= share * log2(share)
    return result

 
def select_diverse_rows(df: pd.DataFrame, n_rows: int = 5) -> pd.DataFrame:
    """Greedily pick rows that maximise the summed per-column entropy.

    The frame is first down-sampled to ``MAX_CLUSTERING_ROWS`` rows and
    limited to the first ``MAX_PROCESSING_COLUMNS`` columns. Missing values
    are filled with the column median (numeric columns) or mode (other
    columns). Rows are then added one at a time, each time choosing the
    candidate that gives the selected set the highest total entropy.

    Args:
        df: Source frame.
        n_rows: Number of rows to select.

    Returns:
        The selected rows taken from the (possibly down-sampled) frame. *df*
        itself is returned when it is empty or has at most *n_rows* rows. On
        any error a random sample of *df* is returned instead.
    """
    if df.empty:
        logger.warning("Empty DataFrame provided for row selection")
        return df
    if len(df) <= n_rows:
        logger.info(f"DataFrame small ({len(df)} rows), returning all rows")
        return df

    try:
        if len(df) > MAX_CLUSTERING_ROWS:
            df_sampled = df.sample(n=MAX_CLUSTERING_ROWS, random_state=42)
            logger.info(f"Downsampled to {MAX_CLUSTERING_ROWS} rows for entropy selection")
        else:
            df_sampled = df.copy()

        if len(df_sampled.columns) > MAX_PROCESSING_COLUMNS:
            columns_to_use = df_sampled.columns[:MAX_PROCESSING_COLUMNS]
            # Explicit copy so the fill step below writes to an independent frame.
            df_processed = df_sampled[columns_to_use].copy()
            logger.info(f"Limited to {len(columns_to_use)} columns for selection")
        else:
            df_processed = df_sampled.copy()

        # Fill missing values. Assigning the filled column back replaces the
        # chained `df[col].fillna(..., inplace=True)`, which is not guaranteed
        # to modify the frame.
        for col in df_processed.columns:
            series = df_processed[col]
            if pd.api.types.is_numeric_dtype(series):
                fill_value = series.median()
                if pd.isna(fill_value):
                    fill_value = 0
            else:
                modes = series.mode()
                fill_value = modes[0] if not modes.empty else "missing"
            df_processed[col] = series.fillna(fill_value)

        # Seed row: the entropy of a single row is 0 in every column, so all
        # rows tie and the first one wins. Take it directly instead of scoring
        # every row.
        # NOTE(review): assumes index labels are unique — confirm for callers
        # that do not build the frame with ignore_index=True.
        first_index = next(iter(df_processed.index))
        selected_indices = [first_index]
        remaining_indices = set(df_processed.index)
        remaining_indices.remove(first_index)

        # Greedy growth: add the candidate that maximises total entropy.
        for _ in range(1, n_rows):
            best_entropy = -1
            best_candidate = None

            for candidate in remaining_indices:
                trial = selected_indices + [candidate]
                total_entropy = sum(
                    calculate_entropy(df_processed.loc[trial, col])
                    for col in df_processed.columns
                )
                if total_entropy > best_entropy:
                    best_entropy = total_entropy
                    best_candidate = candidate

            if best_candidate is None:
                # No candidates left.
                break
            selected_indices.append(best_candidate)
            remaining_indices.remove(best_candidate)

        logger.info(f"Selected {len(selected_indices)} diverse rows")
        return df_sampled.loc[selected_indices]

    except Exception as e:
        logger.error(f"Entropy-based selection error: {e}", exc_info=True)
        return df.sample(n=min(n_rows, len(df)), random_state=42)

def save_diverse_rows():
    """Write a ``diverse_rows.csv`` sample next to every stored dataset.

    A sub-directory of ``OUTPUT_DIR`` counts as a valid dataset when it holds
    both ``metadata.json`` and ``full_dataset.csv``. For each one the CSV is
    read, ``SAMPLE_ROWS`` diverse rows are selected and saved, and the
    metadata file gains a ``diverse_rows_file`` entry. Failures are logged
    per dataset and do not stop the run.

    NOTE(review): ``RESULTS_DIR`` is not defined in the visible part of this
    file — confirm it exists before this function is called.
    """
    os.makedirs(RESULTS_DIR, exist_ok=True)

    # Discover dataset directories that contain both required files.
    valid_datasets = []
    for entry in os.listdir(OUTPUT_DIR):
        entry_dir = os.path.join(OUTPUT_DIR, entry)
        if not os.path.isdir(entry_dir):
            continue
        has_meta = os.path.exists(os.path.join(entry_dir, "metadata.json"))
        if has_meta and os.path.exists(os.path.join(entry_dir, "full_dataset.csv")):
            valid_datasets.append(entry)
    logger.info(f"Found {len(valid_datasets)} valid datasets")

    for dataset_id in tqdm(valid_datasets, desc="Processing datasets", unit="dataset"):
        try:
            base_dir = os.path.join(OUTPUT_DIR, dataset_id)
            data_path = os.path.join(base_dir, "full_dataset.csv")
            meta_path = os.path.join(base_dir, "metadata.json")

            # Load the metadata first.
            with open(meta_path, "r", encoding="utf-8") as meta_in:
                metadata = json.load(meta_in)

            # Load the (size-limited) data set.
            frame = read_csv_with_memory_limit(data_path, dataset_id=dataset_id)
            if frame.empty:
                logger.warning(f"Empty DataFrame for {dataset_id}, skipping")
                continue

            # Pick the most diverse rows.
            picked = select_diverse_rows(frame, n_rows=SAMPLE_ROWS)
            if picked.empty:
                logger.warning(f"No diverse rows selected for {dataset_id}, skipping")
                continue

            # Persist the sample.
            picked.to_csv(os.path.join(base_dir, "diverse_rows.csv"), index=False)
            logger.info(f"Saved {len(picked)} diverse rows for dataset {dataset_id}")

            # Record the sample file in the metadata.
            metadata["diverse_rows_file"] = "diverse_rows.csv"
            with open(meta_path, "w", encoding="utf-8") as meta_out:
                json.dump(metadata, meta_out, indent=2, ensure_ascii=False)

            # Release memory before the next dataset.
            del frame, picked
            gc.collect()

        except Exception as e:
            logger.error(f"Error processing dataset {dataset_id}: {e}", exc_info=True)
            continue

    logger.info(f"Completed processing {len(valid_datasets)} datasets")

## Sample of random rows

In [None]:
def select_random_rows(df: pd.DataFrame, n_rows: int = 5) -> pd.DataFrame:
    """Return up to *n_rows* randomly sampled rows of *df* (fixed seed 42).

    An empty frame is returned unchanged.
    """
    if not df.empty:
        sample_size = n_rows if n_rows < len(df) else len(df)
        return df.sample(n=sample_size, random_state=42)
    return df

def save_random_rows():
    """Write a ``random_rows.csv`` sample next to every stored dataset.

    A sub-directory of ``OUTPUT_DIR`` counts as a valid dataset when it holds
    both ``metadata.json`` and ``full_dataset.csv``. For each one the CSV is
    read, ``SAMPLE_ROWS`` random rows are saved, and the metadata file gains
    a ``random_rows_file`` entry. Failures are logged per dataset and do not
    stop the run.

    NOTE(review): ``RESULTS_DIR`` is not defined in the visible part of this
    file — confirm it exists before this function is called.
    """
    os.makedirs(RESULTS_DIR, exist_ok=True)

    # Discover dataset directories that contain both required files.
    valid_datasets = []
    for entry in os.listdir(OUTPUT_DIR):
        entry_dir = os.path.join(OUTPUT_DIR, entry)
        if not os.path.isdir(entry_dir):
            continue
        has_meta = os.path.exists(os.path.join(entry_dir, "metadata.json"))
        if has_meta and os.path.exists(os.path.join(entry_dir, "full_dataset.csv")):
            valid_datasets.append(entry)
    logger.info(f"Found {len(valid_datasets)} valid datasets")

    for dataset_id in tqdm(valid_datasets, desc="Processing datasets"):
        try:
            base_dir = os.path.join(OUTPUT_DIR, dataset_id)
            data_path = os.path.join(base_dir, "full_dataset.csv")
            meta_path = os.path.join(base_dir, "metadata.json")

            with open(meta_path, "r", encoding="utf-8") as meta_in:
                metadata = json.load(meta_in)

            frame = read_csv_with_memory_limit(data_path)
            if frame.empty:
                logger.warning(f"Empty DataFrame for {dataset_id}, skipping")
                continue

            # Draw the random sample.
            picked = select_random_rows(frame, n_rows=SAMPLE_ROWS)
            if picked.empty:
                logger.warning(f"No rows selected for {dataset_id}, skipping")
                continue

            picked.to_csv(os.path.join(base_dir, "random_rows.csv"), index=False)
            logger.info(f"Saved {len(picked)} random rows for {dataset_id}")

            metadata["random_rows_file"] = "random_rows.csv"
            with open(meta_path, "w", encoding="utf-8") as meta_out:
                json.dump(metadata, meta_out, indent=2, ensure_ascii=False)

            # Release memory before the next dataset.
            del frame, picked
            gc.collect()

        except Exception as e:
            logger.error(f"Error processing {dataset_id}: {e}")
            continue

    logger.info(f"Completed processing {len(valid_datasets)} datasets")

# Creating domains and mapping datasets

In [None]:
# Directory with the previously downloaded datasets.
INPUT_DIR = "all_datasets"
# NOTE(review): this rebinds the module-level OUTPUT_DIR. Functions defined
# earlier read OUTPUT_DIR at call time, so once this cell has run they will
# use "clustering_results" as well — confirm this is intended.
OUTPUT_DIR = "clustering_results"
# File name for the clustering-to-domain mapping.
DOMAIN_MAPPING_FILE = "clustering_domain_mapping.json"

In [None]:
# Registry of clustering algorithms: each entry names the estimator class and
# the keyword arguments it is constructed with.
CLUSTERING_METHODS = {
    "hdbscan": {
        "class": HDBSCAN,
        "params": {
            "min_cluster_size": 3,
            "metric": "cosine",
            "cluster_selection_method": "leaf"
        }
    },
    "kmeans": {
        "class": KMeans,
        "params": {
            "n_clusters": 8,
            "random_state": 42
        }
    },
    "agglomerative": {
        "class": AgglomerativeClustering,
        "params": {
            # No fixed cluster count; a distance threshold is given instead.
            "n_clusters": None,
            "distance_threshold": 0.6,
            "linkage": "average",
            "metric": "cosine"
        }
    },
    "dbscan": {
        "class": DBSCAN,
        "params": {
            "eps": 0.5,
            "min_samples": 3,
            "metric": "cosine"
        }
    }
}

# Registry of embedding back-ends and their settings. "dim" records the
# embedding width for each method.
EMBEDDING_METHODS = {
    "sentence_transformers": {
        "model": "all-mpnet-base-v2",
        "dim": 768
    },
    "tfidf": {
        "max_features": 500,
        "dim": 500
    },
    "gensim_word2vec": {
        "vector_size": 100,
        "window": 5,
        "min_count": 1,
        "workers": 4,
        # Kept equal to vector_size above.
        "dim": 100
    }
}

# Regular expressions for tags that should be discarded. Unanchored patterns
# such as 'test' match anywhere inside a tag.
# NOTE(review): normalize_tag replaces underscores with spaces before these
# patterns are applied, so r'_' looks like it can never match — confirm.
STOP_PATTERNS = {
    r'^study_?\d+', r'test', r'temp', r'demo', r'^kaggle$', r'^uci$',
    r'example', r'azure', r'tutorial', r'^data$',
    r'_', r'\d+$', r'mythbusting'
}

def normalize_tag(tag: str, apply_filtering: bool = True) -> str:
    """Normalize a tag to a lower-case, space-separated form.

    Steps: lower-case and strip the tag, drop every character that is not a
    word character, whitespace or hyphen, collapse runs of whitespace and
    underscores into a single space, then strip again so that leading or
    trailing separators (e.g. ``"_health"``) do not leave stray spaces.

    Args:
        tag: Raw tag value.
        apply_filtering: When true, tags shorter than three characters or
            matching any pattern in ``STOP_PATTERNS`` are rejected.

    Returns:
        The normalized tag, or ``None`` when the input is not a non-blank
        string, when nothing is left after cleaning, or when the tag is
        filtered out.
    """
    if not isinstance(tag, str) or not tag.strip():
        return None

    cleaned = re.sub(r'[^\w\s-]', '', tag.lower().strip())
    # Strip after collapsing: an edge underscore turns into an edge space.
    normalized = re.sub(r'[\s_]+', ' ', cleaned).strip()
    if not normalized:
        return None

    if apply_filtering:
        if len(normalized) < 3 or any(re.search(p, normalized) for p in STOP_PATTERNS):
            return None
    return normalized

def filter_tags(tags: List[str]) -> List[str]:
    """Normalize every tag, drop the rejected ones and remove duplicates."""
    kept = {cleaned for cleaned in map(normalize_tag, tags) if cleaned}
    return list(kept)

def load_and_filter_tags():
    """Read the tag list from ``TAG_FILE`` and return it filtered and deduplicated."""
    with open(TAG_FILE, 'r') as tag_in:
        raw_tags = json.load(tag_in)
    return filter_tags(raw_tags)

def cache_datasets():
    """Fetch the OpenML dataset listing, store it as JSON and return it.

    The listing is written to ``datasets_cache.json`` in records orientation.
    """
    cache_file = "datasets_cache.json"
    listing = openml.datasets.list_datasets(output_format="dataframe")
    with open(cache_file, 'w', encoding='utf-8') as cache_out:
        listing.to_json(cache_out, orient="records")
    logger.info(f"Datasets cached to {cache_file}")
    return listing

def load_cached_datasets():
    """Return the dataset listing from ``datasets_cache.json``.

    When the cache file does not exist yet, the listing is fetched and cached
    through ``cache_datasets``.
    """
    cache_file = "datasets_cache.json"
    if not os.path.exists(cache_file):
        return cache_datasets()
    with open(cache_file, 'r', encoding='utf-8') as cache_in:
        return pd.read_json(cache_in, orient="records")

def load_cached_datasets_by_tag(tag: str) -> List[Dict]:
    """Return ``[{'did': ...}, ...]`` records for datasets carrying *tag*.

    Results are cached per tag in ``datasets_cache_<tag>.json``; OpenML is
    queried only when that file does not exist.
    """
    cache_file = f"datasets_cache_{tag}.json"

    if os.path.exists(cache_file):
        with open(cache_file, 'r', encoding='utf-8') as cache_in:
            return json.load(cache_in)

    logger.info(f"Fetching datasets for tag '{tag}' from OpenML...")
    listing = openml.datasets.list_datasets(tag=tag, output_format='dataframe')
    records = listing[['did']].to_dict(orient='records')

    with open(cache_file, 'w', encoding='utf-8') as cache_out:
        json.dump(records, cache_out, indent=2, ensure_ascii=False)
    logger.info(f"Datasets for tag '{tag}' cached to {cache_file}")
    return records

In [None]:
# Дополнительные функции для кластеризации и формирования доменов
def enrich_small_domains(
    domain_distribution: Dict[str, int],
    domain_tags: Dict[str, List[str]],
    min_datasets: int = 5,
    max_new_datasets_per_domain: int = 10,
    output_dir: str = OUTPUT_DIR_DATASETS
) -> None:
    """
    Find and add datasets for under-populated domains, filtered by tag.

    For every domain with fewer than *min_datasets* datasets, candidate
    datasets are looked up per domain tag, ranked by how many normalized tags
    they share with the domain, and the best ones are downloaded. Each added
    dataset's metadata.json gets ``domain`` and ``cleaned_tags`` entries.

    Args:
        domain_distribution: Dataset count per domain, {domain: count}.
        domain_tags: Tags of each domain, {domain: [tags]}.
        min_datasets: Domains with fewer datasets than this are enriched.
        max_new_datasets_per_domain: Upper bound on datasets added per domain.
        output_dir: Directory whose entries are treated as already processed
            dataset IDs and where each metadata.json is looked up.
    """
    logger = logging.getLogger(__name__)
    logger.info("Starting enrichment of small domains")

    small_domains = {}
    for name, size in domain_distribution.items():
        if size < min_datasets:
            small_domains[name] = size
    logger.info(f"Found {len(small_domains)} small domains: {small_domains}")

    if not small_domains:
        logger.info("No small domains to enrich")
        return

    processed_ids = set(os.listdir(output_dir))
    logger.info(f"Found {len(processed_ids)} already processed datasets")

    def _normalized_set(raw_tags):
        # Normalize without stop-pattern filtering and drop empty results.
        candidates = (normalize_tag(raw, apply_filtering=False) for raw in raw_tags)
        return {tag for tag in candidates if tag}

    for domain, current_count in small_domains.items():
        logger.info(f"Enriching domain '{domain}' (current datasets: {current_count})")
        target_tags = _normalized_set(domain_tags.get(domain, []))
        if not target_tags:
            logger.warning(f"No tags found for domain '{domain}', skipping")
            continue

        # Collect candidate dataset IDs for each of the domain's tags.
        candidate_ids = []
        seen_ids = set()
        for tag in target_tags:
            try:
                # Per-tag listings come from the on-disk cache when available.
                for record in load_cached_datasets_by_tag(tag):
                    did = record['did']
                    if str(did) in processed_ids or did in seen_ids:
                        continue
                    seen_ids.add(did)
                    candidate_ids.append(did)
                time.sleep(0.05)
            except Exception as e:
                logger.error(f"Error fetching datasets for tag '{tag}': {str(e)}")
                continue

        logger.info(f"Found {len(candidate_ids)} potential datasets for '{domain}'")

        # Rank candidates by the number of tags shared with the domain.
        ranked = []
        for did in tqdm(candidate_ids, desc=f"Ranking datasets for domain '{domain}'"):
            try:
                dataset = openml.datasets.get_dataset(did, download_data=False)
                raw_tags = getattr(dataset, 'tag', []) or []
                if not isinstance(raw_tags, (list, tuple)):
                    logger.warning(f"Dataset {did} has invalid tags: {raw_tags}, skipping")
                    continue
                overlap = len(_normalized_set(raw_tags) & target_tags)
                if overlap > 0:
                    ranked.append((did, overlap))
                time.sleep(0.05)
            except Exception as e:
                logger.error(f"Error checking dataset {did}: {str(e)}")
                continue

        # Highest overlap first; keep at most max_new_datasets_per_domain.
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        selected = [did for did, _ in ranked[:max_new_datasets_per_domain]]
        logger.info(f"Selected {len(selected)} datasets for '{domain}'")

        added = 0
        for dataset_id in tqdm(selected, desc=f"Processing datasets for '{domain}'"):
            try:
                succeeded, ds_tags = process_dataset(dataset_id)
                if succeeded:
                    added += 1
                    processed_ids.add(str(dataset_id))
                    meta_path = os.path.join(output_dir, str(dataset_id), "metadata.json")
                    # Update the metadata in place: read, rewind, rewrite, truncate.
                    with open(meta_path, 'r+', encoding='utf-8') as meta_file:
                        metadata = json.load(meta_file)
                        metadata['domain'] = domain
                        metadata['cleaned_tags'] = filter_tags(ds_tags)
                        meta_file.seek(0)
                        json.dump(metadata, meta_file, indent=2, ensure_ascii=False)
                        meta_file.truncate()
                    logger.info(f"Added dataset {dataset_id} to domain '{domain}'")
                time.sleep(0.5)
            except Exception as e:
                logger.error(f"Failed to process dataset {dataset_id}: {str(e)}")
                continue

        logger.info(f"Added {added} new datasets to domain '{domain}'")

    logger.info("Enrichment completed")

def merge_small_clusters(clusters: Dict[int, List[str]], embeddings: np.ndarray, min_cluster_size: int, merge_threshold: float) -> Dict[int, List[str]]:
    """Merge small clusters into the closest large cluster by cosine distance.

    A cluster is "small" when it has fewer than *min_cluster_size* tags. Each
    small cluster is merged into the large cluster whose centroid is nearest,
    provided that distance does not exceed *merge_threshold*; otherwise it is
    kept as its own cluster.

    Args:
        clusters: Mapping cluster id -> list of tags. Not modified.
        embeddings: One embedding row per tag.
        min_cluster_size: Size below which a cluster is considered small.
        merge_threshold: Maximum cosine distance allowed for a merge.

    Returns:
        When merging applies, a new mapping with ids renumbered from 0 and
        duplicate tags removed (first occurrence order kept). When there are
        no small or no large clusters, *clusters* itself is returned.

    NOTE(review): embedding rows are located through the order returned by
    ``load_and_filter_tags()``; this presumes *embeddings* was built from that
    same list in the same order — confirm against the caller.
    """
    logger.info(f"Initial number of clusters: {len(clusters)}")
    logger.info(f"Cluster sizes before merging: { {k: len(v) for k, v in clusters.items()} }")

    tag_to_index = {tag: idx for idx, tag in enumerate(load_and_filter_tags())}
    cluster_centroids = {}
    for cluster_id, tags in clusters.items():
        tag_indices = [tag_to_index[tag] for tag in tags if tag in tag_to_index]
        if tag_indices:
            cluster_centroids[cluster_id] = np.mean(embeddings[tag_indices], axis=0)
        else:
            # No known tag: a zero centroid is treated as maximally distant below.
            cluster_centroids[cluster_id] = np.zeros(embeddings.shape[1])

    small_clusters = {k: v for k, v in clusters.items() if len(v) < min_cluster_size}
    large_clusters = {k: v for k, v in clusters.items() if len(v) >= min_cluster_size}
    logger.info(f"Small clusters (<{min_cluster_size} tags): {len(small_clusters)}")
    logger.info(f"Large clusters (>= {min_cluster_size} tags): {len(large_clusters)}")

    if not small_clusters or not large_clusters:
        logger.info("No merging needed: either no small clusters or no large clusters.")
        return clusters

    def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
        # An all-zero vector has no direction; report the distance as 1.0.
        if np.all(a == 0) or np.all(b == 0):
            return 1.0
        return 1 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    # Copy the tag lists: extending them in place would otherwise modify the
    # lists held by the caller's `clusters` mapping.
    merged_clusters = {cluster_id: list(tags) for cluster_id, tags in large_clusters.items()}
    merged_count = 0
    for small_id, small_tags in small_clusters.items():
        min_distance = float('inf')
        nearest_cluster_id = None
        for large_id in large_clusters:
            distance = cosine_distance(cluster_centroids[small_id], cluster_centroids[large_id])
            if distance < min_distance and distance <= merge_threshold:
                min_distance = distance
                nearest_cluster_id = large_id
        if nearest_cluster_id is not None:
            merged_clusters[nearest_cluster_id].extend(small_tags)
            merged_count += 1
        else:
            merged_clusters[small_id] = list(small_tags)
    logger.info(f"Merged {merged_count} small clusters into large ones.")

    # Renumber from 0 and deduplicate, keeping first-occurrence order so the
    # result is reproducible between runs.
    final_clusters = {
        new_id: list(dict.fromkeys(tags))
        for new_id, tags in enumerate(merged_clusters.values())
    }
    logger.info(f"Final number of clusters after merging: {len(final_clusters)}")

    return final_clusters

In [None]:
# Classes for clustering
class EmbeddingGenerator:
    """Builds vector representations of tags with a selectable back-end."""

    def __init__(self, method: str):
        self.method = method
        # Populated by the tf-idf and word2vec back-ends respectively.
        self.vectorizer = None
        self.w2v_model = None

    def generate(self, tags: List[str]) -> np.ndarray:
        """Embed *tags* with the configured method.

        Raises:
            ValueError: If the configured method is not supported.
        """
        backends = {
            "sentence_transformers": self._sentence_transformers,
            "tfidf": self._tfidf,
            "gensim_word2vec": self._gensim_word2vec,
        }
        backend = backends.get(self.method)
        if backend is None:
            raise ValueError(f"Unknown method: {self.method}")
        return backend(tags)

    def _sentence_transformers(self, tags):
        """Encode tags with the configured SentenceTransformer model."""
        model_name = EMBEDDING_METHODS[self.method]["model"]
        encoder = SentenceTransformer(model_name)
        encoded = encoder.encode(tags, convert_to_tensor=True)
        return encoded.cpu().numpy()

    def _tfidf(self, tags):
        """Fit a whitespace-tokenized TF-IDF vectorizer and return a dense matrix."""
        self.vectorizer = TfidfVectorizer(
            max_features=EMBEDDING_METHODS[self.method]["max_features"],
            tokenizer=lambda text: text.split(),
            token_pattern=None
        )
        matrix = self.vectorizer.fit_transform(tags)
        return matrix.toarray()

    def _gensim_word2vec(self, tags):
        """Train Word2Vec on the tags and average the word vectors of each tag."""
        settings = EMBEDDING_METHODS["gensim_word2vec"]

        self.w2v_model = Word2Vec(
            sentences=[tag.split() for tag in tags],
            vector_size=settings["vector_size"],
            window=settings["window"],
            min_count=settings["min_count"],
            workers=settings["workers"]
        )

        rows = []
        for tag in tags:
            known = [
                self.w2v_model.wv[word]
                for word in tag.split()
                if word in self.w2v_model.wv
            ]
            if known:
                rows.append(np.mean(known, axis=0))
            else:
                # No word of this tag is in the vocabulary: use a zero vector.
                rows.append(np.zeros(settings["dim"]))
        return np.array(rows)

class ClusterAnalyzer:
    """Embeds a list of tags and clusters them."""

    def __init__(self, tags: List[str], embedding_method: str):
        self.tags = tags
        self.embedding_method = embedding_method
        self.embeddings = None
        self.results = {}

    def generate_embeddings(self):
        """Compute and store embeddings for ``self.tags``."""
        self.embeddings = EmbeddingGenerator(self.embedding_method).generate(self.tags)

    @staticmethod
    def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
        """Return the cosine distance between two vectors.

        If either vector is all zeros the distance is defined as 1.0.
        """
        if np.all(a == 0) or np.all(b == 0):
            return 1.0
        similarity = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
        return 1 - similarity

    def cluster_tags(self, method: str, min_cluster_size: int, merge_threshold: float) -> Dict:
        """Cluster the tags with *method* and post-process the result.

        Labels with fewer than two members, and the noise label -1, are
        reassigned to the nearest centroid of a valid cluster. If no valid
        cluster exists, all tags are put into cluster 0. Small clusters are
        then merged via ``merge_small_clusters``.

        Returns:
            A dict with keys ``"clusters"``, ``"metrics"`` and ``"params"``.

        Raises:
            ValueError: For k-means when there are fewer tags than clusters.
        """
        config = CLUSTERING_METHODS[method]
        params = config["params"]

        if method == "kmeans" and len(self.tags) < params["n_clusters"]:
            raise ValueError(f"Not enough samples ({len(self.tags)}) for {method}")

        estimator = config["class"](**params)
        labels = estimator.fit_predict(self.embeddings)

        # Valid clusters: non-noise labels with at least two members.
        valid_labels = set()
        for label in set(labels):
            if label != -1 and np.sum(labels == label) >= 2:
                valid_labels.add(label)

        if valid_labels:
            centroids = {}
            for label in valid_labels:
                member_rows = np.where(labels == label)[0]
                centroids[label] = np.mean(self.embeddings[member_rows], axis=0)

            for idx, label in enumerate(labels):
                # Noise (-1) is never in valid_labels, so this also covers it.
                if label in valid_labels:
                    continue

                vector = self.embeddings[idx]
                best_distance = float('inf')
                best_label = -1
                for candidate, centroid in centroids.items():
                    distance = self.cosine_distance(vector, centroid)
                    if distance < best_distance:
                        best_distance = distance
                        best_label = candidate

                if best_label == -1:
                    # No centroid compared as closer; fall back to any valid label.
                    best_label = next(iter(valid_labels))
                labels[idx] = best_label
        else:
            # No usable cluster: everything goes into a single cluster 0.
            labels = np.zeros(len(labels), dtype=labels.dtype)

        clusters = {}
        for idx, label in enumerate(labels):
            clusters.setdefault(int(label), []).append(self.tags[idx])

        logger.info(f"Cluster sizes before merging: { {k: len(v) for k, v in clusters.items()} }")
        clusters = merge_small_clusters(clusters, self.embeddings, min_cluster_size, merge_threshold)

        return {
            "clusters": clusters,
            "metrics": self._calculate_metrics(labels),
            "params": config["params"]
        }

    def _calculate_metrics(self, labels):
        """Compute clustering quality metrics for *labels*.

        Silhouette and Davies-Bouldin scores are added only when more than
        one distinct label is present; ``n_clusters`` excludes the noise
        label -1.
        """
        distinct = len(set(labels))
        metrics = {}

        if distinct > 1:
            try:
                metrics["silhouette"] = silhouette_score(self.embeddings, labels)
                metrics["davies_bouldin"] = davies_bouldin_score(self.embeddings, labels)
            except Exception as e:
                logger.error(f"Metric calculation failed: {str(e)}")

        noise_present = 1 if -1 in labels else 0
        metrics["n_clusters"] = distinct - noise_present
        return metrics

In [None]:
# Функции для разметки датасетов и сохранения результатов
def manual_annotate_clusters(clusters: Dict[int, List[str]], callback: Callable[[Dict], None]):
    """Interactively label clusters with domain names using notebook widgets.

    Clusters are shown one at a time, largest first. For each one the user
    either types a domain name and confirms it, or skips the cluster (an empty
    name or the literal ``skip`` also counts as skipping). After the last
    cluster the inputs are disabled and ``callback`` receives the result.

    Args:
        clusters: Mapping of cluster id to the tags in that cluster.
        callback: Called once with ``{domain name: [unique tags]}`` when every
            cluster has been handled (immediately if ``clusters`` is empty).

    Notes:
        Giving the same domain name to several clusters merges their tags into
        one domain instead of discarding the tags saved earlier.
    """
    annotated_domains = {}
    sorted_clusters = sorted(clusters.items(), key=lambda x: len(x[1]), reverse=True)

    output_area = widgets.Output()
    result_container = widgets.Output()
    display(output_area, result_container)

    domain_input = widgets.Text(
        value='',
        placeholder='Введите имя домена',
        description='Имя домена:',
        layout=widgets.Layout(width='500px')
    )

    submit_button = widgets.Button(description="Подтвердить", button_style='success')
    skip_button = widgets.Button(description="Пропустить", button_style='')
    progress = widgets.IntProgress(
        value=0,
        min=0,
        max=len(sorted_clusters),
        description='Прогресс:',
        bar_style='info',
        style={'bar_color': '#4CAF50'}
    )

    buttons_box = widgets.HBox([domain_input, submit_button, skip_button])
    display(buttons_box, progress)

    # Index of the cluster currently shown to the user.
    position = 0

    def unique_in_order(tags):
        """Drop duplicate tags while keeping first-seen order (stable display)."""
        return list(dict.fromkeys(tags))

    def show_cluster():
        """Render the current cluster, or finish the session when none are left."""
        with output_area:
            output_area.clear_output()
            if position < len(sorted_clusters):
                cluster_id, tags = sorted_clusters[position]
                unique_tags = unique_in_order(tags)
                print(f"Кластер #{cluster_id} ({len(unique_tags)} уникальных тегов):")
                print("-"*50)
                for i, tag in enumerate(unique_tags[:10]):
                    print(f"{i+1}. {tag}")
                if len(unique_tags) > 10:
                    print(f"... и еще {len(unique_tags)-10} тегов")
            else:
                print("Все кластеры обработаны!")
                submit_button.disabled = True
                skip_button.disabled = True
                domain_input.disabled = True
                callback(annotated_domains)

    def advance(message):
        """Report the outcome for the current cluster and move on to the next one."""
        nonlocal position
        with result_container:
            result_container.clear_output()
            print(message)

        domain_input.value = ''
        position += 1
        progress.value = position
        show_cluster()

    def handle_submit(b):
        # Ignore stray clicks that arrive after the last cluster was handled.
        if position >= len(sorted_clusters):
            return

        domain = domain_input.value.strip()
        cluster_id, tags = sorted_clusters[position]

        if domain and domain.lower() != 'skip':
            # Merge into an existing domain rather than overwriting it, so a
            # domain name reused for several clusters keeps all of their tags.
            domain_tags = annotated_domains.setdefault(domain, [])
            already_present = set(domain_tags)
            domain_tags.extend(
                tag for tag in unique_in_order(tags) if tag not in already_present
            )
            advance(f"Кластер #{cluster_id} сохранен как: {domain}")
        else:
            advance(f"Кластер #{cluster_id} пропущен")

    def handle_skip(b):
        if position >= len(sorted_clusters):
            return

        cluster_id, _ = sorted_clusters[position]
        advance(f"Кластер #{cluster_id} пропущен")

    submit_button.on_click(handle_submit)
    skip_button.on_click(handle_skip)

    show_cluster()

def assign_unused_domains(final_domains: Dict, input_dir: str = INPUT_DIR):
    """Make sure every annotated domain ends up on at least one dataset.

    Reads ``metadata.json`` of every dataset under ``input_dir``, finds the
    domains from ``final_domains`` that no dataset currently carries, and
    rewrites the ``domain`` field of one dataset per unused domain. The dataset
    is chosen by the largest overlap between the domain's normalized tags and
    the dataset's ``cleaned_tags``; with no overlap a random candidate is used.

    A dataset is not a candidate if it was already reassigned during this call,
    or if it is the only dataset of another annotated domain (taking it would
    just make that other domain unused instead).

    Args:
        final_domains: Mapping of domain name to its list of tags.
        input_dir: Directory containing one sub-directory per dataset.
    """
    logger.info("Checking for unused domains")

    dataset_info = {}
    used_domains = set()
    domain_sizes = defaultdict(int)  # how many datasets currently carry each domain

    for dataset_id in os.listdir(input_dir):
        meta_path = os.path.join(input_dir, dataset_id, "metadata.json")
        if not os.path.exists(meta_path):
            logger.warning(f"Metadata file not found for dataset {dataset_id}")
            continue

        try:
            with open(meta_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
            tags = set(metadata.get('cleaned_tags', []))
            domain = metadata.get('domain', 'Unknown')
            used_domains.add(domain)
            domain_sizes[domain] += 1
            dataset_info[dataset_id] = {'tags': tags, 'domain': domain}
        except Exception as e:
            logger.error(f"Failed to read metadata for dataset {dataset_id}: {str(e)}")
            continue

    all_domains = set(final_domains.keys())
    unused_domains = all_domains - used_domains
    logger.info(f"Found {len(unused_domains)} unused domains: {unused_domains}")

    if not unused_domains:
        logger.info("All domains are used")
        return

    if not dataset_info:
        # Nothing to assign to; random.choice on an empty list would raise.
        logger.warning("No datasets with readable metadata; cannot assign unused domains")
        return

    assignment_report = []
    reassigned = set()

    # Sorted so the outcome does not depend on set iteration order.
    for domain in sorted(unused_domains):
        domain_tags = set()
        for raw_tag in final_domains[domain]:
            normalized = normalize_tag(raw_tag, apply_filtering=False)
            if normalized:
                domain_tags.add(normalized)
        logger.info(f"Processing unused domain '{domain}' with tags: {domain_tags}")

        candidates = []
        for dataset_id, info in dataset_info.items():
            if dataset_id in reassigned:
                continue
            current = info['domain']
            if current in all_domains and domain_sizes[current] <= 1:
                # Last dataset of another annotated domain: leave it alone.
                continue
            candidates.append(dataset_id)

        if not candidates:
            logger.warning(
                f"No dataset can take domain '{domain}' without leaving another domain unused"
            )
            continue

        best_dataset = None
        max_overlap = 0

        for dataset_id in candidates:
            overlap = len(domain_tags & dataset_info[dataset_id]['tags'])
            logger.info(f"Unused domain '{domain}' vs Dataset {dataset_id}: overlap = {overlap}")
            if overlap > max_overlap:
                max_overlap = overlap
                best_dataset = dataset_id

        if best_dataset is None:
            best_dataset = random.choice(candidates)
            logger.info(f"No overlap for domain '{domain}', assigning to random dataset {best_dataset}")

        try:
            meta_path = os.path.join(input_dir, best_dataset, "metadata.json")
            with open(meta_path, 'r+', encoding='utf-8') as f:
                metadata = json.load(f)
                previous_domain = metadata.get('domain', 'None')
                metadata['domain'] = domain
                # Serialize first: a failure here must not leave a half-written file.
                payload = json.dumps(metadata, indent=2, ensure_ascii=False)
                f.seek(0)
                f.write(payload)
                f.truncate()
        except Exception as e:
            logger.error(f"Failed to assign domain '{domain}' to dataset {best_dataset}: {str(e)}")
            continue

        logger.info(f"Assigned unused domain '{domain}' to dataset {best_dataset} (overlap: {max_overlap}, previous: '{previous_domain}')")

        # Keep the in-memory view in sync for the remaining unused domains.
        domain_sizes[dataset_info[best_dataset]['domain']] -= 1
        domain_sizes[domain] += 1
        dataset_info[best_dataset]['domain'] = domain
        reassigned.add(best_dataset)

        assignment_report.append({
            'domain': domain,
            'dataset': best_dataset,
            'overlap': max_overlap
        })

    if assignment_report:
        logger.info("Assignments for unused domains:")
        for assignment in assignment_report:
            logger.info(f"- Domain '{assignment['domain']}' assigned to dataset '{assignment['dataset']}' (overlap: {assignment['overlap']})")

def update_metadata(final_domains: Dict):
    """Assign a domain to every dataset based on the annotated domain tags.

    For each dataset under ``INPUT_DIR`` the raw ``tags`` from its
    ``metadata.json`` are passed through ``filter_tags`` and compared with the
    normalized tags of every domain. The domain with the largest overlap is
    written to ``domain`` (``"Other"`` when nothing overlaps; on ties the first
    domain in ``final_domains`` order wins) and the filtered tags are stored
    as ``cleaned_tags``. Afterwards ``assign_unused_domains`` is called.

    Args:
        final_domains: Mapping of domain name to its list of tags.
    """
    logger.info(f"Starting metadata update with {len(final_domains)} domains: {list(final_domains.keys())}")

    # Normalize each domain's tags once and keep them as sets for fast overlap checks.
    normalized_domains = {}
    for domain, tags in final_domains.items():
        unique_tags = set()
        for t in tags:
            normalized = normalize_tag(t, apply_filtering=False)
            if normalized:
                unique_tags.add(normalized)
        normalized_domains[domain] = unique_tags
        logger.info(f"Domain '{domain}': {len(unique_tags)} unique tags: {unique_tags}")

    updated_datasets = 0
    for dataset_id in tqdm(os.listdir(INPUT_DIR), desc="Updating Metadata"):
        meta_path = os.path.join(INPUT_DIR, dataset_id, "metadata.json")
        if not os.path.exists(meta_path):
            logger.warning(f"Metadata file not found for dataset {dataset_id}")
            continue

        try:
            with open(meta_path, 'r+', encoding='utf-8') as f:
                metadata = json.load(f)

                raw_tags = metadata.get('tags', [])
                filtered_tags = filter_tags(raw_tags)
                logger.info(f"Dataset {dataset_id}: {len(raw_tags)} raw tags: {raw_tags}")
                logger.info(f"Dataset {dataset_id}: {len(filtered_tags)} filtered tags: {filtered_tags}")

                dataset_tags = set(filtered_tags)
                best_domain = None
                max_overlap = 0

                for domain, domain_tags in normalized_domains.items():
                    overlap = len(dataset_tags & domain_tags)
                    logger.info(f"Dataset {dataset_id} vs Domain '{domain}': overlap = {overlap}")
                    if overlap > max_overlap:
                        max_overlap = overlap
                        best_domain = domain

                previous_domain = metadata.get('domain', 'None')
                metadata['domain'] = best_domain if best_domain and max_overlap > 0 else "Other"
                metadata['cleaned_tags'] = filtered_tags

                logger.info(f"Dataset {dataset_id}: assigned domain '{metadata['domain']}' (previous: '{previous_domain}', overlap: {max_overlap})")

                # Serialize before touching the file: if the metadata cannot be
                # encoded, the original content stays intact instead of being
                # partially overwritten.
                payload = json.dumps(metadata, indent=2, ensure_ascii=False)
                f.seek(0)
                f.write(payload)
                f.truncate()
                updated_datasets += 1

        except Exception as e:
            logger.error(f"Failed to update metadata for dataset {dataset_id}: {str(e)}")
            continue

    logger.info(f"Updated metadata for {updated_datasets} datasets")

    assign_unused_domains(final_domains)

def calculate_domain_distribution(annotated_domains: Dict[str, List[str]], input_dir: str = INPUT_DIR) -> tuple:
    """
    Count how many datasets fall into each domain and save a text report.

    The ``domain`` field of every readable ``metadata.json`` under
    ``input_dir`` is counted (``'Unknown'`` when the field is missing).
    Datasets whose metadata cannot be read or parsed are logged and skipped,
    so a single broken file does not abort the whole report.

    Args:
        annotated_domains: Mapping of domain name to its tags. Not used by the
            calculation itself; kept for interface compatibility.
        input_dir: Directory containing one sub-directory per dataset.

    Returns:
        tuple: (report text, path of the written statistics file,
        dict of domain -> dataset count)
    """
    domain_count = defaultdict(int)

    for dataset_id in tqdm(os.listdir(input_dir), desc="Calculating Distribution"):
        meta_path = os.path.join(input_dir, dataset_id, "metadata.json")
        if not os.path.exists(meta_path):
            continue

        try:
            with open(meta_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
            domain = metadata.get('domain', 'Unknown')
            domain_count[domain] += 1
        except Exception as e:
            logger.error(f"Failed to read metadata for dataset {dataset_id}: {str(e)}")
            continue

    sorted_distribution = sorted(
        domain_count.items(),
        key=lambda x: x[1],
        reverse=True
    )

    report_lines = ["\n Распределение датасетов по доменам:\n", "-" * 50 + "\n"]
    report_lines.extend(
        f"{domain}: {count} датасетов\n" for domain, count in sorted_distribution
    )
    distribution_str = "".join(report_lines)

    # The report lives in OUTPUT_DIR, which may not exist yet.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    stats_file = os.path.join(OUTPUT_DIR, "domain_distribution.txt")
    with open(stats_file, 'w', encoding='utf-8') as f:
        f.write(distribution_str)

    return distribution_str, stats_file, dict(domain_count)

def select_best_result(all_results: Dict) -> tuple:
    """Pick the embedding/clustering pair with the highest silhouette score.

    Args:
        all_results: ``{embedding method: {clustering method: metrics dict}}``.
            Entries whose metrics have no ``'silhouette'`` value are ignored.

    Returns:
        tuple: ``(embedding method, clustering method)`` of the best entry, or
        ``(None, None)`` when no entry has a silhouette score. On equal scores
        the first entry encountered wins.
    """
    # Start below the metric's lower bound (-1) so that a result scoring
    # exactly -1 can still be selected when it is the only one available.
    best_score = float('-inf')
    best_emb = None
    best_clust = None

    for emb_method, results in all_results.items():
        for clust_method, metrics in results.items():
            score = metrics.get('silhouette')
            if score is not None and score > best_score:
                best_score = score
                best_emb = emb_method
                best_clust = clust_method

    return best_emb, best_clust

def run_clustering(embedding_method=None, clustering_method=None):
    """Cluster the filtered tags and return the clusters of the chosen method pair.

    When both ``embedding_method`` and ``clustering_method`` are given, only
    that pair is run and used. Otherwise every combination from
    ``EMBEDDING_METHODS`` and ``CLUSTERING_METHODS`` is tried and the pair is
    picked by ``select_best_result``. Each successful run writes its clusters
    to ``<OUTPUT_DIR>/<embedding>_<clustering>_clusters.json``; the returned
    value is read back from the chosen file.

    Returns:
        The clusters loaded from the chosen JSON file, or ``None`` when there
        are fewer than two tags, no usable result, or the file is missing.
    """
    min_cluster_size = 3
    merge_threshold = 0.6

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    tags = load_and_filter_tags()

    if len(tags) < 2:
        logger.error("Not enough data for clustering")
        return None

    user_selected = bool(embedding_method and clustering_method)
    if user_selected:
        embeddings_to_try = [embedding_method]
        clusterers_to_try = [clustering_method]
    else:
        embeddings_to_try = EMBEDDING_METHODS.keys()
        clusterers_to_try = CLUSTERING_METHODS.keys()

    metrics_by_embedding = {}
    written_files = {}

    for emb_name in embeddings_to_try:
        logger.info(f"\n{'='*50}")
        logger.info(f"Processing embeddings: {emb_name.upper()}")

        try:
            analyzer = ClusterAnalyzer(tags, emb_name)
            analyzer.generate_embeddings()
        except Exception as exc:
            logger.error(f"Embedding failed: {exc}")
            continue

        metrics_by_clusterer = {}
        for clust_name in clusterers_to_try:
            try:
                outcome = analyzer.cluster_tags(clust_name, min_cluster_size, merge_threshold)
                metrics_by_clusterer[clust_name] = outcome["metrics"]

                target_path = f"{OUTPUT_DIR}/{emb_name}_{clust_name}_clusters.json"
                with open(target_path, 'w') as out:
                    json.dump(outcome["clusters"], out, indent=2)

                written_files[(emb_name, clust_name)] = target_path
            except Exception as exc:
                logger.error(f"Clustering failed: {exc}")
                continue

        if metrics_by_clusterer:
            metrics_by_embedding[emb_name] = metrics_by_clusterer
            logger.info(f"Results for {emb_name}: {metrics_by_clusterer}")

    if not user_selected:
        best_emb, best_clust = select_best_result(metrics_by_embedding)
        if not best_emb:
            logger.error("No valid clustering results found")
            return None
        logger.info(f"Best method: {best_emb} + {best_clust}")
    else:
        best_emb, best_clust = embedding_method, clustering_method
        logger.info(f"Using selected method: {best_emb} + {best_clust}")

    chosen_path = written_files.get((best_emb, best_clust))
    if not (chosen_path and os.path.exists(chosen_path)):
        logger.error("Cluster file not found")
        return None

    with open(chosen_path, 'r') as src:
        return json.load(src)

def save_domain_assignments(annotated_domains: Dict[str, List[str]]):
    """Write the domain annotation to DOMAIN_MAPPING_FILE, then update dataset metadata.

    Args:
        annotated_domains: Mapping of domain name to its list of tags.
    """
    mapping_path = DOMAIN_MAPPING_FILE
    with open(mapping_path, mode='w', encoding='utf-8') as mapping_file:
        json.dump(annotated_domains, mapping_file, ensure_ascii=False, indent=2)

    saved_to = os.path.abspath(mapping_path)
    logger.info(f"Domain assignments saved to {saved_to}")

    update_metadata(annotated_domains)

In [None]:
def create_clustering_interface():
    """Build and display the widget UI for clustering, annotation and enrichment.

    The UI walks through four steps, each enabled by the previous one:
    run clustering -> annotate clusters manually -> save the annotation (which
    also updates dataset metadata) -> enrich domains with more datasets.
    State shared between the button handlers is kept in the closure variables
    ``current_clusters`` and ``annotated_domains``.
    """
    embedding_selector = widgets.Dropdown(
        options=list(EMBEDDING_METHODS.keys()),
        value='sentence_transformers',
        description='Embedding:',
        style={'description_width': 'initial'}
    )

    clustering_selector = widgets.Dropdown(
        options=list(CLUSTERING_METHODS.keys()),
        value='hdbscan',
        description='Clustering:',
        style={'description_width': 'initial'}
    )

    min_datasets_input = widgets.IntText(
        value=5,
        description='Мин. датасетов:',
        style={'description_width': 'initial'},
        layout={'width': '200px'}
    )

    max_new_datasets_per_domain_input = widgets.IntText(
        value=10,
        description='Макс. новых на домен:',
        style={'description_width': 'initial'},
        layout={'width': '200px'}
    )

    run_button = widgets.Button(
        description="Запустить кластеризацию",
        button_style='success',
        tooltip='Запустить процесс кластеризации'
    )

    annotate_button = widgets.Button(
        description="Начать разметку",
        button_style='primary',
        disabled=True,
        tooltip='Начать ручную разметку доменов'
    )

    save_button = widgets.Button(
        description="Сохранить результаты",
        button_style='info',
        disabled=True,
        tooltip='Сохранить разметку и обновить метаданные'
    )

    enrich_button = widgets.Button(
        description="Пополнить домены",
        button_style='warning',
        disabled=True,
        tooltip='Добавить датасеты в маленькие домены'
    )

    domain_selector = widgets.SelectMultiple(
        options=[],
        description='Домен для пополнения:',
        style={'description_width': 'initial'},
        disabled=True,
        layout={'height': '100px'}
    )

    output_area = widgets.Output()
    results_output = widgets.Output()

    # Shared state, filled in by the handlers below.
    current_clusters = None
    annotated_domains = None

    def on_run_button_clicked(b):
        """Run clustering with the selected methods and unlock annotation on success."""
        nonlocal current_clusters
        with output_area:
            output_area.clear_output()
            print("Запуск кластеризации...")
            current_clusters = run_clustering(
                embedding_selector.value,
                clustering_selector.value
            )
            if current_clusters:
                print(f"Кластеризация завершена! Получено {len(current_clusters)} кластеров.")
                annotate_button.disabled = False
                # A new clustering invalidates the later steps until re-annotated.
                save_button.disabled = True
                enrich_button.disabled = True
                domain_selector.disabled = True
            else:
                print("Ошибка кластеризации. Проверьте логи.")

    def on_annotate_button_clicked(b):
        """Start manual annotation; later steps unlock when it completes."""
        with output_area:
            output_area.clear_output()
            if current_clusters:
                print("Начало ручной разметки...")

                def on_annotation_complete(result):
                    nonlocal annotated_domains
                    annotated_domains = result
                    print(f"\nРазметка завершена! Создано {len(annotated_domains)} доменов.")

                    with results_output:
                        results_output.clear_output()
                        print("\nСозданные домены:")
                        for domain, tags in annotated_domains.items():
                            print(f"\n{domain} ({len(tags)} тегов):")
                            print(", ".join(tags[:5]) + ("..." if len(tags) > 5 else ""))

                    save_button.disabled = False
                    enrich_button.disabled = False
                    domain_selector.options = list(annotated_domains.keys())
                    domain_selector.disabled = False

                manual_annotate_clusters(current_clusters, on_annotation_complete)
            else:
                print("Сначала выполните кластеризацию!")

    def on_save_button_clicked(b):
        """Persist the annotation, update metadata and print the distribution."""
        with output_area:
            output_area.clear_output()
            if annotated_domains:
                print("Сохранение результатов...")
                save_domain_assignments(annotated_domains)
                print("Результаты сохранены!")
                distribution, stats_file, domain_count = calculate_domain_distribution(annotated_domains)
                print(distribution)
                print(f"\nСтатистика сохранена в {stats_file}")
            else:
                print("Сначала выполните разметку!")

    def on_enrich_button_clicked(b):
        """Enrich the selected domains (all annotated domains when none is selected)."""
        with output_area:
            output_area.clear_output()
            if annotated_domains:
                selected_domains = list(domain_selector.value)
                if selected_domains:
                    # Restrict enrichment to what the user picked; previously the
                    # selection was printed but every domain was passed on.
                    # NOTE(review): assumes enrich_small_domains only acts on the
                    # domains present in the mappings it receives - confirm.
                    chosen = set(selected_domains)
                    target_domains = {
                        domain: tags
                        for domain, tags in annotated_domains.items()
                        if domain in chosen
                    }
                    restrict_counts = True
                else:
                    selected_domains = list(annotated_domains.keys())
                    target_domains = annotated_domains
                    restrict_counts = False
                print(f"Пополнение доменов: {', '.join(selected_domains)}")
                distribution, stats_file, domain_count = calculate_domain_distribution(annotated_domains)
                if restrict_counts:
                    domain_count = {
                        domain: count
                        for domain, count in domain_count.items()
                        if domain in target_domains
                    }
                enrich_small_domains(
                    domain_count,
                    target_domains,
                    min_datasets=min_datasets_input.value,
                    max_new_datasets_per_domain=max_new_datasets_per_domain_input.value,
                    output_dir=OUTPUT_DIR_DATASETS
                )
                print("Пополнение завершено!")
                # Recompute over all domains to show the state after enrichment.
                distribution, stats_file, domain_count = calculate_domain_distribution(annotated_domains)
                print(distribution)
                print(f"\nСтатистика сохранена в {stats_file}")
            else:
                print("Сначала выполните разметку!")

    run_button.on_click(on_run_button_clicked)
    annotate_button.on_click(on_annotate_button_clicked)
    save_button.on_click(on_save_button_clicked)
    enrich_button.on_click(on_enrich_button_clicked)

    display(widgets.VBox([
        embedding_selector,
        clustering_selector,
        min_datasets_input,
        max_new_datasets_per_domain_input,
        widgets.HBox([run_button, annotate_button, save_button, enrich_button]),
        domain_selector,
        output_area,
        results_output
    ]))

def run_domain_mapping():
    """Entry point: ensure OUTPUT_DIR exists, print a usage hint and show the clustering UI."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print("Используйте элементы управления ниже")
    create_clustering_interface()