In [2]:
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Protocol, Tuple

import numpy as np
import pandas as pd
import polars as pl

from catboost import CatBoostClassifier, Pool
from sklearn.metrics import average_precision_score
from sklearn.model_selection import GroupShuffleSplit, train_test_split

In [3]:
class _SingletonBase:
    """Base class that hands out one shared instance per concrete subclass.

    Instances are cached in the class-level ``_instances`` mapping, keyed by
    the class being instantiated, so every subclass gets its own singleton.

    Only object *creation* is cached here: Python still invokes ``__init__``
    on the returned object for every ``Cls(...)`` call, so a subclass with an
    ``__init__`` re-initialises the shared instance each time.
    """

    # Registry shared by all subclasses: class -> its single instance.
    _instances: Dict[type, Any] = {}

    def __new__(cls, *args, **kwargs):
        registry = cls._instances
        try:
            return registry[cls]
        except KeyError:
            # First request for this class: allocate and remember it.
            # Constructor arguments are deliberately not forwarded to
            # object.__new__, which accepts none.
            created = registry[cls] = super().__new__(cls)
            return created


class Logger(_SingletonBase):
    """Minimal stdout logger.

    Inherits from ``_SingletonBase``, so ``Logger()`` returns the same object
    on every call. Messages are printed as ``[LEVEL] message``.
    """

    def _write(self, level: str, msg: str) -> None:
        # Single formatting point shared by all levels.
        print("[{}] {}".format(level, msg))

    def info(self, msg: str) -> None:
        """Print an informational message prefixed with ``[INFO]``."""
        self._write("INFO", msg)

    def warn(self, msg: str) -> None:
        """Print a warning message prefixed with ``[WARN]``."""
        self._write("WARN", msg)


@dataclass
class Config(_SingletonBase):
    """Settings shared by the data factory, split strategies, model creator and trainer.

    NOTE: the class is both a dataclass and a ``_SingletonBase`` subclass.
    ``_SingletonBase.__new__`` returns one cached object, but the generated
    dataclass ``__init__`` still runs on every ``Config(...)`` call, so each
    call re-assigns all fields (to the defaults or to the passed values) on
    that single shared instance.
    """

    # Directory holding beauty_train.parquet / beauty_test.parquet; main() also
    # writes the submission CSV into it. Relative to the working directory.
    DATA_DIR: Path = Path("data/competition")
    # File name of the submission CSV written by main().
    submission_name: str = "submission.csv"

    # Label column; build_Xy casts it to int and removes it from the features.
    target_col: str = "target"
    # Row identifier; removed from the features and copied into the submission.
    row_id_col: str = "id"
    # Identifiers of the two sides of a pair; removed from the features and
    # used as grouping keys by the group-based split strategies.
    left_id_col: str = "id1"
    right_id_col: str = "id2"

    # Column names handed to catboost.Pool as cat_features / text_features.
    cat_features: Tuple[str, ...] = ("parentname1", "parentname2", "subjectname1", "subjectname2")
    text_features: Tuple[str, ...] = ("title1", "title2", "description1", "description2")

    # Columns dropped right after the parquet files are scanned.
    drop_cols: Tuple[str, ...] = ("characteristics1", "characteristics2")
    # Passed as test_size to train_test_split / GroupShuffleSplit.
    test_size: float = 0.2
    # Seed used for the split and for CatBoostClassifier(random_seed=...).
    random_seed: int = 42

    # Selects the splitter in make_split_strategy: random | group_id1 | group_min_id.
    split_strategy: str = "group_min_id"

    # CatBoost settings: the first three are forwarded to the CatBoostClassifier
    # constructor, early_stopping_rounds to fit() during validation.
    # NOTE(review): "GPU" presumably needs a CUDA-capable environment — confirm
    # before running elsewhere.
    task_type: str = "GPU"
    verbose: int = 100
    eval_metric: str = "PRAUC"
    early_stopping_rounds: int = 100


In [None]:
class DataFactory(Protocol):
    """Structural interface for dataset-specific loading and frame shaping."""

    def load_train(self) -> pd.DataFrame:
        """Load and return the training data as a pandas DataFrame."""

    def load_test(self) -> pd.DataFrame:
        """Load and return the test data as a pandas DataFrame."""

    def build_Xy(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        """Split ``df`` into a feature frame and a target series (or ``None``)."""

    def build_submission(self, test_df: pd.DataFrame, y_pred: np.ndarray) -> pd.DataFrame:
        """Assemble the submission frame from the test frame and predictions."""


class BeautyPairsFactory:
    """
    Data factory for the beauty_* pair dataset:
    - reads the parquet files through a lazy polars scan
    - drops the columns listed in ``cfg.drop_cols`` (characteristics* in the baseline)
    - builds X and y from a loaded frame
    """

    def __init__(self, cfg: Config, log: Logger):
        self.cfg, self.log = cfg, log

    def _read_parquet(self, path: Path) -> pl.LazyFrame:
        """Return a lazy scan of ``path``; raise FileNotFoundError if it is absent."""
        if path.exists():
            return pl.scan_parquet(path)
        raise FileNotFoundError(f"Parquet not found: {path}")

    def _load_split(self, file_name: str, label: str) -> pd.DataFrame:
        """Shared loader: log the row count of ``file_name``, drop columns, go to pandas."""
        lazy = self._read_parquet(self.cfg.DATA_DIR / file_name)
        # Row count is collected separately from the data so it can be logged first.
        row_count = lazy.select(pl.len()).collect().item()
        self.log.info(f"{label} rows: {row_count}")
        unwanted = list(self.cfg.drop_cols)
        return lazy.drop(unwanted).collect().to_pandas()

    def load_train(self) -> pd.DataFrame:
        """Load ``beauty_train.parquet`` from ``cfg.DATA_DIR`` as a pandas frame."""
        return self._load_split("beauty_train.parquet", "Train")

    def load_test(self) -> pd.DataFrame:
        """Load ``beauty_test.parquet`` from ``cfg.DATA_DIR`` as a pandas frame."""
        return self._load_split("beauty_test.parquet", "Test")

    def build_Xy(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        """Return ``(X, y)``: ``df`` without id/target columns, and the int target.

        ``y`` is ``None`` when ``cfg.target_col`` is not a column of ``df``.
        """
        cfg = self.cfg
        excluded = [cfg.row_id_col, cfg.left_id_col, cfg.right_id_col]
        if cfg.target_col not in df.columns:
            return df.drop(columns=excluded), None

        target = df[cfg.target_col].astype(int)
        excluded.append(cfg.target_col)
        return df.drop(columns=excluded), target

    def build_submission(self, test_df: pd.DataFrame, y_pred: np.ndarray) -> pd.DataFrame:
        """Pair the row ids of ``test_df`` with ``y_pred`` in a two-column frame."""
        id_name = self.cfg.row_id_col
        columns = {id_name: test_df[id_name].values, "y_pred": y_pred}
        return pd.DataFrame(columns)

In [None]:
class ModelCreator(Protocol):
    """Structural interface for objects that construct the classifier."""

    def factory_method(self) -> CatBoostClassifier:
        """Create and return a CatBoostClassifier."""


class CatBoostPRAUCCreator:
    """Creates the CatBoostClassifier used by the pipeline.

    Verbosity, evaluation metric, task type and seed come from the config;
    the objective (Logloss) and the tree hyper-parameters are fixed here.
    """

    def __init__(self, cfg: Config):
        self.cfg = cfg

    def factory_method(self) -> CatBoostClassifier:
        """Return a new, unfitted CatBoostClassifier."""
        cfg = self.cfg
        params = {
            # Taken from the config.
            "verbose": cfg.verbose,
            "eval_metric": cfg.eval_metric,
            "task_type": cfg.task_type,
            "random_seed": cfg.random_seed,
            # Fixed baseline hyper-parameters.
            "loss_function": "Logloss",
            "iterations": 5000,
            "learning_rate": 0.05,
            "depth": 8,
        }
        return CatBoostClassifier(**params)

In [None]:
class SplitStrategy(Protocol):
    """Structural interface for train/validation splitters."""

    def split(self, df: pd.DataFrame, cfg: Config) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Partition ``df`` into two frames; callers in this file use them as (train, validation)."""


class RandomStratifiedSplit:
    """Row-level random split, stratified on the integer-cast target column."""

    def split(self, df: pd.DataFrame, cfg: Config) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Return independent copies of the train and validation rows of ``df``."""
        # Split positional indices rather than the frame itself, then slice.
        positions = np.arange(len(df))
        labels = df[cfg.target_col].astype(int).values
        train_pos, valid_pos = train_test_split(
            positions,
            test_size=cfg.test_size,
            random_state=cfg.random_seed,
            stratify=labels,
        )
        train_part = df.iloc[train_pos].copy()
        valid_part = df.iloc[valid_pos].copy()
        return train_part, valid_part


class GroupSplitById1:
    """Group-aware shuffle split that uses ``cfg.left_id_col`` as the group key."""

    def split(self, df: pd.DataFrame, cfg: Config) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Return independent copies of the train and validation rows of ``df``."""
        shuffle_kwargs = {
            "n_splits": 1,
            "test_size": cfg.test_size,
            "random_state": cfg.random_seed,
        }
        shuffler = GroupShuffleSplit(**shuffle_kwargs)
        left_ids = df[cfg.left_id_col].values
        # Only one split is configured, so take the first (and only) fold.
        folds = shuffler.split(df, groups=left_ids)
        train_pos, valid_pos = next(folds)
        train_part = df.iloc[train_pos].copy()
        valid_part = df.iloc[valid_pos].copy()
        return train_part, valid_part


class GroupSplitByMinId:
    """Group-aware shuffle split keyed on the smaller of the two pair ids.

    Both id columns are cast to ``str`` before the element-wise minimum is
    taken, so "smaller" means string ordering of the stringified ids.
    """

    def split(self, df: pd.DataFrame, cfg: Config) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Return independent copies of the train and validation rows of ``df``."""
        left_ids = df[cfg.left_id_col].astype(str).values
        right_ids = df[cfg.right_id_col].astype(str).values
        pair_keys = np.minimum(left_ids, right_ids)

        shuffler = GroupShuffleSplit(
            n_splits=1,
            test_size=cfg.test_size,
            random_state=cfg.random_seed,
        )
        # Only one split is configured, so take the first (and only) fold.
        train_pos, valid_pos = next(shuffler.split(df, groups=pair_keys))
        train_part = df.iloc[train_pos].copy()
        valid_part = df.iloc[valid_pos].copy()
        return train_part, valid_part


def make_split_strategy(cfg: Config) -> SplitStrategy:
    """Instantiate the splitter named by ``cfg.split_strategy``.

    Recognised names are ``random``, ``group_id1`` and ``group_min_id``.

    Raises:
        ValueError: if ``cfg.split_strategy`` is none of the recognised names.
    """
    # Ordered (name, class) pairs; compared with == in the listed order.
    registry = (
        ("random", RandomStratifiedSplit),
        ("group_id1", GroupSplitById1),
        ("group_min_id", GroupSplitByMinId),
    )
    for name, strategy_cls in registry:
        if cfg.split_strategy == name:
            return strategy_cls()
    raise ValueError("split_strategy must be: random | group_id1 | group_min_id")


class Trainer:
    """Runs validation, full-data fitting and test prediction for one model.

    The same ``model`` object is used by every method, so each ``fit`` call
    replaces whatever the previous call trained.
    """

    def __init__(self, cfg: Config, log: Logger, data_factory: DataFactory, model: CatBoostClassifier):
        self.cfg, self.log = cfg, log
        self.data_factory = data_factory
        self.model = model

    def _make_pool(self, X: pd.DataFrame, y: Optional[pd.Series]) -> Pool:
        """Wrap features (and optional labels) in a Pool with the configured column roles."""
        column_roles = {
            "cat_features": list(self.cfg.cat_features),
            "text_features": list(self.cfg.text_features),
        }
        return Pool(X, y, **column_roles)

    def fit_validate(self, train_df: pd.DataFrame) -> float:
        """Fit on a train split with early stopping and return validation average precision.

        The split comes from ``make_split_strategy(cfg)``. The mean of the
        validation target is logged next to the score as a baseline.
        """
        cfg = self.cfg
        df_train, df_valid = make_split_strategy(cfg).split(train_df, cfg)

        # Build both (X, y) pairs first, then both pools, train before valid.
        xy_pairs = [self.data_factory.build_Xy(part) for part in (df_train, df_valid)]
        y_valid = xy_pairs[1][1]
        train_pool, valid_pool = [self._make_pool(X, y) for X, y in xy_pairs]

        self.log.info(f"Fitting: train={len(df_train)} valid={len(df_valid)} split={cfg.split_strategy}")
        self.model.fit(
            train_pool,
            eval_set=valid_pool,
            early_stopping_rounds=cfg.early_stopping_rounds,
        )

        # Column 1 of predict_proba is taken as the positive-class score.
        valid_scores = self.model.predict_proba(valid_pool)[:, 1]
        ap = float(average_precision_score(y_valid.values, valid_scores))
        baseline = float(y_valid.mean())
        self.log.info(f"Valid PR-AUC(AP)={ap:.6f} baseline≈{baseline:.6f}")
        return ap

    def fit_full(self, train_df: pd.DataFrame) -> None:
        """Fit the model on every row of ``train_df`` (no eval set, no early stopping)."""
        features, target = self.data_factory.build_Xy(train_df)
        full_pool = self._make_pool(features, target)
        self.log.info(f"Fitting on full train: n={len(train_df)}")
        self.model.fit(full_pool)

    def predict_test(self, test_df: pd.DataFrame) -> np.ndarray:
        """Return column 1 of ``predict_proba`` for each row of ``test_df``."""
        features, _unused_target = self.data_factory.build_Xy(test_df)
        test_pool = self._make_pool(features, None)
        return self.model.predict_proba(test_pool)[:, 1]


class ExperimentBuilder:
    """Fluent builder that wires a data factory and a model creator into a Trainer."""

    def __init__(self, cfg: Config, log: Logger):
        self.cfg, self.log = cfg, log
        # Both collaborators must be supplied through the with_* methods before build().
        self._data_factory: Optional[DataFactory] = None
        self._model_creator: Optional[ModelCreator] = None

    def with_data_factory(self, factory: DataFactory) -> "ExperimentBuilder":
        """Register the data factory and return this builder for chaining."""
        self._data_factory = factory
        return self

    def with_model_creator(self, creator: ModelCreator) -> "ExperimentBuilder":
        """Register the model creator and return this builder for chaining."""
        self._model_creator = creator
        return self

    def build(self) -> Trainer:
        """Create the model via the registered creator and return a Trainer.

        Raises:
            RuntimeError: if the data factory or the model creator is missing.
        """
        factory, creator = self._data_factory, self._model_creator
        fully_configured = factory is not None and creator is not None
        if not fully_configured:
            raise RuntimeError("ExperimentBuilder requires data_factory and model_creator")
        return Trainer(self.cfg, self.log, factory, creator.factory_method())



In [None]:
def main() -> None:
    """Run the whole pipeline: validate, refit on all training data, write a submission.

    The submission CSV is written to ``cfg.DATA_DIR / cfg.submission_name``.
    """
    cfg, log = Config(), Logger()

    # Wire the collaborators into a trainer.
    factory = BeautyPairsFactory(cfg, log)
    creator = CatBoostPRAUCCreator(cfg)
    builder = ExperimentBuilder(cfg, log).with_data_factory(factory).with_model_creator(creator)
    trainer = builder.build()

    # Both frames are loaded up front, before any fitting starts.
    train_df = factory.load_train()
    test_df = factory.load_test()

    # Hold-out validation first; the score is only logged, not used afterwards.
    trainer.fit_validate(train_df)

    # Refit on every training row, then score the test frame.
    trainer.fit_full(train_df)
    predictions = trainer.predict_test(test_df)

    # Persist the submission without the pandas index column.
    submission = factory.build_submission(test_df, predictions)
    destination = cfg.DATA_DIR / cfg.submission_name
    submission.to_csv(destination, index=False)
    log.info(f"Saved submission: {destination} (rows={len(submission)})")


# Entry-point guard: run the pipeline only when this code executes as the main
# module. Inside a notebook kernel __name__ is "__main__", so executing this
# cell starts the full load/validate/fit/predict run.
if __name__ == "__main__":
    main()

FileNotFoundError: Parquet not found: data\competition\beauty_train.parquet