<a href="https://colab.research.google.com/github/Mihail-Chr/projects/blob/main/ozon/opimize_pipline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U pip
#!pip install --use-pep517 suod
!pip install "polars>=1.25,<1.29"
!pip install catboost yake threadpoolctl
!pip install polars-splitters


In [None]:
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.preprocessing import LabelEncoder
import gc
import warnings
warnings.filterwarnings('ignore')

# Probe for the RAPIDS libraries; GPU_AVAILABLE is True only when both import.
# NOTE(review): this flag only reflects whether cudf/cuml are installed, and it
# is what the pipeline class later uses to pick CatBoost's task_type — confirm
# that "RAPIDS is importable" is the intended signal for "a GPU is usable".
try:
    import cudf
    import cuml
    GPU_AVAILABLE = True
except ImportError:
    GPU_AVAILABLE = False

# Single seed shared by NumPy and by the pipeline class.
RANDOM_STATE = 255
np.random.seed(RANDOM_STATE)
# Colab-only: mount Google Drive so the dataset paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

## Инициализация

In [None]:
class OptimizedTextAwarePipeline:
    """Configuration holder for a multi-stage, text-aware CatBoost pipeline.

    The class body only stores settings and discovers text columns; the
    remaining methods are attached to the class further down in the file.
    """

    def __init__(self, target='resolution'):
        """Store the target name and assemble the shared CatBoost settings.

        Args:
            target: Name of the label column.
        """
        self.target = target
        self.random_state = RANDOM_STATE
        self.col_drop = ['id', 'ItemID', 'SellerID']
        # Probability cut-offs used when deciding whether a prediction is confident.
        self.hard_threshold_low = 0.2
        self.hard_threshold_high = 0.8

        self.text_columns = []
        self.models = {}
        self.features = {}
        self.label_encoders = {}  # encoders for categorical features

        # GPU detection
        self.gpu_available = GPU_AVAILABLE
        if self.gpu_available:
            self.device_type = "GPU"
        else:
            self.device_type = "CPU"

        # Text processing setup for CatBoost
        tokenizer_cfg = {'tokenizer_id': 'Space', 'separator_type': 'ByDelimiter', 'delimiter': ' '}
        dictionary_cfg = {'dictionary_id': 'Word', 'max_dictionary_size': 50000}
        calcer_cfg = {'calcer_id': 'BoW', 'top_tokens_count': 1000}
        self.text_processing_params = {
            'tokenizers': [tokenizer_cfg],
            'dictionaries': [dictionary_cfg],
            'feature_calcers': [calcer_cfg],
        }

        # CatBoost parameters shared by every stage
        catboost_params = {
            'random_seed': self.random_state,
            'task_type': self.device_type,
            'verbose': False,
            'eval_metric': 'F1',
            'loss_function': 'Logloss',
            'use_best_model': True,
            'early_stopping_rounds': 30,
            'thread_count': -1,
            'allow_writing_files': False,
            'text_processing': self.text_processing_params,
        }
        if self.gpu_available:
            # Extra memory limits are only added on the GPU path.
            catboost_params['gpu_ram_part'] = 0.8
            catboost_params['used_ram_limit'] = '8gb'
        self.base_catboost_params = catboost_params

    def identify_text_columns(self, df: pl.DataFrame):
        """Record and return the string-typed columns that are usable as features.

        Columns listed in ``col_drop`` and the target column are excluded.
        """
        excluded = self.col_drop + [self.target]
        found = []
        for name, dtype in df.schema.items():
            if dtype == pl.String and name not in excluded:
                found.append(name)
        self.text_columns = found
        print(f"–ù–∞–π–¥–µ–Ω–æ —Ç–µ–∫—Å—Ç–æ–≤—ã—Ö –∫–æ–ª–æ–Ω–æ–∫: {len(self.text_columns)}")
        return self.text_columns


## Генерация continuous/categorical признаков

In [None]:
def advanced_feature_engineering(self, df: pl.DataFrame, stage: str) -> pl.DataFrame:
    """Append derived numeric features to ``df`` and return the result.

    Signed-integer and float columns (excluding dropped, target and text
    columns) with more than 50 distinct values are treated as continuous.
    For those the function adds:

    * log / sqrt / square / inverse transforms (first 15 columns),
    * pairwise ratio / product / difference (first 6 columns),
    * above-Q3, below-Q1 and above-median flags (first 10 columns).

    NOTE(review): the continuous/categorical split and the quantile cut-offs
    are recomputed from whatever frame is passed in, so the generated columns
    and thresholds can differ between calls on different data — confirm this
    is acceptable when the same stage is applied to training and test frames.

    Args:
        df: Input frame.
        stage: Stage label; used only in log messages.

    Returns:
        ``df`` with every successfully generated feature column appended.
    """
    from itertools import combinations

    print(f"Feature engineering –¥–ª—è —Å—Ç–∞–¥–∏–∏ {stage}...")

    numeric_dtypes = (pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.Float32, pl.Float64)
    excluded = set(self.col_drop) | {self.target} | set(self.text_columns)

    # Candidate numeric columns, in schema order.
    numeric_cols = [
        col for col, dtype in df.schema.items()
        if col not in excluded and dtype in numeric_dtypes
    ]

    # Split by cardinality: up to 50 distinct values counts as categorical.
    continuous_cols = []
    categorical_cols = []
    for col in numeric_cols:
        if df[col].n_unique() <= 50:
            categorical_cols.append(col)
        else:
            continuous_cols.append(col)

    print(f"Continuous: {len(continuous_cols)}, Categorical: {len(categorical_cols)}")

    expressions = []

    # 1. Element-wise transforms, continuous columns only. Each transform
    #    falls back to 0 where it is undefined (log of <= 0, sqrt of < 0, 1/0).
    for col in continuous_cols[:15]:
        expressions.extend([
            pl.when(pl.col(col) > 0).then(pl.col(col).log()).otherwise(0).alias(f"{col}_log"),
            pl.when(pl.col(col) >= 0).then(pl.col(col).sqrt()).otherwise(0).alias(f"{col}_sqrt"),
            (pl.col(col) ** 2).alias(f"{col}_sq"),
            pl.when(pl.col(col) != 0).then(1.0 / pl.col(col)).otherwise(0).alias(f"{col}_inv")
        ])

    # 2. Pairwise interactions; limited to the first 6 columns for speed.
    for col1, col2 in combinations(continuous_cols[:6], 2):
        expressions.extend([
            pl.when(pl.col(col2) != 0).then(pl.col(col1) / pl.col(col2)).otherwise(0).alias(f"{col1}_{col2}_ratio"),
            (pl.col(col1) * pl.col(col2)).alias(f"{col1}_{col2}_prod"),
            (pl.col(col1) - pl.col(col2)).alias(f"{col1}_{col2}_diff")
        ])

    # 3. Percentile-based indicator flags.
    for col in continuous_cols[:10]:
        q75 = df[col].quantile(0.75)
        q25 = df[col].quantile(0.25)
        median = df[col].median()

        expressions.extend([
            (pl.col(col) > q75).cast(pl.Int8).alias(f"{col}_q75_flag"),
            (pl.col(col) < q25).cast(pl.Int8).alias(f"{col}_q25_flag"),
            (pl.col(col) > median).cast(pl.Int8).alias(f"{col}_median_flag")
        ])

    # Apply in batches. A failing batch is skipped (best effort), so count
    # only what was actually applied instead of everything that was planned.
    applied = 0
    batch_size = 50
    for i in range(0, len(expressions), batch_size):
        batch = expressions[i:i+batch_size]
        try:
            df = df.with_columns(batch)
        except Exception as e:
            print(f"–û—à–∏–±–∫–∞ –≤ batch {i}: {e}")
            continue
        applied += len(batch)

    print(f"–°–≥–µ–Ω–µ—Ä–∏—Ä–æ–≤–∞–Ω–æ {applied} –Ω–æ–≤—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –¥–ª—è {stage}")
    return df

OptimizedTextAwarePipeline.advanced_feature_engineering = advanced_feature_engineering

## Улучшенный отбор признаков

In [None]:
def improved_feature_selection(self, df_pandas: pd.DataFrame, stage: str, max_features: int = 30) -> list:
    """Choose up to ``max_features`` feature columns for one stage.

    Non-text columns are ranked by mutual information with the target
    (falling back to absolute correlation if that fails); five slots are
    reserved for text columns, of which at most the first five are appended.

    Side effects: non-text columns that are not numeric are label-encoded
    **in place** in ``df_pandas``; the fitted encoders are kept in
    ``self.label_encoders`` and reused on later calls.

    Args:
        df_pandas: Frame holding the features and the target column.
        stage: Stage label; used only in log messages.
        max_features: Upper bound on the number of returned columns.

    Returns:
        List of selected column names (numeric first, then text).
    """
    print(f"Feature selection –¥–ª—è {stage} (–º–∞–∫—Å. {max_features} –ø—Ä–∏–∑–Ω–∞–∫–æ–≤)...")

    # Every column except the service ones and the target.
    excluded = set(self.col_drop) | {self.target}
    feature_cols = [col for col in df_pandas.columns if col not in excluded]

    if len(feature_cols) < 2:
        return feature_cols

    text_cols = set(self.text_columns)
    numeric_features = []
    text_features = []

    for col in feature_cols:
        if col in text_cols:
            text_features.append(col)
            continue
        try:
            # Probe only: the converted result is discarded.
            pd.to_numeric(df_pandas[col], errors='raise')
        except (ValueError, TypeError):
            values = df_pandas[col].fillna('missing')
            encoder = self.label_encoders.get(col)
            if encoder is None:
                encoder = LabelEncoder()
                self.label_encoders[col] = encoder
                df_pandas[col] = encoder.fit_transform(values)
            else:
                # An encoder from an earlier call exists: the column must still
                # be encoded, otherwise it reaches the scorer as raw objects.
                df_pandas[col] = encoder.transform(values)
        numeric_features.append(col)

    # Five slots stay reserved for text features; never let the budget go
    # negative (a negative k / head() would select the wrong columns).
    numeric_budget = max(max_features - 5, 0)
    selected_features = []

    if numeric_features and numeric_budget > 0:
        # Infinite values make the scorer reject the whole matrix.
        X_numeric = df_pandas[numeric_features].replace([np.inf, -np.inf], np.nan).fillna(0)
        y = df_pandas[self.target]

        try:
            from functools import partial

            # Seed the estimator so repeated runs select the same columns.
            score_func = partial(mutual_info_classif, random_state=self.random_state)
            mi_selector = SelectKBest(score_func, k=min(numeric_budget, len(numeric_features)))
            mi_selector.fit(X_numeric, y)
            selected_features.extend(
                numeric_features[i] for i in mi_selector.get_support(indices=True)
            )
        except Exception as e:
            print(f"MI selection failed: {e}, using correlation")
            # Fallback: rank by absolute correlation with the target.
            corr_with_target = X_numeric.corrwith(y).abs().sort_values(ascending=False)
            selected_features.extend(corr_with_target.head(numeric_budget).index.tolist())

    # Append the text features, then enforce the overall cap.
    selected_features.extend(text_features[:5])
    selected_features = selected_features[:max(max_features, 0)]

    print(f"–í—ã–±—Ä–∞–Ω–æ {len(selected_features)} –ø—Ä–∏–∑–Ω–∞–∫–æ–≤")
    return selected_features

OptimizedTextAwarePipeline.improved_feature_selection = improved_feature_selection

## Визуализация результатов стадии

In [None]:
def visualize_stage_results(self, model, X, y, features, stage_name):
    """Draw a 2x2 diagnostic figure for one trained stage.

    Panels: confusion matrix, top-10 feature importances (when the model
    exposes ``get_feature_importance``), per-class histogram of predicted
    probabilities with both confidence thresholds, and a text summary of the
    classification report.

    Args:
        model: Fitted classifier with ``predict`` and ``predict_proba``.
        X: Feature matrix the model is evaluated on.
        y: True labels aligned with ``X``.
        features: Feature names, in the order the importances are reported.
        stage_name: Stage label shown in the figure title.
    """
    # Predictions
    preds = model.predict(X)
    probas = model.predict_proba(X)[:, 1]

    # Figure with four named panels
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle(f'–†–µ–∑—É–ª—å—Ç–∞—Ç—ã –º–æ–¥–µ–ª–∏ {stage_name}', fontsize=16, fontweight='bold')
    ax_cm = axes[0, 0]
    ax_importance = axes[0, 1]
    ax_hist = axes[1, 0]
    ax_report = axes[1, 1]

    # 1. Confusion matrix
    cm = confusion_matrix(y, preds)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
    ax_cm.set_title('Confusion Matrix')
    ax_cm.set_xlabel('Predicted')
    ax_cm.set_ylabel('Actual')

    # 2. Feature importance (ten largest, drawn smallest-to-largest)
    if hasattr(model, 'get_feature_importance'):
        importance_table = pd.DataFrame({
            'feature': features,
            'importance': model.get_feature_importance()
        })
        top_ten = importance_table.sort_values('importance', ascending=True).tail(10)

        ax_importance.barh(top_ten['feature'], top_ten['importance'])
        ax_importance.set_title('Top 10 Feature Importance')
        ax_importance.set_xlabel('Importance')

    # 3. Probability distribution per true class, with both thresholds
    for class_value, class_label, class_color in ((0, 'Class 0', 'red'), (1, 'Class 1', 'green')):
        ax_hist.hist(probas[y == class_value], bins=50, alpha=0.7, label=class_label, color=class_color)
    ax_hist.axvline(self.hard_threshold_low, color='blue', linestyle='--', label=f'Low threshold ({self.hard_threshold_low})')
    ax_hist.axvline(self.hard_threshold_high, color='blue', linestyle='--', label=f'High threshold ({self.hard_threshold_high})')
    ax_hist.set_xlabel('Predicted Probability')
    ax_hist.set_ylabel('Count')
    ax_hist.set_title('Probability Distribution')
    ax_hist.legend()

    # 4. Classification report rendered as text
    cr = classification_report(y, preds, output_dict=True)
    neg = cr['0']
    pos = cr['1']
    report_text = f"""Classification Report:

            Precision (Class 0): {neg['precision']:.3f}
            Recall (Class 0): {neg['recall']:.3f}
            F1-Score (Class 0): {neg['f1-score']:.3f}

            Precision (Class 1): {pos['precision']:.3f}
            Recall (Class 1): {pos['recall']:.3f}
            F1-Score (Class 1): {pos['f1-score']:.3f}

            Overall F1-Score: {cr['macro avg']['f1-score']:.3f}
            Accuracy: {cr['accuracy']:.3f}"""

    ax_report.text(0.1, 0.5, report_text, fontsize=10, verticalalignment='center',
                   transform=ax_report.transAxes, family='monospace')
    ax_report.set_title('Performance Metrics')
    ax_report.axis('off')

    plt.tight_layout()
    plt.show()

OptimizedTextAwarePipeline.visualize_stage_results = visualize_stage_results

## Запись уверенных предсказаний с корректной индексацией

In [None]:
def filter_confident_predictions(self, model, X, y, df_original, stage_name):
    """Split ``df_original`` into confidently-correct rows and the rest.

    A row is "confident" when the model is both sure and right: probability
    below ``hard_threshold_low`` with predicted and true class 0, or above
    ``hard_threshold_high`` with predicted and true class 1.

    Rows are matched by position, so ``X``, ``y`` and ``df_original`` must
    describe the same rows in the same order. Positional matching avoids the
    id round-trip, which mis-assigns rows when ids repeat, and label-based
    Series indexing, which breaks on a non-default index.

    Args:
        model: Fitted classifier with ``predict`` and ``predict_proba``.
        X: Feature matrix for the rows of ``df_original``.
        y: True labels for the rows of ``df_original``.
        df_original: Polars frame to split.
        stage_name: Stage label; used only in log messages.

    Returns:
        ``(df_confident, df_remaining)``.

    Raises:
        ValueError: If the predictions, labels and frame differ in length.
    """
    print(f"–§–∏–ª—å—Ç—Ä–∞—Ü–∏—è confident predictions –¥–ª—è {stage_name}...")

    # Work on plain arrays so the masks are positional, not index-aligned.
    probas = np.asarray(model.predict_proba(X))[:, 1]
    preds = np.asarray(model.predict(X)).ravel()
    labels = np.asarray(y).ravel()

    if not (len(probas) == len(labels) == len(df_original)):
        raise ValueError(
            f"Row count mismatch: {len(probas)} predictions, "
            f"{len(labels)} labels, {len(df_original)} rows"
        )

    # Confident = beyond the threshold AND correctly classified.
    low_confident = (probas < self.hard_threshold_low) & (preds == 0) & (labels == 0)
    high_confident = (probas > self.hard_threshold_high) & (preds == 1) & (labels == 1)
    confident_mask = low_confident | high_confident

    df_confident = df_original.filter(pl.Series("confident", confident_mask))
    df_remaining = df_original.filter(pl.Series("remaining", ~confident_mask))

    print(f"Confident: {df_confident.height}, Remaining: {df_remaining.height}")

    return df_confident, df_remaining

OptimizedTextAwarePipeline.filter_confident_predictions = filter_confident_predictions

## Обучение стадий с визуализацией

In [None]:
def train_stage(self, df: pl.DataFrame, stage_name: str, max_features: int):
    """Train, cross-validate, store and visualise the model for one stage.

    The base CatBoost parameters enable ``use_best_model`` and early stopping,
    both of which need a validation set. Each CV fold is therefore fitted
    with its held-out part as ``eval_set``; the final model is then refitted
    on all rows with those two options switched off and the tree count set
    to the median best iteration found during CV.

    Args:
        df: Training rows for this stage.
        stage_name: 'hard', 'soft', or any other name (treated as the
            'error' stage).
        max_features: Upper bound passed to feature selection.

    Returns:
        ``(model, df_pandas, features)``; ``model`` is ``None`` when feature
        selection returned nothing.
    """
    print(f"\n{'='*60}")
    print(f"–û–ë–£–ß–ï–ù–ò–ï –°–¢–ê–î–ò–ò {stage_name.upper()}")
    print(f"{'='*60}")
    print(f"–†–∞–∑–º–µ—Ä –≤—ã–±–æ—Ä–∫–∏: {len(df)}")

    # Feature engineering
    df_processed = self.advanced_feature_engineering(df, stage_name)
    df_pandas = df_processed.to_pandas()

    # Feature selection
    features = self.improved_feature_selection(df_pandas, stage_name, max_features)

    if len(features) == 0:
        print(f"–ù–µ—Ç –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –¥–ª—è {stage_name}!")
        return None, df_pandas, features

    X = df_pandas[features].copy()
    y = df_pandas[self.target].copy()

    # Text features must be plain strings without missing values.
    for col in features:
        if col in self.text_columns:
            X[col] = X[col].fillna('missing').astype(str)

    # Column positions of categorical and text features.
    cat_features_idx = []
    text_features_idx = []

    for i, col in enumerate(features):
        if col in self.text_columns:
            text_features_idx.append(i)
        elif X[col].dtype == 'object' or (X[col].nunique() <= 50 and X[col].dtype in ['int64', 'int32']):
            cat_features_idx.append(i)

    print(f"–í—Å–µ–≥–æ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤: {len(features)}")
    print(f"–ö–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã—Ö: {len(cat_features_idx)}")
    print(f"–¢–µ–∫—Å—Ç–æ–≤—ã—Ö: {len(text_features_idx)}")

    def make_pool(rows_X, rows_y):
        # One place to build pools so every pool gets the same column roles.
        return Pool(rows_X, rows_y, cat_features=cat_features_idx, text_features=text_features_idx)

    # Per-stage model settings; any unknown stage name uses the 'error' settings.
    stage_overrides = {
        'hard': {'iterations': 400, 'depth': 6, 'learning_rate': 0.1},
        'soft': {'iterations': 600, 'depth': 8, 'learning_rate': 0.08},
    }
    error_overrides = {'iterations': 800, 'depth': 10, 'learning_rate': 0.05}
    stage_params = self.base_catboost_params.copy()
    stage_params.update(stage_overrides.get(stage_name, error_overrides))

    # Cross-validation: each fold validates on its held-out part, which is
    # what use_best_model / early stopping need in order to work.
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=self.random_state)
    cv_scores = []
    best_tree_counts = []

    for train_idx, val_idx in skf.split(X, y):
        y_val = y.iloc[val_idx]
        fold_train = make_pool(X.iloc[train_idx], y.iloc[train_idx])
        fold_val = make_pool(X.iloc[val_idx], y_val)

        fold_model = CatBoostClassifier(**stage_params)
        fold_model.fit(fold_train, eval_set=fold_val, verbose=False)

        cv_scores.append(f1_score(y_val, fold_model.predict(fold_val)))

        best_iteration = fold_model.get_best_iteration()
        if best_iteration is not None:
            # Iterations are 0-based, so the tree count is one more.
            best_tree_counts.append(best_iteration + 1)

    cv_f1 = np.mean(cv_scores)
    print(f"CV F1-score: {cv_f1:.4f} ¬± {np.std(cv_scores):.4f}")

    # Final model: all rows, no validation set, tree count taken from CV.
    final_params = dict(stage_params)
    final_params['use_best_model'] = False
    final_params.pop('early_stopping_rounds', None)
    if best_tree_counts:
        final_params['iterations'] = max(int(np.median(best_tree_counts)), 1)

    model = CatBoostClassifier(**final_params)
    model.fit(make_pool(X, y), verbose=False)

    # Keep the model and its feature list for prediction time.
    self.models[stage_name] = model
    self.features[stage_name] = features

    # Diagnostic plots
    self.visualize_stage_results(model, X, y, features, stage_name)

    return model, df_pandas, features

OptimizedTextAwarePipeline.train_stage = train_stage

In [None]:
## Общая визуализация пайплайна

In [None]:
def visualize_pipeline_overview(self, stage_stats):
    """Plot a 2x2 summary of a whole pipeline run.

    Panels: sample size per stage, F1 per stage, feature count per stage and
    a text box with overall statistics.

    Args:
        stage_stats: Mapping of stage name to a dict holding 'sample_size'
            and, optionally, 'f1_score' and 'feature_count'.
    """
    bar_colors = ['#ff7f0e', '#2ca02c', '#d62728']

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('–û–±–∑–æ—Ä –ø–∞–π–ø–ª–∞–π–Ω–∞ Text-Aware Classification', fontsize=16, fontweight='bold')
    ax_sizes = axes[0, 0]
    ax_f1 = axes[0, 1]
    ax_features = axes[1, 0]
    ax_stats = axes[1, 1]

    stages = list(stage_stats.keys())
    per_stage = [stage_stats[name] for name in stages]

    # 1. Sample size per stage
    sample_sizes = [stats['sample_size'] for stats in per_stage]

    ax_sizes.bar(stages, sample_sizes, color=bar_colors)
    ax_sizes.set_title('–†–∞–∑–º–µ—Ä –≤—ã–±–æ—Ä–∫–∏ –ø–æ —Å—Ç–∞–¥–∏—è–º')
    ax_sizes.set_ylabel('–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –æ–±—Ä–∞–∑—Ü–æ–≤')
    for position, size in enumerate(sample_sizes):
        ax_sizes.text(position, size + max(sample_sizes)*0.01, str(size), ha='center', va='bottom')

    # 2. F1-score per stage (only stages that recorded one)
    f1_scores = [stats['f1_score'] for stats in per_stage if 'f1_score' in stats]
    if f1_scores:
        ax_f1.bar(stages[:len(f1_scores)], f1_scores, color=bar_colors)
        ax_f1.set_title('F1-Score –ø–æ —Å—Ç–∞–¥–∏—è–º')
        ax_f1.set_ylabel('F1-Score')
        ax_f1.set_ylim(0, 1)
        for position, score in enumerate(f1_scores):
            ax_f1.text(position, score + 0.01, f'{score:.3f}', ha='center', va='bottom')

    # 3. Feature count per stage (only stages that recorded one)
    feature_counts = [stats['feature_count'] for stats in per_stage if 'feature_count' in stats]
    if feature_counts:
        ax_features.bar(stages[:len(feature_counts)], feature_counts, color=bar_colors)
        ax_features.set_title('–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –ø–æ —Å—Ç–∞–¥–∏—è–º')
        ax_features.set_ylabel('–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤')
        for position, count in enumerate(feature_counts):
            ax_features.text(position, count + max(feature_counts)*0.01, str(count), ha='center', va='bottom')

    # 4. Overall statistics: relative shrink from the first to the last stage
    if len(sample_sizes) > 1:
        processing_reduction = (sample_sizes[0] - sample_sizes[-1]) / sample_sizes[0] * 100
    else:
        processing_reduction = 0

    stats_text = f"""–û–±—â–∞—è —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –ø–∞–π–ø–ª–∞–π–Ω–∞:

        –í—Å–µ–≥–æ –æ–±—Ä–∞–∑—Ü–æ–≤ –≤ –Ω–∞—á–∞–ª–µ: {sample_sizes[0]:,}
        –û–±—Ä–∞–∑—Ü–æ–≤ –Ω–∞ –ø–æ—Å–ª–µ–¥–Ω–µ–π —Å—Ç–∞–¥–∏–∏: {sample_sizes[-1]:,}
        –°–æ–∫—Ä–∞—â–µ–Ω–∏–µ –æ–±—ä–µ–º–∞ –æ–±—Ä–∞–±–æ—Ç–∫–∏: {processing_reduction:.1f}%

        –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å—Ç–∞–¥–∏–π: {len(stages)}
        –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ GPU: {'–î–∞' if self.gpu_available else '–ù–µ—Ç'}
        –¢–µ–∫—Å—Ç–æ–≤—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤: {len(self.text_columns)}

        –°—Ç—Ä–∞—Ç–µ–≥–∏—è: –ü–æ—ç—Ç–∞–ø–Ω–∞—è —Ñ–∏–ª—å—Ç—Ä–∞—Ü–∏—è
        confident predictions —Å —É–º–µ–Ω—å—à–µ–Ω–∏–µ–º
        —Å–ª–æ–∂–Ω–æ—Å—Ç–∏ –∑–∞–¥–∞—á–∏ –Ω–∞ –∫–∞–∂–¥–æ–π —Å—Ç–∞–¥–∏–∏"""

    ax_stats.text(0.05, 0.5, stats_text, fontsize=10, verticalalignment='center',
                  transform=ax_stats.transAxes, family='monospace')
    ax_stats.set_title('–°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –ø–∞–π–ø–ª–∞–π–Ω–∞')
    ax_stats.axis('off')

    plt.tight_layout()
    plt.show()

OptimizedTextAwarePipeline.visualize_pipeline_overview = visualize_pipeline_overview

## Полное обучение

In [None]:
def fit(self, train_df: pl.DataFrame):
    """Train the staged pipeline on ``train_df``.

    Stages run in the order hard -> soft -> error. After each stage the rows
    the stage model classified confidently and correctly are removed, and the
    next stage is trained on what remains. Training stops early when fewer
    than 50 rows are left, when a stage yields no model, or when no rows
    remain.

    Any models, feature lists and label encoders from a previous ``fit`` are
    discarded first, so a refit cannot leave stale stages behind.

    Args:
        train_df: Training frame containing the target column and an 'id'
            column.

    Returns:
        ``self``.
    """
    print("üöÄ –ó–ê–ü–£–°–ö –û–ü–¢–ò–ú–ò–ó–ò–†–û–í–ê–ù–ù–û–ì–û TEXT-AWARE PIPELINE")
    print("="*80)

    # Start from a clean slate: a refit must not reuse earlier stages.
    self.models = {}
    self.features = {}
    self.label_encoders = {}

    # Detect the text columns
    self.identify_text_columns(train_df)

    # Fill missing values: 0 for numeric columns, "missing" for strings.
    numeric_dtypes = (pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.Float32, pl.Float64)
    excluded = self.col_drop + [self.target]
    expressions = []
    for col, dtype in train_df.schema.items():
        if col in excluded:
            continue
        if dtype in numeric_dtypes:
            expressions.append(pl.col(col).fill_null(0).alias(col))
        elif dtype == pl.String:
            expressions.append(pl.col(col).fill_null("missing").alias(col))

    if expressions:
        train_df = train_df.with_columns(expressions)

    current_df = train_df
    stages = [('hard', 25), ('soft', 35), ('error', 50)]
    stage_stats = {}

    for stage_name, max_features in stages:
        stage_stats[stage_name] = {'sample_size': len(current_df)}

        if len(current_df) < 50:  # minimum size worth training on
            print(f"–ù–µ–¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –¥–∞–Ω–Ω—ã—Ö –¥–ª—è —Å—Ç–∞–¥–∏–∏ {stage_name}: {len(current_df)}")
            break

        # Train the stage
        model, df_pandas, features = self.train_stage(current_df, stage_name, max_features)

        if model is None:
            break

        X = df_pandas[features].copy()
        y = df_pandas[self.target].copy()

        # Text features must be plain strings without missing values.
        for col in features:
            if col in self.text_columns:
                X[col] = X[col].fillna('missing').astype(str)

        # In-sample F1 of the stage model on the rows it was trained on.
        stage_stats[stage_name].update({
            'feature_count': len(features),
            'f1_score': f1_score(y, model.predict(X))
        })

        # Drop the rows this stage already handles confidently.
        confident, remaining = self.filter_confident_predictions(model, X, y, current_df, stage_name)

        current_df = remaining
        gc.collect()

        if len(current_df) == 0:
            print("–í—Å–µ –æ–±—Ä–∞–∑—Ü—ã –æ–±—Ä–∞–±–æ—Ç–∞–Ω—ã —É–≤–µ—Ä–µ–Ω–Ω—ã–º–∏ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è–º–∏!")
            break

    # Summary plots
    self.visualize_pipeline_overview(stage_stats)

    print("\n" + "="*80)
    print("‚úÖ –û–ë–£–ß–ï–ù–ò–ï –ü–ê–ô–ü–õ–ê–ô–ù–ê –ó–ê–í–ï–†–®–ï–ù–û!")
    print("="*80)

    return self

OptimizedTextAwarePipeline.fit = fit

## Предсказание с поэтапной обработкой

In [None]:
def predict(self, test_df: pl.DataFrame):
    """Predict a class for every row of ``test_df`` using the trained stages.

    Stages run in the order hard -> soft -> error. A row leaves the cascade
    as soon as a stage predicts it confidently (probability below
    ``hard_threshold_low`` for class 0 or above ``hard_threshold_high`` for
    class 1). Rows that no stage is confident about receive the prediction
    of the last stage that scored them, rather than a hard-coded class 0;
    class 0 remains the default only for rows no stage scored at all.

    Args:
        test_df: Frame with an 'id' column and the feature columns.

    Returns:
        NumPy array of predictions, one per row, in ``test_df`` order.
    """
    print("üîÆ –ù–ê–ß–ê–õ–û –ü–†–ï–î–°–ö–ê–ó–ê–ù–ò–Ø")
    print("="*50)

    # Detect the text columns
    self.identify_text_columns(test_df)

    # Fill missing values: 0 for numeric columns, "missing" for strings.
    numeric_dtypes = (pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.Float32, pl.Float64)
    expressions = []
    for col, dtype in test_df.schema.items():
        if col in self.col_drop:
            continue
        if dtype in numeric_dtypes:
            expressions.append(pl.col(col).fill_null(0).alias(col))
        elif dtype == pl.String:
            expressions.append(pl.col(col).fill_null("missing").alias(col))

    if expressions:
        test_df = test_df.with_columns(expressions)

    current_df = test_df
    all_predictions = {}       # id -> confident prediction
    fallback_predictions = {}  # id -> prediction of the latest stage that scored it

    for stage_name in ['hard', 'soft', 'error']:
        if stage_name not in self.models or len(current_df) == 0:
            continue

        print(f"–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –¥–ª—è —Å—Ç–∞–¥–∏–∏ {stage_name}: {len(current_df)} –æ–±—Ä–∞–∑—Ü–æ–≤")

        model = self.models[stage_name]
        features = self.features[stage_name]

        # Feature engineering
        df_processed = self.advanced_feature_engineering(current_df, stage_name)
        df_pandas = df_processed.to_pandas()

        # Use only the trained features that exist in this frame.
        available_features = [f for f in features if f in df_pandas.columns]
        if len(available_features) == 0:
            print(f"–ù–µ—Ç –¥–æ—Å—Ç—É–ø–Ω—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –¥–ª—è {stage_name}")
            continue

        X_test = df_pandas[available_features].copy()

        # Text columns become plain strings; encoded columns reuse the
        # encoder fitted during training.
        for col in available_features:
            if col in self.text_columns:
                X_test[col] = X_test[col].fillna('missing').astype(str)
            elif col in self.label_encoders:
                X_test[col] = self.label_encoders[col].transform(X_test[col].fillna('missing'))

        # Predictions as plain arrays so the masks below are positional.
        probas = np.asarray(model.predict_proba(X_test))[:, 1]
        preds = np.asarray(model.predict(X_test)).ravel()

        ids = current_df['id'].to_list()

        # Confident = beyond a threshold on the side the model predicted.
        low_confident = (probas < self.hard_threshold_low) & (preds == 0)
        high_confident = (probas > self.hard_threshold_high) & (preds == 1)
        confident_mask = low_confident | high_confident

        for test_id, pred, is_confident in zip(ids, preds, confident_mask):
            # Remember every prediction: it is the answer for this row if no
            # later stage becomes confident about it.
            fallback_predictions[test_id] = pred
            if is_confident:
                all_predictions[test_id] = pred

        # Only the unsure rows move on to the next stage.
        current_df = current_df.filter(pl.Series("remaining", ~confident_mask))

        confident_count = np.sum(confident_mask)
        print(f"Confident predictions: {confident_count}, Remaining: {len(current_df)}")

    # Final predictions: confident answer first, then the latest unsure one.
    test_ids = test_df['id'].to_list()
    final_predictions = [
        all_predictions.get(test_id, fallback_predictions.get(test_id, 0))
        for test_id in test_ids
    ]

    # Statistics; coverage counts confident predictions only.
    coverage = len(all_predictions) / len(test_ids) * 100 if test_ids else 0.0
    unique_vals, counts = np.unique(final_predictions, return_counts=True)

    print(f"\n–°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π:")
    print(f"Coverage: {coverage:.1f}%")
    for val, count in zip(unique_vals, counts):
        percentage = count / len(final_predictions) * 100
        print(f"Class {val}: {count} ({percentage:.1f}%)")

    print("‚úÖ –ü–†–ï–î–°–ö–ê–ó–ê–ù–ò–ï –ó–ê–í–ï–†–®–ï–ù–û!")
    return np.array(final_predictions)

OptimizedTextAwarePipeline.predict = predict

In [None]:
## Запуск

In [None]:
# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –∑–∞–ø—É—Å–∫–∞ –ø–æ–ª–Ω–æ–≥–æ –ø—Ä–æ—Ü–µ—Å—Å–∞
def run_optimized_pipeline(train_path: str, test_path: str,
                           extra_test_path: str = 'ml_ozon_—Åounterfeit_new_test.csv'):
    """Train the pipeline, predict on the test data and write a submission.

    Args:
        train_path: CSV file with the training data.
        test_path: CSV file with the test data.
        extra_test_path: Optional second test CSV stacked below ``test_path``.
            Defaults to the file name that used to be hard-coded here; pass
            an empty string or ``None`` to use ``test_path`` alone.

    Returns:
        ``(pipeline, predictions)`` on success, ``(None, None)`` if any step
        raised (the error and its traceback are printed).
    """
    try:
        print("üî• –ó–ê–ì–†–£–ó–ö–ê –û–ü–¢–ò–ú–ò–ó–ò–†–û–í–ê–ù–ù–û–ì–û PIPELINE")

        # Create the pipeline
        pipeline = OptimizedTextAwarePipeline()

        # Load the training data
        print("üìä –ó–∞–≥—Ä—É–∑–∫–∞ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö...")
        train_df = pl.read_csv(train_path)
        print(f"–†–∞–∑–º–µ—Ä —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–π –≤—ã–±–æ—Ä–∫–∏: {len(train_df)}")

        # Train
        pipeline.fit(train_df)

        # Load the test data, optionally stacking the extra file below it
        print("üß™ –ó–∞–≥—Ä—É–∑–∫–∞ —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö...")
        test_df = pl.read_csv(test_path)
        if extra_test_path:
            test_df = test_df.vstack(pl.read_csv(extra_test_path))

        print(f"–†–∞–∑–º–µ—Ä —Ç–µ—Å—Ç–æ–≤–æ–π –≤—ã–±–æ—Ä–∫–∏: {len(test_df)}")

        # Predict
        predictions = pipeline.predict(test_df)

        # Build and save the submission file
        submission = pd.DataFrame({
            'id': test_df['id'].to_list(),
            'prediction': predictions
        })

        submission.to_csv('optimized_submission.csv', index=False)
        print("\n‚úÖ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ optimized_submission.csv")
        print("üéâ PIPELINE –í–´–ü–û–õ–ù–ï–ù –£–°–ü–ï–®–ù–û!")

        return pipeline, predictions

    except Exception as e:
        # Top-level boundary: report the failure instead of crashing the cell.
        print(f"‚ùå –û—à–∏–±–∫–∞ –≤—ã–ø–æ–ª–Ω–µ–Ω–∏—è: {e}")
        import traceback
        traceback.print_exc()
        return None, None
# The function takes no ``self``; attach it as a static method so calling it
# through an instance does not bind the instance to ``train_path``.
OptimizedTextAwarePipeline.run_optimized_pipeline = staticmethod(run_optimized_pipeline)




# Script entry point: set the dataset locations and run the whole pipeline.
if __name__ == "__main__":

    train_path = '/content/drive/MyDrive/data/ml_ozon_—Åounterfeit_train.csv' # Set the path to the training data
    test_path = '/content/drive/MyDrive/data/ml_ozon_—Åounterfeit_test.csv'    # Set the path to the test data

    pipeline, predictions = run_optimized_pipeline(train_path, test_path)

üî• –ó–ê–ì–†–£–ó–ö–ê –û–ü–¢–ò–ú–ò–ó–ò–†–û–í–ê–ù–ù–û–ì–û PIPELINE
üìä –ó–∞–≥—Ä—É–∑–∫–∞ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö...
–†–∞–∑–º–µ—Ä —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω–æ–π –≤—ã–±–æ—Ä–∫–∏: 197198
üöÄ –ó–ê–ü–£–°–ö –û–ü–¢–ò–ú–ò–ó–ò–†–û–í–ê–ù–ù–û–ì–û TEXT-AWARE PIPELINE
–ù–∞–π–¥–µ–Ω–æ —Ç–µ–∫—Å—Ç–æ–≤—ã—Ö –∫–æ–ª–æ–Ω–æ–∫: 4

–û–ë–£–ß–ï–ù–ò–ï –°–¢–ê–î–ò–ò HARD
–†–∞–∑–º–µ—Ä –≤—ã–±–æ—Ä–∫–∏: 197198
Feature engineering –¥–ª—è —Å—Ç–∞–¥–∏–∏ hard...
Continuous: 34, Categorical: 3
–°–≥–µ–Ω–µ—Ä–∏—Ä–æ–≤–∞–Ω–æ 135 –Ω–æ–≤—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –¥–ª—è hard
Feature selection –¥–ª—è hard (–º–∞–∫—Å. 25 –ø—Ä–∏–∑–Ω–∞–∫–æ–≤)...
