In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
import warnings
import math

df_train = pd.read_csv('../data/train.csv', index_col=0)
df_val = pd.read_csv('../data/val.csv', index_col=0)
df_test = pd.read_csv('../data/test.csv', index_col=0)

In [2]:
import hydra
from omegaconf import OmegaConf
from hydra import compose, initialize

with initialize(config_path="../src/config"):
    cfg = compose(config_name="config")
print(OmegaConf.to_yaml(cfg))

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  with initialize(config_path="../src/config"):


preprocessing:
  text:
    apply_cleaning: true
    apply_lemmatization: true
    add_fraud_indicators: true
    max_length: 1024
    nltk_data_dir: ${oc.env:NLTK_DATA,${hydra:runtime.cwd}/nltk_data}
  image:
    size:
    - 224
    - 224
    augmentations:
    - transform:
        _target_: torchvision.transforms.RandomResizedCrop
        size: 224
        scale:
        - 0.8
        - 1.0
      probability: 0.5
    - transform:
        _target_: torchvision.transforms.RandomHorizontalFlip
        p: 0.5
      probability: 0.1
    - transform:
        _target_: torchvision.transforms.RandomRotation
        degrees: 10
      probability: 0.3
    - transform:
        _target_: torchvision.transforms.ColorJitter
        brightness: 0.1
        contrast: 0.1
        saturation: 0.1
        hue: 0.05
      probability: 0.3
    compute_clip_similarity: false
    clip_model: openai/clip-vit-base-patch32
  tabular:
    categorical_cols:
    - brand_name
    - CommercialTypeName4
    numerica

In [3]:
from typing import List, Dict, Any

class TabularProcessor:
    def __init__(self,
                 categorical_cols: List[str] = None,
                 numerical_cols: List[str] = None,
                 scaling: str = "standard"):
        self.categorical_cols = categorical_cols or []
        self.numerical_cols = numerical_cols or []
        self.scaling = scaling

        # Learned state
        self.category_value_to_index: Dict[str, Dict[Any, int]] = {}
        self.category_cardinalities: List[int] = []
        self.num_stats: Dict[str, Dict[str, float]] = {}
        self.num_categorical_features: int = 0
        self.num_continuous_features: int = 0

    def fit(self, df: pd.DataFrame) -> None:
        """
        Fit encoders and scalers on training dataframe.

        - Categorical columns: build value->index mapping with 0 reserved for unknown/missing
        - Numerical columns: compute mean/std (standard) or min/max (minmax)
        """
        # Fit categoricals
        self.category_value_to_index = {}
        self.category_cardinalities = []
        for col in self.categorical_cols:
            # Get unique values excluding NaNs
            if col not in df.columns:
                # still register unknown-only category
                self.category_value_to_index[col] = {}
                self.category_cardinalities.append(1)
                continue
            values = pd.Series(df[col]).astype(str)
            # Include only non-null values
            unique_values = pd.Index(values[values != 'nan'].unique())
            # Reserve 0 for unknown
            mapping = {val: i + 1 for i, val in enumerate(unique_values)}
            self.category_value_to_index[col] = mapping
            # cardinality includes index 0 for unknown
            self.category_cardinalities.append(len(mapping) + 1)

        # Fit numerical scaler
        self.num_stats = {}
        for col in self.numerical_cols:
            if col not in df.columns:
                self.num_stats[col] = {"mean": 0.0, "std": 1.0, "min": 0.0, "max": 1.0}
                continue
            series = pd.to_numeric(df[col], errors='coerce')
            if self.scaling == "standard":
                mean = float(series.mean()) if not math.isnan(series.mean()) else 0.0
                std = float(series.std(ddof=0)) if not math.isnan(series.std(ddof=0)) else 1.0
                if std == 0.0:
                    std = 1.0
                self.num_stats[col] = {"mean": mean, "std": std}
            elif self.scaling == "minmax":
                min_v = float(series.min()) if not math.isnan(series.min()) else 0.0
                max_v = float(series.max()) if not math.isnan(series.max()) else 1.0
                if max_v == min_v:
                    max_v = min_v + 1.0
                self.num_stats[col] = {"min": min_v, "max": max_v}
            else:
                # No scaling
                self.num_stats[col] = {}

        # Store feature counts
        self.num_categorical_features = len(self.categorical_cols)
        self.num_continuous_features = len(self.numerical_cols)

    @property
    def categories_cardinalities(self) -> List[int]:
        return list(self.category_cardinalities)

    @property
    def num_continuous(self) -> int:
        return len(self.numerical_cols)

    def _encode_category_value(self, col: str, value: Any) -> int:
        mapping = self.category_value_to_index.get(col, {})
        if pd.isna(value):
            return 0
        key = str(value)
        return mapping.get(key, 0)

    def _scale_numeric_value(self, col: str, value: Any) -> float:
        if value is None or (isinstance(value, float) and math.isnan(value)):
            # impute with mean (standard) or min (minmax) or 0
            if self.scaling == "standard":
                return float(self.num_stats.get(col, {}).get("mean", 0.0))
            elif self.scaling == "minmax":
                return float(self.num_stats.get(col, {}).get("min", 0.0))
            else:
                return 0.0
        try:
            v = float(value)
        except Exception:
            v = 0.0
        if self.scaling == "standard":
            mean = self.num_stats.get(col, {}).get("mean", 0.0)
            std = self.num_stats.get(col, {}).get("std", 1.0)
            return (v - mean) / std
        if self.scaling == "minmax":
            min_v = self.num_stats.get(col, {}).get("min", 0.0)
            max_v = self.num_stats.get(col, {}).get("max", 1.0)
            return (v - min_v) / (max_v - min_v)
        return v

    def __call__(self, row: Dict[str, Any]) -> np.array:
        # Categorical indices
        cat_indices: List[int] = []
        for col in self.categorical_cols:
            val = row.get(col, None)
            idx = self._encode_category_value(col, val)
            cat_indices.append(idx)

        # Continuous values
        cont_values: List[float] = []
        for col in self.numerical_cols:
            val = row.get(col, None)
            scaled = self._scale_numeric_value(col, val)
            cont_values.append(float(scaled))

        categorical = np.array(cat_indices)
        continuous = np.array(cont_values) if cont_values else np.zeros(0)

        return np.concat([categorical, continuous])

In [4]:
processor = TabularProcessor(categorical_cols=list(cfg.preprocessing.tabular.categorical_cols),
            numerical_cols=list(cfg.preprocessing.tabular.numerical_cols),
            scaling=cfg.preprocessing.tabular.get('scaling', 'standard'))

processor.fit(df_train)

In [5]:
X_train, y_train = df_train.drop(columns='resolution'), df_train['resolution']
X_val, y_val = df_val.drop(columns='resolution'), df_val['resolution']
X_test = df_test.copy()
X_train = np.stack(X_train.apply(processor, axis=1))
X_val = np.stack(X_val.apply(processor, axis=1))
X_test = np.stack(X_test.apply(processor, axis=1))

In [6]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_pred)

print(f"Validation accuracy: {val_accuracy:.4f}")
print("Classification report:")
print(classification_report(y_val, val_pred))
print()


Validation accuracy: 0.9712
Classification report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     36830
           1       0.90      0.64      0.75      2610

    accuracy                           0.97     39440
   macro avg       0.94      0.82      0.87     39440
weighted avg       0.97      0.97      0.97     39440




In [None]:
import shap

# 2. Compute SHAP values using TreeExplainer
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)  # returns per-sample contribution per feature :contentReference[oaicite:0]{index=0}

# 3. Calculate global importance: mean absolute SHAP value per feature
feature_names = X_train.columns
# If shap_values is a 2D array (num_samples × num_features)
importance = np.abs(shap_values).mean(axis=0)
feature_importance = pd.Series(importance, index=feature_names).sort_values(ascending=False)


feature_importance

In [None]:

# 4. Visualize top features
shap.summary_plot(shap_values, X_train)      # Beeswarm plot :contentReference[oaicite:1]{index=1}
shap.plots.bar(shap_values)

In [49]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    random_seed=42,
    verbose=100
)
model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_pred)

print(f"Validation accuracy: {val_accuracy:.4f}")
print("Classification report:")
print(classification_report(y_val, val_pred))
print()

0:	learn: 0.5501018	test: 0.5507720	best: 0.5507720 (0)	total: 36.2ms	remaining: 18s
100:	learn: 0.1187064	test: 0.1232029	best: 0.1232029 (100)	total: 3.59s	remaining: 14.2s
200:	learn: 0.1012097	test: 0.1071754	best: 0.1071754 (200)	total: 8.03s	remaining: 11.9s
300:	learn: 0.0908517	test: 0.0985068	best: 0.0985068 (300)	total: 13.6s	remaining: 8.96s
400:	learn: 0.0839749	test: 0.0938315	best: 0.0938275 (399)	total: 18s	remaining: 4.46s
499:	learn: 0.0784132	test: 0.0901725	best: 0.0901725 (499)	total: 22.3s	remaining: 0us

bestTest = 0.09017252521
bestIteration = 499

Validation accuracy: 0.9682
Classification report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     36830
           1       0.83      0.65      0.73      2610

    accuracy                           0.97     39440
   macro avg       0.90      0.82      0.86     39440
weighted avg       0.97      0.97      0.97     39440




In [1]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'iterations': [300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5, 7],
    'border_count': [32, 64, 128]
}

catboost_model = CatBoostClassifier(
    random_seed=42,
    verbose=100
)

grid_search = GridSearchCV(
    estimator=catboost_model,
    param_grid=param_grid,
    cv=3,
    scoring='f1_macro',  # или 'accuracy', 'roc_auc' — по задаче
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)

# Используем лучшие параметры для финального обучения
best_model = grid_search.best_estimator_
val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_pred)

print(f"Validation accuracy: {val_accuracy:.4f}")
print("Classification report:")
print(classification_report(y_val, val_pred))

NameError: name 'X_train' is not defined

In [None]:
model = best_model

In [51]:
test_predictions = model.predict(X_test)

submission = pd.DataFrame({
    'id': df_test.index,
    'prediction': test_predictions
})

submission.to_csv('submission.csv', index=False)


print(f"Создан файл submission.csv с {len(submission)} предсказаниями")
print(f"Распределение предсказаний:")
print(submission['prediction'].value_counts())
print()

Создан файл submission.csv с 22760 предсказаниями
Распределение предсказаний:
prediction
0    21882
1      878
Name: count, dtype: int64

