In [53]:
import optuna
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
import sklearn
from math import sqrt
from optuna.integration import CatBoostPruningCallback
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC

In [32]:
train = pd.read_csv('https://www.dropbox.com/scl/fi/9hb4r3uce0mqz8fkpja17/text_classification_train.csv?rlkey=w42y98wa401gelzou08pp582k&dl=1')

In [33]:
train.to_csv('../data/text_classification.csv', index=False)

In [54]:
train = pd.read_csv('../data/text_classification.csv')

In [55]:
train['text_len'] = train['text'].apply(lambda x: len(x))

In [56]:
class_names = np.unique(train['category'])
train['category'] = train['category'].replace(class_names, np.arange(train['category'].nunique()))

In [57]:
cols2drop = ['category', 'text']
numerical_features = [c for c in train.columns if c not in cols2drop]

In [58]:
X_train, X_val, y_train, y_val = train_test_split(train.drop(cols2drop, axis=1), 
                                                    train['category'],
                                                    test_size=.25,
                                                    stratify=train['category'],
                                                    random_state=42)

In [34]:
def fit_catboost(trial, cat_train, cat_val):
    X = train.drop(cols2drop, axis=1, errors="ignore")
    y = train['category']
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    param = {
        'iterations' : 400, # Можно не перебирать, есть Early-Stopping
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.01),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 50),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.8),
        
        "auto_class_weights": trial.suggest_categorical("auto_class_weights", ["SqrtBalanced", "Balanced", "None"]),
        "depth": trial.suggest_int("depth", 3, 9),
        
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "used_ram_limit": "14gb",
        "eval_metric": "Accuracy", # Тоже стоит заранее определиться
    }

    
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 20)
        
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
        
    
    clf = CatBoostClassifier(
        **param, thread_count=-1, random_seed=42
    )
    
    pruning_callback = CatBoostPruningCallback(trial, "Accuracy")

    clf.fit(
        X_train,
        y_train,
        eval_set=(X_test, y_test),
        verbose=0,
        plot=False,
        early_stopping_rounds=5,
        callbacks=[pruning_callback],
    )  # Добавляем callback в fit


    pruning_callback.check_pruned()

    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    return accuracy

In [35]:
def objective(trial, return_models=False):
    n_splits = 3
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    X_cat_train = train.drop(cols2drop, axis=1, errors="ignore")
    y_cat_train = train['category']

    scores, models = [], []
    
    for train_idx, valid_idx in kf.split(X_cat_train):
        train_data = X_cat_train.iloc[train_idx, :], y_cat_train.iloc[train_idx]
        valid_data = X_cat_train.iloc[valid_idx, :], y_cat_train.iloc[valid_idx]

        # Подаем trials для перебора
        model, y_pred = fit_catboost(trial, train_data, valid_data) # Определили выше
        scores.append(accuracy_score(y_pred, valid_data[1]))
        models.append(model)
        break
         

    result = np.mean(scores)
    
    if return_models:
        return result, models
    else:
        return result

In [36]:
study = optuna.create_study(pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
                            direction="maximize",)
study.optimize(objective,
               n_trials=50,
               show_progress_bar=True,)

[I 2023-12-01 22:29:31,014] A new study created in memory with name: no-name-6069b0b3-d9eb-42cf-b681-51503e8cfa4e


  0%|          | 0/50 [00:00<?, ?it/s]

  pruning_callback = CatBoostPruningCallback(trial, "Accuracy")


[W 2023-12-01 22:30:01,593] Trial 0 failed with parameters: {'learning_rate': 0.005775915991358214, 'l2_leaf_reg': 15, 'colsample_bylevel': 0.5419874372868106, 'auto_class_weights': 'None', 'depth': 4, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'} because of the following error: NameError("name 'accuracy_score' is not defined").
Traceback (most recent call last):
  File "/home/hallteon/.local/lib/python3.8/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_59823/3754814663.py", line 14, in objective
    model, y_pred = fit_catboost(trial, train_data, valid_data) # Определили выше
  File "/tmp/ipykernel_59823/3931906895.py", line 52, in fit_catboost
    accuracy = accuracy_score(y_pred, y_test)
NameError: name 'accuracy_score' is not defined
[W 2023-12-01 22:30:01,594] Trial 0 failed with value None.


NameError: name 'accuracy_score' is not defined

In [59]:
params_cat = {
             'n_estimators' : 700,
              # 'learning_rate': .03,
              'depth' : 3,
              'verbose': False,
              'use_best_model': True,
              'text_features': [],
              # 'train_dir' : '/home/jovyan/work/catboost',
              'border_count' : 64,
              'l2_leaf_reg' : 1,
              'bagging_temperature' : 2,
              'rsm' : 0.51,
              'loss_function': 'MultiClass',
              'auto_class_weights' : 'Balanced', #try not balanced
              'random_state': 42,
              'use_best_model': False,
              # 'custom_metric' : ['AUC', 'MAP'] # Не работает внутри sklearn.Pipelines
         }



cat_model = CatBoostClassifier(**params_cat)

In [60]:
params_lgbm = {
    "num_leaves": 200,
    "n_estimators": 1500,
    # "max_depth": 7,
    "min_child_samples": None,
    "learning_rate": 0.001,
    "min_data_in_leaf": 5,
    "feature_fraction": 0.98,
    # "categorical_feature": cat_cols,
    'reg_alpha' : 3.0,
    'reg_lambda' : 5.0,
}

lgbm_model = LGBMClassifier(**params_lgbm)

In [61]:
params_xgb = {
    "eta": 0.05,
    'n_estimators' : 1500,
    "max_depth": 6,
    "subsample": 0.7,
    # "colsample_bytree": 0.95,
    'min_child_weight' : 0.1,
    'gamma': .01,
    'reg_lambda' : 0.1,
    'reg_alpha' : 0.5,
    "objective": "reg:linear",
    "eval_metric": "mae",
    'tree_method' : 'hist', # Supported tree methods for cat fs are `gpu_hist`, `approx`, and `hist`.
    'enable_categorical' : True
    
}

xgb_model = XGBClassifier(**params_xgb)

In [62]:
numerical_features = [c for c in train.columns if c not in cols2drop]

In [63]:
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ("numerical", numerical_transformer, numerical_features),
    ("categorical", categorical_transformer, [])])

In [64]:
estimators = [
    
    
    ("ExtraTrees",  make_pipeline(preprocessor, ExtraTreesClassifier(n_estimators = 10_000, max_depth = 6, min_samples_leaf = 2, 
                                                              bootstrap = True, class_weight = 'balanced', # ccp_alpha = 0.001, 
                                                              random_state = 75, verbose=False, n_jobs=-1,))),
    

    ("XGBoost", xgb_model),
    ("LightGBM", lgbm_model),
    ("CatBoost", cat_model),
    
    ("Random_forest",  make_pipeline(preprocessor, RandomForestClassifier(n_estimators = 15_000, max_depth = 7, 
                                                              min_samples_leaf = 2,
                                                              warm_start = True, n_jobs=-1,
                                                              random_state = 75, verbose=False))),
]

# в качестве мета-модели будем использовать LogisticRegression
meta_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(verbose=False),
    # final_estimator=RandomForestClassifier(n_estimators = 10_000, 
                                           # max_depth = 5,
                                           # verbose=False),
    n_jobs=-1,
    verbose=False,
)

stacking_classifier = meta_model

In [1]:
stacking_classifier.fit(X_train, y_train)

NameError: name 'stacking_classifier' is not defined

In [18]:
train

Unnamed: 0,category,text,ruBert-base_text_feature_0,ruBert-base_text_feature_1,ruBert-base_text_feature_2,ruBert-base_text_feature_3,ruBert-base_text_feature_4,ruBert-base_text_feature_5,ruBert-base_text_feature_6,ruBert-base_text_feature_7,...,labse_text_feature_759,labse_text_feature_760,labse_text_feature_761,labse_text_feature_762,labse_text_feature_763,labse_text_feature_764,labse_text_feature_765,labse_text_feature_766,labse_text_feature_767,text_len
0,5,Ледник Пасторури это цирковой ледник расположе...,0.272156,0.155383,0.060285,0.363159,-0.140391,0.507753,-0.226326,0.431878,...,-0.027475,0.030528,-0.052218,0.042459,-0.012714,-0.055370,-0.012433,-0.016283,-0.006994,139
1,8,Главные участники предстоящего Betokenoid 274 ...,0.439223,0.343683,0.093642,0.245294,0.089770,0.424717,-0.071487,0.185970,...,0.042258,-0.027394,-0.033566,0.016021,-0.022054,-0.040366,0.007392,-0.029070,-0.011284,202
2,5,Ttokenoid Btokenoid – карта с которой можно не...,-0.040338,0.058095,-0.091063,0.296028,-0.137103,0.931456,-0.169060,0.131503,...,0.011548,-0.046034,0.024588,-0.013670,-0.047028,0.009395,-0.000488,-0.060260,0.006563,139
3,1,В Сильверстоуне произошли крупные обновления а...,0.444181,0.218742,0.247859,0.234885,0.006668,0.407703,-0.115768,0.433781,...,-0.043454,-0.000961,-0.012203,-0.047922,-0.054657,-0.053768,0.018481,-0.039148,-0.038874,216
4,5,На протяжении более чем 30 лет Вестсайд являет...,-0.126253,-0.115856,0.131131,0.052595,0.060591,0.420976,0.090776,0.246287,...,-0.008138,-0.013958,-0.038482,-0.002943,-0.035970,-0.010830,-0.005132,-0.047990,-0.005869,112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,9,Яркий открытый шлем. ⠀ Внешняя оболочка из выс...,-0.052873,0.115095,0.067304,0.238675,0.042321,0.694844,0.009075,0.298883,...,-0.003430,-0.066267,-0.013982,0.003619,-0.044228,-0.037047,0.005730,-0.076227,-0.048963,818
7496,8,Никто не рождается с идеальным телом. Но если ...,-0.356527,0.173754,0.007272,0.293671,-0.252398,0.243566,0.163168,0.252569,...,-0.004166,-0.044707,-0.062995,-0.009477,0.011715,-0.002244,-0.034123,-0.042895,-0.047827,91
7497,9,Друзья всем привет 33 В феврале продал свою Ho...,0.177330,0.278536,0.013133,0.141096,-0.121019,0.547267,0.000883,0.397169,...,0.037294,-0.042980,-0.037077,-0.034269,-0.031412,-0.006170,-0.011738,-0.051788,-0.046078,430
7498,10,Даниил Медведев во время четвертого сета Как ж...,-0.013558,0.128117,0.361276,0.347715,-0.195470,0.513713,-0.136279,0.456701,...,0.004140,0.003558,-0.070606,-0.057361,0.023872,0.032804,-0.034576,-0.001304,-0.016874,100


In [45]:
!pip install optuna -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
