In [1]:
import numpy as np
import pandas as pd
import random

import torch
import optuna
import nltk
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    f1_score,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
)
from optuna.integration import CatBoostPruningCallback
from nltk.corpus import stopwords
from pandarallel import pandarallel
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from transformers import Trainer, TrainingArguments



RANDOM_SEED = 121

def seed_all(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.deterministic = False
seed_all(RANDOM_SEED)

nltk.download("punkt")
stop_words = stopwords.words("english")

pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


[nltk_data] Downloading package punkt to /home/gleb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Catboost + text_features

In [2]:
catboost_text_result = []

df = pd.read_csv("../data/interim/df_preprocessed_text.csv", low_memory=False)
df.sample(1)

Unnamed: 0,name,file_name,num_file,level,text,stemmer,lemm,no_stop,stop_stremm,stop_lemm
2942,Apocalypse Now,Apocalypse Now Redux (1979) UHD 4K BRD 2160p F...,False,hard,This is the end Beautiful friend This is the e...,thi end beauti friend thi end my friend the en...,This end Beautiful friend This end My friend T...,This end Beautiful friend This end My friend T...,thi is the end beauti friend thi is the end my...,This is the end Beautiful friend This is the e...


In [3]:
train_df, test_df = train_test_split(df, train_size=0.8, random_state=0)
y_train, X_train = train_df["level"], train_df.drop(["level"], axis=1)
y_test, X_test = test_df["level"], test_df.drop(["level"], axis=1)

In [4]:
N_SPLITS = 5


def train_cv(cols: list) -> pd.DataFrame:
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
    i = 0
    mean_score = {}
    for train_fold, val_fold in skf.split(X_train, y_train):
        cat = CatBoostClassifier(
            task_type="GPU",
            devices="0",
            iterations=10000,
            early_stopping_rounds=300,
            loss_function="MultiClass",
            l2_leaf_reg=11,
            auto_class_weights="Balanced",
            eval_metric="AUC",
            random_seed=RANDOM_SEED,
            verbose=0,
        )

        train_dataset = Pool(
            data=X_train.iloc[train_fold][cols],
            label=y_train.iloc[train_fold],
            text_features=cols,
        )

        eval_dataset = Pool(
            data=X_train.iloc[val_fold][cols],
            label=y_train.iloc[val_fold],
            text_features=cols,
        )

        fit_model = cat.fit(
            train_dataset, eval_set=eval_dataset, use_best_model=True, plot=False
        )

        roc_auc = roc_auc_score(
            y_train.iloc[val_fold],
            fit_model.predict_proba(X_train.iloc[val_fold][cols]),
            average="macro",
            multi_class="ovo",
        )

        f1 = f1_score(
            y_train.iloc[val_fold],
            fit_model.predict(X_train.iloc[val_fold][cols]),
            average="macro",
        )

        precision = precision_score(
            y_train.iloc[val_fold],
            fit_model.predict(X_train.iloc[val_fold][cols]),
            average="macro",
        )

        recall = recall_score(
            y_train.iloc[val_fold],
            fit_model.predict(X_train.iloc[val_fold][cols]),
            average="macro",
        )

        accuracy = accuracy_score(
            y_train.iloc[val_fold], fit_model.predict(X_train.iloc[val_fold][cols])
        )

        for score, value_score in zip(
            ["roc_auc", "f1", "precision", "recall", "accuracy"],
            [roc_auc, f1, precision, recall, accuracy],
        ):
            mean_score[score] = mean_score.get(score, 0) + value_score / N_SPLITS
            print(f"fold: {i + 1} {score}: {value_score}")

        print()
        i += 1
    return pd.DataFrame(mean_score, index=[cols])

In [5]:
catboost_text_result.append(train_cv(["text"]))

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 1 roc_auc: 0.733781703807803
fold: 1 f1: 0.47111216728859323
fold: 1 precision: 0.477055612650411
fold: 1 recall: 0.5328925572606616
fold: 1 accuracy: 0.47644927536231885



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 2 roc_auc: 0.7201593771688221
fold: 2 f1: 0.44787633741280614
fold: 2 precision: 0.47015606899744683
fold: 2 recall: 0.5326113388178448
fold: 2 accuracy: 0.45108695652173914



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 3 roc_auc: 0.7536957561651968
fold: 3 f1: 0.5113250405828479
fold: 3 precision: 0.5176069010180672
fold: 3 recall: 0.573656570754978
fold: 3 accuracy: 0.5181159420289855



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 4 roc_auc: 0.715671460381358
fold: 4 f1: 0.4947251655197991
fold: 4 precision: 0.4884680134680135
fold: 4 recall: 0.5253107986108774
fold: 4 accuracy: 0.5072463768115942



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 5 roc_auc: 0.738712857590088
fold: 5 f1: 0.43182780267062726
fold: 5 precision: 0.44877041242647103
fold: 5 recall: 0.5390538519703331
fold: 5 accuracy: 0.4355716878402904



In [6]:
catboost_text_result.append(train_cv(["no_stop"]))

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 1 roc_auc: 0.7438861305921302
fold: 1 f1: 0.5173683461890705
fold: 1 precision: 0.5129795243342734
fold: 1 recall: 0.5603547412752731
fold: 1 accuracy: 0.5271739130434783



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 2 roc_auc: 0.7395914587793165
fold: 2 f1: 0.5486402502941625
fold: 2 precision: 0.5732510288065843
fold: 2 recall: 0.536404253767312
fold: 2 accuracy: 0.605072463768116



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 3 roc_auc: 0.764015696476494
fold: 3 f1: 0.5408254891833301
fold: 3 precision: 0.5359631273893205
fold: 3 recall: 0.5769647619258683
fold: 3 accuracy: 0.5579710144927537



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 4 roc_auc: 0.7206580755281201
fold: 4 f1: 0.49239398926744166
fold: 4 precision: 0.4868827160493827
fold: 4 recall: 0.5232765552271984
fold: 4 accuracy: 0.5054347826086957



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 5 roc_auc: 0.7610104243698245
fold: 5 f1: 0.444981146829045
fold: 5 precision: 0.4664818572936115
fold: 5 recall: 0.5574480946618492
fold: 5 accuracy: 0.4482758620689655



In [7]:
catboost_text_result.append(train_cv(["stemmer"]))

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 1 roc_auc: 0.7480846693089168
fold: 1 f1: 0.5051721595803295
fold: 1 precision: 0.5113247863247863
fold: 1 recall: 0.5672697824767825
fold: 1 accuracy: 0.5108695652173914



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 2 roc_auc: 0.7140033249945296
fold: 2 f1: 0.4866226509981541
fold: 2 precision: 0.4923074551680411
fold: 2 recall: 0.533896597332034
fold: 2 accuracy: 0.5036231884057971



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 3 roc_auc: 0.7590961500394764
fold: 3 f1: 0.5456729438500552
fold: 3 precision: 0.5387253208007925
fold: 3 recall: 0.5794317904454958
fold: 3 accuracy: 0.5634057971014492



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 4 roc_auc: 0.7215830103989725
fold: 4 f1: 0.5007199176194765
fold: 4 precision: 0.49709168087524863
fold: 4 recall: 0.5362363393342653
fold: 4 accuracy: 0.5126811594202898



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 5 roc_auc: 0.7498718897962559
fold: 5 f1: 0.5104308802549903
fold: 5 precision: 0.5046520742009464
fold: 5 recall: 0.5489027753416355
fold: 5 accuracy: 0.5245009074410163



In [8]:
catboost_text_result.append(train_cv(["stop_stremm"]))

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 1 roc_auc: 0.7575611668911151
fold: 1 f1: 0.5384483281420257
fold: 1 precision: 0.5412947959381548
fold: 1 recall: 0.5918454170792833
fold: 1 accuracy: 0.5489130434782609



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 2 roc_auc: 0.720750800607853
fold: 2 f1: 0.5121532483961326
fold: 2 precision: 0.5074456127762971
fold: 2 recall: 0.5189757031776933
fold: 2 accuracy: 0.5489130434782609



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 3 roc_auc: 0.7545240288788876
fold: 3 f1: 0.5356011998604376
fold: 3 precision: 0.5273769885312616
fold: 3 recall: 0.5712878374318059
fold: 3 accuracy: 0.5471014492753623



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 4 roc_auc: 0.718101190836817
fold: 4 f1: 0.503789918001278
fold: 4 precision: 0.5006784260515604
fold: 4 recall: 0.5432789636780185
fold: 4 accuracy: 0.5144927536231884



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 5 roc_auc: 0.7521999972921947
fold: 5 f1: 0.522401698345897
fold: 5 precision: 0.5139112100977634
fold: 5 recall: 0.55436217985591
fold: 5 accuracy: 0.5372050816696915



In [9]:
catboost_text_result.append(train_cv(["lemm"]))

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 1 roc_auc: 0.7438861305921302
fold: 1 f1: 0.5173683461890705
fold: 1 precision: 0.5129795243342734
fold: 1 recall: 0.5603547412752731
fold: 1 accuracy: 0.5271739130434783



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 2 roc_auc: 0.7395914587793165
fold: 2 f1: 0.5486402502941625
fold: 2 precision: 0.5732510288065843
fold: 2 recall: 0.536404253767312
fold: 2 accuracy: 0.605072463768116



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 3 roc_auc: 0.764015696476494
fold: 3 f1: 0.5408254891833301
fold: 3 precision: 0.5359631273893205
fold: 3 recall: 0.5769647619258683
fold: 3 accuracy: 0.5579710144927537



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 4 roc_auc: 0.7206580755281201
fold: 4 f1: 0.49239398926744166
fold: 4 precision: 0.4868827160493827
fold: 4 recall: 0.5232765552271984
fold: 4 accuracy: 0.5054347826086957



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 5 roc_auc: 0.7610104243698245
fold: 5 f1: 0.444981146829045
fold: 5 precision: 0.4664818572936115
fold: 5 recall: 0.5574480946618492
fold: 5 accuracy: 0.4482758620689655



In [10]:
catboost_text_result.append(train_cv(["stop_lemm"]))

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 1 roc_auc: 0.733781703807803
fold: 1 f1: 0.47111216728859323
fold: 1 precision: 0.477055612650411
fold: 1 recall: 0.5328925572606616
fold: 1 accuracy: 0.47644927536231885



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 2 roc_auc: 0.7201593771688221
fold: 2 f1: 0.44787633741280614
fold: 2 precision: 0.47015606899744683
fold: 2 recall: 0.5326113388178448
fold: 2 accuracy: 0.45108695652173914



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 3 roc_auc: 0.7536957561651968
fold: 3 f1: 0.5113250405828479
fold: 3 precision: 0.5176069010180672
fold: 3 recall: 0.573656570754978
fold: 3 accuracy: 0.5181159420289855



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 4 roc_auc: 0.715671460381358
fold: 4 f1: 0.4947251655197991
fold: 4 precision: 0.4884680134680135
fold: 4 recall: 0.5253107986108774
fold: 4 accuracy: 0.5072463768115942



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 5 roc_auc: 0.738712857590088
fold: 5 f1: 0.43182780267062726
fold: 5 precision: 0.44877041242647103
fold: 5 recall: 0.5390538519703331
fold: 5 accuracy: 0.4355716878402904



In [11]:
catboost_text_result = pd.concat(catboost_text_result)
catboost_text_result

Unnamed: 0,roc_auc,f1,precision,recall,accuracy
text,0.732404,0.471373,0.480411,0.540705,0.477694
no_stop,0.745832,0.508842,0.515112,0.55089,0.528786
stemmer,0.738528,0.509724,0.50882,0.553147,0.523016
stop_stremm,0.740627,0.522479,0.518141,0.55595,0.539325
lemm,0.745832,0.508842,0.515112,0.55089,0.528786
stop_lemm,0.732404,0.471373,0.480411,0.540705,0.477694


Как видно из таблицы лучший результат показывает текст со словами привиденными к его основе и без стоп-слов (stop_stemm)

## Catboost + embeddings (like features)

In [12]:
catboost_embeddings_like_features = []
train_index = y_train.index
test_index = y_test.index

In [13]:
N_SPLITS = 5


def train_emb_like_feature(emb_name: str) -> pd.DataFrame:
    df = pd.read_csv(f"../data/interim/{emb_name}", low_memory=False)
    X_train = df.iloc[train_index].drop("target", axis=1)
    y_train = df.loc[train_index, "target"]

    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
    i = 0
    mean_score = {}
    for train_fold, val_fold in skf.split(X_train, y_train):
        cat = CatBoostClassifier(
            task_type="GPU",
            devices="0",
            iterations=10000,
            early_stopping_rounds=300,
            loss_function="MultiClass",
            l2_leaf_reg=11,
            auto_class_weights="Balanced",
            eval_metric="AUC",
            random_seed=RANDOM_SEED,
            verbose=0,
        )

        train_dataset = Pool(
            data=X_train.iloc[train_fold], label=y_train.iloc[train_fold]
        )

        eval_dataset = Pool(data=X_train.iloc[val_fold], label=y_train.iloc[val_fold])

        fit_model = cat.fit(
            train_dataset, eval_set=eval_dataset, use_best_model=True, plot=False
        )

        roc_auc = roc_auc_score(
            y_train.iloc[val_fold],
            fit_model.predict_proba(X_train.iloc[val_fold]),
            average="macro",
            multi_class="ovo",
        )

        f1 = f1_score(
            y_train.iloc[val_fold],
            fit_model.predict(X_train.iloc[val_fold]),
            average="macro",
        )

        precision = precision_score(
            y_train.iloc[val_fold],
            fit_model.predict(X_train.iloc[val_fold]),
            average="macro",
        )

        recall = recall_score(
            y_train.iloc[val_fold],
            fit_model.predict(X_train.iloc[val_fold]),
            average="macro",
        )

        accuracy = accuracy_score(
            y_train.iloc[val_fold], fit_model.predict(X_train.iloc[val_fold])
        )

        for score, value_score in zip(
            ["roc_auc", "f1", "precision", "recall", "accuracy"],
            [roc_auc, f1, precision, recall, accuracy],
        ):
            mean_score[score] = mean_score.get(score, 0) + value_score / N_SPLITS
            print(f"fold: {i + 1} {score}: {value_score}")

        print()
        i += 1
    return pd.DataFrame(mean_score, index=[emb_name])

In [14]:
catboost_embeddings_like_features.append(
    train_emb_like_feature("df_emb_bert-base-uncased.csv")
)

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 1 roc_auc: 0.6432666980969631
fold: 1 f1: 0.4210761738435475
fold: 1 precision: 0.42643616287094543
fold: 1 recall: 0.43831172688425757
fold: 1 accuracy: 0.4692028985507246



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 2 roc_auc: 0.6374180688112957
fold: 2 f1: 0.4525012141816416
fold: 2 precision: 0.4645348252198587
fold: 2 recall: 0.45304118195961135
fold: 2 accuracy: 0.5398550724637681



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 3 roc_auc: 0.6269917403390672
fold: 3 f1: 0.42024182154069184
fold: 3 precision: 0.4189255366586376
fold: 3 recall: 0.42604741295937737
fold: 3 accuracy: 0.4963768115942029



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 4 roc_auc: 0.6194123006015891
fold: 4 f1: 0.41976140544624824
fold: 4 precision: 0.42999712705595056
fold: 4 recall: 0.4168940109606951
fold: 4 accuracy: 0.5054347826086957



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 5 roc_auc: 0.6591621505692555
fold: 5 f1: 0.4540522670929478
fold: 5 precision: 0.4535611974047637
fold: 5 recall: 0.45786424198582276
fold: 5 accuracy: 0.5390199637023594



In [15]:
catboost_embeddings_like_features.append(
    train_emb_like_feature("df_emb_bert-large-uncased.csv")
)

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 1 roc_auc: 0.6400104291438553
fold: 1 f1: 0.43391165944623095
fold: 1 precision: 0.43902307390896117
fold: 1 recall: 0.44245872141854403
fold: 1 accuracy: 0.5018115942028986



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 2 roc_auc: 0.6112954320049145
fold: 2 f1: 0.42669365316785574
fold: 2 precision: 0.43251847144173877
fold: 2 recall: 0.4258377339508108
fold: 2 accuracy: 0.4945652173913043



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 3 roc_auc: 0.6310706140932094
fold: 3 f1: 0.42256532657069695
fold: 3 precision: 0.4432461873638345
fold: 3 recall: 0.42841147273793273
fold: 3 accuracy: 0.5452898550724637



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 4 roc_auc: 0.603694653313976
fold: 4 f1: 0.41200828157349906
fold: 4 precision: 0.4335337411317804
fold: 4 recall: 0.4093195827246759
fold: 4 accuracy: 0.5054347826086957



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 5 roc_auc: 0.6578860783936875
fold: 5 f1: 0.4539149141352324
fold: 5 precision: 0.4521409813558548
fold: 5 recall: 0.46205960061081436
fold: 5 accuracy: 0.515426497277677



In [16]:
catboost_embeddings_like_features.append(
    train_emb_like_feature("df_emb_albert-base-v2.csv")
)

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 1 roc_auc: 0.6077308378580177
fold: 1 f1: 0.4212697720079608
fold: 1 precision: 0.43080120937263794
fold: 1 recall: 0.4496991188238151
fold: 1 accuracy: 0.4438405797101449



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 2 roc_auc: 0.6307857122984947
fold: 2 f1: 0.39467315028958866
fold: 2 precision: 0.3946649612979239
fold: 2 recall: 0.4143248464013438
fold: 2 accuracy: 0.42028985507246375



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 3 roc_auc: 0.6037795256834645
fold: 3 f1: 0.39743591153740326
fold: 3 precision: 0.4003182176157594
fold: 3 recall: 0.4078100357887986
fold: 3 accuracy: 0.5235507246376812



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 4 roc_auc: 0.5955299302372025
fold: 4 f1: 0.3814849771181386
fold: 4 precision: 0.3808351234821823
fold: 4 recall: 0.38474979412233284
fold: 4 accuracy: 0.4311594202898551



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 5 roc_auc: 0.6418276959927279
fold: 5 f1: 0.4050720744918972
fold: 5 precision: 0.42758772785921967
fold: 5 recall: 0.45184867845862947
fold: 5 accuracy: 0.41560798548094374



In [17]:
catboost_embeddings_like_features.append(
    train_emb_like_feature("df_emb_albert-xxlarge-v2.csv")
)

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 1 roc_auc: 0.614525534108263
fold: 1 f1: 0.4029880814587745
fold: 1 precision: 0.4139790250140914
fold: 1 recall: 0.4075662799537117
fold: 1 accuracy: 0.5



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 2 roc_auc: 0.6010783674592666
fold: 2 f1: 0.42131329423364816
fold: 2 precision: 0.4693687720410889
fold: 2 recall: 0.4269912559110387
fold: 2 accuracy: 0.5471014492753623



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 3 roc_auc: 0.5924389812215469
fold: 3 f1: 0.39389921556060775
fold: 3 precision: 0.41325733658708624
fold: 3 recall: 0.4020825314812227
fold: 3 accuracy: 0.5289855072463768



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 4 roc_auc: 0.5814234019143444
fold: 4 f1: 0.38921129287001155
fold: 4 precision: 0.38909754169217675
fold: 4 recall: 0.3906285832250653
fold: 4 accuracy: 0.4746376811594203



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 5 roc_auc: 0.5842833910405982
fold: 5 f1: 0.3982337719098652
fold: 5 precision: 0.4112459214378867
fold: 5 recall: 0.40393761791338784
fold: 5 accuracy: 0.49909255898366606



In [18]:
pd.concat(catboost_embeddings_like_features)

Unnamed: 0,roc_auc,f1,precision,recall,accuracy
df_emb_bert-base-uncased.csv,0.63725,0.433527,0.438691,0.438432,0.509978
df_emb_bert-large-uncased.csv,0.628791,0.429819,0.440092,0.433617,0.512506
df_emb_albert-base-v2.csv,0.615931,0.399987,0.406841,0.421686,0.44689
df_emb_albert-xxlarge-v2.csv,0.59475,0.401129,0.41939,0.406241,0.509963


Если мы используем эмбединки как фичи, то качество становится гораздо хуже, из предаствленных вариантов лучше себя показывает df_emb_bert-base-uncased

## Catboost + embeddings

In [19]:
catboost_embeddings = []

In [20]:
N_SPLITS = 5


def train_emb(emb_name: str) -> pd.DataFrame:
    df = pd.read_csv(f"../data/interim/{emb_name}", low_memory=False)
    X_train = df.iloc[train_index].drop("target", axis=1)
    y_train = df.loc[train_index, "target"]

    X_train["emb"] = X_train.parallel_apply(lambda x: torch.tensor(x), axis=1)

    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
    i = 0
    mean_score = {}
    for train_fold, val_fold in skf.split(X_train, y_train):
        cat = CatBoostClassifier(
            task_type="GPU",
            devices="0",
            iterations=10000,
            early_stopping_rounds=300,
            loss_function="MultiClass",
            l2_leaf_reg=11,
            auto_class_weights="Balanced",
            eval_metric="AUC",
            random_seed=RANDOM_SEED,
            verbose=0,
        )

        train_dataset = Pool(
            data=X_train.iloc[train_fold][["emb"]],
            label=y_train.iloc[train_fold],
            embedding_features=["emb"],
        )

        eval_dataset = Pool(
            data=X_train.iloc[val_fold][["emb"]],
            label=y_train.iloc[val_fold],
            embedding_features=["emb"],
        )

        fit_model = cat.fit(
            train_dataset, eval_set=eval_dataset, use_best_model=True, plot=False
        )

        roc_auc = roc_auc_score(
            y_train.iloc[val_fold],
            fit_model.predict_proba(X_train.iloc[val_fold][["emb"]]),
            average="macro",
            multi_class="ovo",
        )

        f1 = f1_score(
            y_train.iloc[val_fold],
            fit_model.predict(X_train.iloc[val_fold][["emb"]]),
            average="macro",
        )

        precision = precision_score(
            y_train.iloc[val_fold],
            fit_model.predict(X_train.iloc[val_fold][["emb"]]),
            average="macro",
        )

        recall = recall_score(
            y_train.iloc[val_fold],
            fit_model.predict(X_train.iloc[val_fold][["emb"]]),
            average="macro",
        )

        accuracy = accuracy_score(
            y_train.iloc[val_fold], fit_model.predict(X_train.iloc[val_fold][["emb"]])
        )

        for score, value_score in zip(
            ["roc_auc", "f1", "precision", "recall", "accuracy"],
            [roc_auc, f1, precision, recall, accuracy],
        ):
            mean_score[score] = mean_score.get(score, 0) + value_score / N_SPLITS
            print(f"fold: {i + 1} {score}: {value_score}")

        print()
        i += 1
    return pd.DataFrame(mean_score, index=[emb_name])

In [21]:
catboost_embeddings.append(train_emb("df_emb_bert-base-uncased.csv"))

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 1 roc_auc: 0.6225237926592964
fold: 1 f1: 0.3331914043988348
fold: 1 precision: 0.3922991719601889
fold: 1 recall: 0.43739641348735686
fold: 1 accuracy: 0.34601449275362317



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 2 roc_auc: 0.6094749561442094
fold: 2 f1: 0.369132408548467
fold: 2 precision: 0.40891016436873356
fold: 2 recall: 0.45748375662724344
fold: 2 accuracy: 0.37318840579710144



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 3 roc_auc: 0.6143964999070567
fold: 3 f1: 0.33976239548249804
fold: 3 precision: 0.42524000486085795
fold: 3 recall: 0.4445909073023531
fold: 3 accuracy: 0.3442028985507246



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 4 roc_auc: 0.6123581575800006
fold: 4 f1: 0.35614791259044876
fold: 4 precision: 0.42498462354811145
fold: 4 recall: 0.4218211628030479
fold: 4 accuracy: 0.3496376811594203



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 5 roc_auc: 0.6265239819654114
fold: 5 f1: 0.34093595529765747
fold: 5 precision: 0.4057210159797915
fold: 5 recall: 0.4113731908664667
fold: 5 accuracy: 0.3793103448275862



In [22]:
catboost_embeddings.append(train_emb("df_emb_bert-large-uncased.csv"))

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 1 roc_auc: 0.5808990643847368
fold: 1 f1: 0.33364909546291005
fold: 1 precision: 0.37087784343258795
fold: 1 recall: 0.39375334134881085
fold: 1 accuracy: 0.358695652173913



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 2 roc_auc: 0.49122902986404515
fold: 2 f1: 0.25541062590242913
fold: 2 precision: 0.3320244965544111
fold: 2 recall: 0.3262606894964912
fold: 2 accuracy: 0.27717391304347827



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 3 roc_auc: 0.6308935093661603
fold: 3 f1: 0.39733503016380706
fold: 3 precision: 0.42741983417040624
fold: 3 recall: 0.43266153390860146
fold: 3 accuracy: 0.44021739130434784



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 4 roc_auc: 0.571827969294496
fold: 4 f1: 0.33535041555400796
fold: 4 precision: 0.3695696805452903
fold: 4 recall: 0.3728934095959562
fold: 4 accuracy: 0.34601449275362317



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 5 roc_auc: 0.5805760802914204
fold: 5 f1: 0.366998140702131
fold: 5 precision: 0.41742770167427706
fold: 5 recall: 0.3847077374176114
fold: 5 accuracy: 0.40653357531760437



In [23]:
catboost_embeddings.append(train_emb("df_emb_albert-base-v2.csv"))

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 1 roc_auc: 0.5331844905773023
fold: 1 f1: 0.2687988436310035
fold: 1 precision: 0.412758120916622
fold: 1 recall: 0.3205034562047391
fold: 1 accuracy: 0.302536231884058



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 2 roc_auc: 0.5769732849998533
fold: 2 f1: 0.34721796578663117
fold: 2 precision: 0.387513609480924
fold: 2 recall: 0.41691010108073595
fold: 2 accuracy: 0.3695652173913043



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 3 roc_auc: 0.6020786192721
fold: 3 f1: 0.36896674582327066
fold: 3 precision: 0.4214737590762683
fold: 3 recall: 0.4481633045035935
fold: 3 accuracy: 0.38405797101449274



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 4 roc_auc: 0.5921936274784791
fold: 4 f1: 0.3491007640555506
fold: 4 precision: 0.4049311863506637
fold: 4 recall: 0.39364819713992727
fold: 4 accuracy: 0.3967391304347826



Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


fold: 5 roc_auc: 0.5813164910866687
fold: 5 f1: 0.3871726063794944
fold: 5 precision: 0.3951728289262723
fold: 5 recall: 0.3992748225007127
fold: 5 accuracy: 0.4355716878402904



In [26]:
catboost_embeddings = pd.concat(catboost_embeddings)
catboost_embeddings

Unnamed: 0,roc_auc,f1,precision,recall,accuracy
df_emb_bert-base-uncased.csv,0.617055,0.347834,0.411431,0.434533,0.358471
df_emb_bert-large-uncased.csv,0.571085,0.337749,0.383464,0.382055,0.365727
df_emb_albert-base-v2.csv,0.577149,0.344251,0.40437,0.3957,0.377694


## Catboost + text_feateres + emb + optuna

In [29]:
train_index = y_train.index
test_index = y_test.index

In [32]:
df_text = pd.read_csv("../data/interim/df_preprocessed_text.csv")
df_text = df_text.iloc[train_index][["stop_stremm"]]

df_emb = pd.read_csv("../data/interim/df_emb_bert-base-uncased.csv")
emb_len = len(df_emb) - 1
df_emb = df_emb.iloc[train_index]

df = df_emb.join(df_text)
df["emb"] = df[[str(i) for i in range(768)]].parallel_apply(
    lambda x: torch.Tensor(x), axis=1
)

X_train_emb_text = df[["stop_stremm", "emb"]]
y_train_emb_text = df["target"]

In [33]:
def fit_catboost(param, train_data, valid_data):
    X_train, y_train = train_data[0], train_data[1]
    X_val, y_val = valid_data[0], valid_data[1]

    train_dataset = Pool(
        data=X_train,
        label=y_train,
        embedding_features=["emb"],
        text_features=["stop_stremm"],
    )

    eval_dataset = Pool(
        data=X_val,
        label=y_val,
        embedding_features=["emb"],
        text_features=["stop_stremm"],
    )

    clf = CatBoostClassifier(**param, thread_count=-1, random_seed=RANDOM_SEED)

    clf.fit(
        train_dataset,
        eval_set=eval_dataset,
        verbose=0,
        plot=False,
        early_stopping_rounds=300,
    )

    proba = clf.predict_proba(eval_dataset)
    pred = clf.predict(eval_dataset)

    roc_auc = roc_auc_score(
        y_val,
        proba,
        average="macro",
        multi_class="ovo",
    )

    f1 = f1_score(
        y_val,
        pred,
        average="macro",
    )

    precision = precision_score(
        y_val,
        pred,
        average="macro",
    )

    recall = recall_score(
        y_val,
        pred,
        average="macro",
    )

    accuracy = accuracy_score(
            y_val, pred
        )

    return {
        'model':clf,
        'roc_auc':roc_auc,
        'f1':f1,
        'precision':precision,
        'recall':recall,
        'accuracy':accuracy
    }

In [36]:
best_score = float('-inf')
best_models = None
mean_sore_list = []

def objective(trial):
    global best_score, best_models, mean_sore_list

    n_splits=5

    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.01),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 50),
        "auto_class_weights": trial.suggest_categorical(
            "auto_class_weights", ["SqrtBalanced", "Balanced", "None"]
        ),
        "depth": trial.suggest_int("depth", 4, 9),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli"]
        ),
        "eval_metric": "TotalF1",
        "task_type": "GPU",
        "devices": "0",
        "iterations": 10000,
        "early_stopping_rounds": 300,
        "loss_function": "MultiClass",
        "verbose": 0,
        'border_count':254

    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

    list_models = []
    list_roc_auc = []
    list_f1 = []
    list_precision = []
    list_recall = []
    list_accuracy = []
    mean_score = {}
    
    for train_idx, valid_idx in skf.split(X_train_emb_text, y_train_emb_text):
        train_data = X_train_emb_text.iloc[train_idx, :], y_train_emb_text.iloc[train_idx]
        valid_data = X_train_emb_text.iloc[valid_idx, :], y_train_emb_text.iloc[valid_idx]
        
        # Подаем trials для перебора
        result = fit_catboost(param, train_data, valid_data) # Определили выше
        list_models.append(result['model'])
        list_roc_auc.append(result['roc_auc'])
        list_f1.append(result['f1'])
        list_precision.append(result['precision'])
        list_recall.append(result['recall'])
        list_accuracy.append(result['accuracy'])
        
    for score, value_score in zip(
        ["roc_auc", "f1", "precision", "recall", "accuracy"],
        [list_roc_auc, list_f1, list_precision, list_recall, list_accuracy],
    ):
        mean_score[score] = np.mean(value_score)

    result = np.mean(list_f1)
    
    if best_score < result:
        best_score = result
        best_models = list_models
    mean_sore_list.append(param | mean_score)

    return result

In [37]:
study = optuna.create_study(direction="maximize")
study.optimize(objective,
               n_trials=100,
               show_progress_bar=True,
               gc_after_trial=True
               )

[I 2023-07-02 17:15:44,352] A new study created in memory with name: no-name-ee43aa1d-87d4-46c7-a3ba-6f91adea2fd7


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-07-02 17:22:05,123] Trial 0 finished with value: 0.48620627053160836 and parameters: {'learning_rate': 0.0027985505970652197, 'l2_leaf_reg': 30, 'auto_class_weights': 'Balanced', 'depth': 8, 'bootstrap_type': 'Bernoulli', 'subsample': 0.8653666283902238}. Best is trial 0 with value: 0.48620627053160836.
[I 2023-07-02 17:28:29,246] Trial 1 finished with value: 0.4395744090388295 and parameters: {'learning_rate': 0.0019775336508847828, 'l2_leaf_reg': 10, 'auto_class_weights': 'Balanced', 'depth': 4, 'bootstrap_type': 'Bernoulli', 'subsample': 0.14280437288193418}. Best is trial 0 with value: 0.48620627053160836.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-07-02 17:34:45,599] Trial 2 finished with value: 0.4324442711492581 and parameters: {'learning_rate': 0.002933811477679544, 'l2_leaf_reg': 29, 'auto_class_weights': 'SqrtBalanced', 'depth': 5, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.403720051275682}. Best is trial 0 with value: 0.48620627053160836.
[I 2023-07-02 17:41:39,316] Trial 3 finished with value: 0.5283875275719636 and parameters: {'learning_rate': 0.008616477236770672, 'l2_leaf_reg': 24, 'auto_class_weights': 'SqrtBalanced', 'depth': 9, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.5278745322314726}. Best is trial 3 with value: 0.5283875275719636.
[I 2023-07-02 17:48:15,379] Trial 4 finished with value: 0.4675812708430125 and parameters: {'learning_rate': 0.0069412674308764345, 'l2_leaf_reg': 18, 'auto_class_weights': 'SqrtBalanced', 'depth': 8, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.6030318799457706}. Best is trial 3 with value: 0.5283875275719636.
[I 2023-07-02 17:54:53,611] Tr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-07-02 18:26:52,007] Trial 10 finished with value: 0.4259909795270601 and parameters: {'learning_rate': 0.00977871633278937, 'l2_leaf_reg': 50, 'auto_class_weights': 'None', 'depth': 6, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.01525582392222058}. Best is trial 3 with value: 0.5283875275719636.
[I 2023-07-02 18:33:14,196] Trial 11 finished with value: 0.517406313265353 and parameters: {'learning_rate': 0.008466578766146178, 'l2_leaf_reg': 40, 'auto_class_weights': 'SqrtBalanced', 'depth': 6, 'bootstrap_type': 'Bernoulli', 'subsample': 0.5481307907970315}. Best is trial 3 with value: 0.5283875275719636.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-07-02 18:39:35,055] Trial 12 finished with value: 0.42221484755229965 and parameters: {'learning_rate': 0.008089461804326948, 'l2_leaf_reg': 39, 'auto_class_weights': 'SqrtBalanced', 'depth': 5, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 9.641676736912043}. Best is trial 3 with value: 0.5283875275719636.
[I 2023-07-02 18:45:57,172] Trial 13 finished with value: 0.5308047187750784 and parameters: {'learning_rate': 0.009911457627308193, 'l2_leaf_reg': 18, 'auto_class_weights': 'SqrtBalanced', 'depth': 7, 'bootstrap_type': 'Bernoulli', 'subsample': 0.9327556873306312}. Best is trial 13 with value: 0.5308047187750784.
[I 2023-07-02 18:52:29,843] Trial 14 finished with value: 0.5270763605738733 and parameters: {'learning_rate': 0.009942189824389587, 'l2_leaf_reg': 2, 'auto_class_weights': 'SqrtBalanced', 'depth': 9, 'bootstrap_type': 'Bernoulli', 'subsample': 0.9557307446856258}. Best is trial 13 with value: 0.5308047187750784.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-07-02 18:58:51,628] Trial 15 finished with value: 0.4164306197555547 and parameters: {'learning_rate': 0.00915903749329153, 'l2_leaf_reg': 18, 'auto_class_weights': 'None', 'depth': 7, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.6525622558152584}. Best is trial 13 with value: 0.5308047187750784.
[I 2023-07-02 19:05:15,630] Trial 16 finished with value: 0.5013780269865648 and parameters: {'learning_rate': 0.00736175782622169, 'l2_leaf_reg': 25, 'auto_class_weights': 'SqrtBalanced', 'depth': 7, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.7027216774214238}. Best is trial 13 with value: 0.5308047187750784.
[I 2023-07-02 19:11:42,504] Trial 17 finished with value: 0.5279457371484637 and parameters: {'learning_rate': 0.008704118819537166, 'l2_leaf_reg': 15, 'auto_class_weights': 'SqrtBalanced', 'depth': 9, 'bootstrap_type': 'Bernoulli', 'subsample': 0.9081723932011774}. Best is trial 13 with value: 0.5308047187750784.


## BERT 

In [4]:
model_name = "bert-base-uncased" 

bert_classifier = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3).to('cuda:0')
bert_tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [5]:
class GroupsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)


In [6]:
label2index = {name:index for index, name in enumerate(y_train.unique())}

In [7]:
y_train_index = y_train.map(label2index)
y_valid_index = y_test.map(label2index)

In [8]:
def make_datasets(X_train, X_val, y_train, y_val, columns):
    X_train = X_train[columns].tolist()
    X_val = X_val[columns].tolist()
    y_train = y_train.tolist()
    y_val = y_val.tolist()

    train_encodings = bert_tokenizer(X_train, truncation=True, padding=True)
    val_encodings = bert_tokenizer(X_val, truncation=True, padding=True)
    train_dataset = GroupsDataset(train_encodings, y_train)
    valid_dataset = GroupsDataset(val_encodings, y_val)
    return train_dataset, valid_dataset

In [9]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    f1 = f1_score(labels, preds, average='macro')
    return {
            'f1 macro': f1
            }

In [10]:
train_dataset, valid_dataset = make_datasets(X_train, X_test, y_train_index, y_valid_index, 'text')

In [11]:
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=5e-5,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=250,
    weight_decay=0.01,
    #logging_dir='./logs',
    load_best_model_at_end=True,
    logging_steps=250,
    save_steps=250,
    fp16=True,
    evaluation_strategy="steps")

trainer = Trainer(
    model=bert_classifier.to('cuda:0'),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

In [12]:
trainer.train()

    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Step,Training Loss,Validation Loss,F1 macro
250,0.9619,0.891217,0.397461
500,0.6377,1.353188,0.484726
750,0.2107,2.059117,0.473366


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


TrainOutput(global_step=870, training_loss=0.53196613048685, metrics={'train_runtime': 888.9212, 'train_samples_per_second': 31.038, 'train_steps_per_second': 0.979, 'total_flos': 7259299195115520.0, 'train_loss': 0.53196613048685, 'epoch': 10.0})