In [20]:
import pandas as pd
import torch
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=False)

RANDOM_SEED = 1

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [10]:
def train_model(emb_name:str):
    df = pd.read_csv(f"../data/interim/{emb_name}", low_memory=False)
    print(df.shape)
    X = df.drop("target", axis=1)
    y = df["target"]
    model = CatBoostClassifier(
        iterations=30000,
        verbose=False,
        task_type="GPU",
        devices="0",
        # boosting_type="Ordered",
        leaf_estimation_method="Newton",
        auto_class_weights="Balanced",
        eval_metric='Accuracy',
        early_stopping_rounds=500, 
        random_seed=RANDOM_SEED
    )

    param = {"l2_leaf_reg": [0.5, 1, 10], "depth": [6, 8, 9], "learning_rate": [0.03, 0.003]}

    grid_search = model.grid_search(
        param,
        X=X,
        y=y,
        train_size=0.8,
        refit=True,
        cv=5,
        calc_cv_statistics=True,
        verbose=False,
        plot=True,
    )
    return grid_search


In [11]:
df = pd.read_csv(f"../data/interim/df_emb_bert-base-uncased.csv", low_memory=False)
df.target.value_counts()

target
middle    1807
easy      1081
hard       561
Name: count, dtype: int64

In [12]:
base = train_model('df_emb_bert-base-uncased.csv')

(3449, 769)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

bestTest = 0.5085964856
bestIteration = 17
bestTest = 0.4877203446
bestIteration = 376
bestTest = 0.5058971986
bestIteration = 17
bestTest = 0.487718368
bestIteration = 86
bestTest = 0.4866933905
bestIteration = 16
bestTest = 0.4937948855
bestIteration = 65
bestTest = 0.465088013
bestIteration = 37
bestTest = 0.4635723698
bestIteration = 14
bestTest = 0.4680884521
bestIteration = 42
bestTest = 0.4768418868
bestIteration = 52
bestTest = 0.4869152094
bestIteration = 29
bestTest = 0.4978109378
bestIteration = 422
bestTest = 0.4602860727
bestIteration = 18
bestTest = 0.477709344
bestIteration = 40
bestTest = 0.4637446129
bestIteration = 5
bestTest = 0.4737315266
bestIteration = 18
bestTest = 0.4909743365
bestIteration = 20
bestTest = 0.4984888812
bestIteration = 55
Training on fold [0/5]
bestTest = 0.4704474518
bestIteration = 11
Training on fold [1/5]
bestTest = 0.4810749449
bestIteration = 123
Training on fold [2/5]
bestTest = 0.4576080183
bestIteration = 53
Training on fold [3/5]
bestTe

In [31]:
large = train_model('df_emb_bert-large-uncased.csv')


(3449, 1025)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

bestTest = 0.4530180671
bestIteration = 63
bestTest = 0.4381220224
bestIteration = 95
bestTest = 0.4336459355
bestIteration = 51
bestTest = 0.4478252153
bestIteration = 425
bestTest = 0.4204772386
bestIteration = 34
bestTest = 0.4260091424
bestIteration = 244
bestTest = 0.4123062331
bestIteration = 24
bestTest = 0.4320136654
bestIteration = 235
bestTest = 0.3949464073
bestIteration = 12
bestTest = 0.4211811799
bestIteration = 42
bestTest = 0.3948462805
bestIteration = 33
bestTest = 0.4213792596
bestIteration = 41
Training on fold [0/5]
bestTest = 0.4503714744
bestIteration = 62
Training on fold [1/5]
bestTest = 0.4321915009
bestIteration = 16
Training on fold [2/5]
bestTest = 0.4034390126
bestIteration = 215
Training on fold [3/5]
bestTest = 0.4634365006
bestIteration = 81
Training on fold [4/5]
bestTest = 0.4522573596
bestIteration = 65


In [32]:
albert_base = train_model('df_emb_albert-base-v2.csv')

(3449, 769)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

bestTest = 0.4621538919
bestIteration = 43
bestTest = 0.4523022184
bestIteration = 74
bestTest = 0.4577272897
bestIteration = 41
bestTest = 0.4531980622
bestIteration = 23
bestTest = 0.4437973607
bestIteration = 12
bestTest = 0.4551317073
bestIteration = 13
bestTest = 0.4531538148
bestIteration = 16
bestTest = 0.4534202544
bestIteration = 23
bestTest = 0.4257848697
bestIteration = 17
bestTest = 0.4387333952
bestIteration = 23
bestTest = 0.4289673544
bestIteration = 10
bestTest = 0.4453173155
bestIteration = 23
Training on fold [0/5]
bestTest = 0.4409221855
bestIteration = 67
Training on fold [1/5]
bestTest = 0.4431290802
bestIteration = 39
Training on fold [2/5]
bestTest = 0.4713674172
bestIteration = 57
Training on fold [3/5]
bestTest = 0.4431429293
bestIteration = 50
Training on fold [4/5]
bestTest = 0.4557552873
bestIteration = 107


In [33]:
albert_xxlarge = train_model('df_emb_albert-xxlarge-v2.csv')

(3449, 4097)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

bestTest = 0.4285550686
bestIteration = 18
bestTest = 0.4382061337
bestIteration = 32
bestTest = 0.4248071368
bestIteration = 18
bestTest = 0.4146908557
bestIteration = 33
bestTest = 0.4132280506
bestIteration = 8
bestTest = 0.4044943926
bestIteration = 16
bestTest = 0.4098773492
bestIteration = 17
bestTest = 0.4011835797
bestIteration = 17
bestTest = 0.3878069612
bestIteration = 7
bestTest = 0.3898405948
bestIteration = 26
bestTest = 0.4034910409
bestIteration = 7
bestTest = 0.3978894943
bestIteration = 26
Training on fold [0/5]
bestTest = 0.4427151737
bestIteration = 1003
Training on fold [1/5]
bestTest = 0.4126752995
bestIteration = 23
Training on fold [2/5]
bestTest = 0.4085781496
bestIteration = 102
Training on fold [3/5]
bestTest = 0.4377332806
bestIteration = 87
Training on fold [4/5]
bestTest = 0.4627405311
bestIteration = 112


## Используем эмбединги как эмбединги

In [13]:
df = pd.read_csv(f"../data/interim/df_emb_bert-base-uncased.csv", low_memory=False)
df.target.value_counts()

target
middle    1807
easy      1081
hard       561
Name: count, dtype: int64

In [21]:
df = pd.read_csv("../data/interim/df_emb_bert-base-uncased.csv", low_memory=False)
print(df.shape)
X = df.drop('target', axis=1)
X['emb'] = X.parallel_apply(lambda x: torch.tensor(x), axis=1)

(3449, 769)


In [22]:
def train_model(emb_name:str):
    df = pd.read_csv(f"../data/interim/{emb_name}", low_memory=False)
    X = df.drop('target', axis=1)
    X['emb'] = X.parallel_apply(lambda x: torch.tensor(x), axis=1)
    y = df["target"]

    data = Pool(data=X, 
                label=y, 
                embedding_features=['emb'])

    model = CatBoostClassifier(
        iterations=30000,
        verbose=False,
        task_type="GPU",
        devices=["0", "1"],
        # boosting_type="Ordered",
        leaf_estimation_method="Newton",
        auto_class_weights="Balanced",
        eval_metric='Accuracy',
        early_stopping_rounds=300, 
        random_seed=RANDOM_SEED
    )

    param = {"l2_leaf_reg": [0.5, 1, 10], "depth": [6, 8, 9], "learning_rate": [0.03, 0.003]}

    grid_search = model.grid_search(
        param,
        data,
        train_size=0.8,
        refit=True,
        cv=5,
        calc_cv_statistics=True,
        verbose=False,
        plot=True,
    )
    return grid_search

In [23]:
train_model('df_emb_bert-base-uncased.csv')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

bestTest = 0.4900162054
bestIteration = 100
bestTest = 0.4822518017
bestIteration = 82
bestTest = 0.4667170118
bestIteration = 28
bestTest = 0.4821285631
bestIteration = 154
bestTest = 0.4851309788
bestIteration = 103
bestTest = 0.482335088
bestIteration = 538
bestTest = 0.4563082554
bestIteration = 20
bestTest = 0.4796397542
bestIteration = 93
bestTest = 0.464014031
bestIteration = 24
bestTest = 0.4783402595
bestIteration = 81
bestTest = 0.4755596627
bestIteration = 83
bestTest = 0.4835246616
bestIteration = 318
bestTest = 0.4499059277
bestIteration = 55
bestTest = 0.470615496
bestIteration = 44
bestTest = 0.4596004823
bestIteration = 36
bestTest = 0.4837121987
bestIteration = 80
bestTest = 0.4851539197
bestIteration = 80
bestTest = 0.4901978127
bestIteration = 44
Training on fold [0/5]
bestTest = 0.4775716419
bestIteration = 92
Training on fold [1/5]
bestTest = 0.4498662626
bestIteration = 337
Training on fold [2/5]
bestTest = 0.4564628952
bestIteration = 180
Training on fold [3/5]
b

{'params': {'depth': 9, 'l2_leaf_reg': 10, 'learning_rate': 0.003},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,
               46,
  