In [21]:
import numpy as np
import pandas as pd 

from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, auc
import catboost
import optuna
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from pandarallel import pandarallel

# Global seed shared by the models in this notebook.
RANDOM_SEED = 121

# NLTK resources: `punkt` backs the tokenizers imported above, and the
# `stopwords` corpus must be present before `stopwords.words(...)` is called,
# otherwise a fresh environment fails with LookupError.
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

# Enables the `parallel_apply` family on pandas objects; progress bars are off.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


[nltk_data] Downloading package punkt to /home/gleb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## CatBoost + text_features

In [19]:
# Load the preprocessed text variants produced by an earlier pipeline step.
# NOTE(review): the preview below contains an `Unnamed: 0` column, presumably
# an index written by a previous `to_csv` call — confirm, and consider
# `index_col=0` if it is not needed as a feature.
df = pd.read_csv(filepath_or_buffer='../data/interim/df_preprocessed_text.csv')

# Peek at one random row (unseeded, so the row shown differs between runs).
df.sample(n=1)

Unnamed: 0,name,file_name,num_file,level,text,stemmer,lemm,no_stop,stop_stremm,stop_lemm
1206,Lifeforce,Lifeforce.Eng.srt,False,middle,"August 9, 2:30 p.m.Greenwich Mean Time. The H....","august 9 , 2:30 p.m.greenwich mean time . the ...","August 9 , 2:30 p.m.Greenwich Mean Time . The ...","August 9 , 2:30 p.m.Greenwich Mean Time . The ...","august 9 , 2:30 p.m.greenwich mean time . the ...","August 9, 2:30 p.m.Greenwich Mean Time. The H...."


In [20]:
# Hold out 20% of the rows as a test set.
# NOTE(review): the split uses random_state=0 rather than RANDOM_SEED and is
# not stratified by `level` — confirm this is intentional.
train_df, test_df = train_test_split(df, train_size=0.8, random_state=0)

# Separate the target column (`level`) from the remaining columns.
X_train = train_df.drop(columns=['level'])
y_train = train_df['level']
X_test = test_df.drop(columns=['level'])
y_test = test_df['level']

In [None]:
N_SPLITS = 5

def train_cv(cols: "str | list[str]"):
    """Train one CatBoost text classifier per K-fold split and average test predictions.

    Uses the notebook-level `X_train`, `y_train`, `X_test` and `RANDOM_SEED`.
    For every fold a model is fitted on the training part, early-stopped on the
    validation part, scored with per-class F1 on the validation part (printed),
    and used to predict class probabilities for `X_test`.

    Parameters
    ----------
    cols : str or list of str
        Name(s) of the text column(s) of `X_train` / `X_test` to use as
        CatBoost text features.

    Returns
    -------
    pandas.DataFrame
        One row per `X_test` row (same index), one column per class found in
        `y_train`, holding the class probabilities averaged over the folds.

    Raises
    ------
    ValueError
        If `cols` is empty.
    """
    # A single column name and a list of names are both accepted; CatBoost's
    # `text_features` and the DataFrame selection below need a list.
    text_cols = [cols] if isinstance(cols, str) else list(cols)
    if not text_cols:
        raise ValueError("cols must name at least one text column")

    # Fixed class order so that per-fold probabilities can be summed even if a
    # fold's model happens to see the classes in a different order.
    class_labels = np.unique(y_train)

    kf = KFold(n_splits=N_SPLITS)
    test_pool = Pool(data=X_test[text_cols], text_features=text_cols)
    cat_text_preds = pd.DataFrame(0.0, index=X_test.index, columns=class_labels)

    for fold_no, (train_fold, val_fold) in enumerate(kf.split(X_train), start=1):
        # MultiClass / TotalF1 mirror the settings used by `fit_model_cv` on the
        # same target; the target holds string categories (e.g. 'middle'), so
        # binary-only settings (Logloss / F1 / pos_label=0|1) do not apply.
        # NOTE(review): assumes MultiClass is also acceptable if `level` turns
        # out to have only two classes — confirm.
        cat = CatBoostClassifier(task_type="GPU",
                                 devices='0',
                                 iterations=10000,
                                 early_stopping_rounds=100,
                                 loss_function='MultiClass',
                                 eval_metric="TotalF1",
                                 random_seed=RANDOM_SEED,
                                 verbose=0)

        # The target lives in `y_train`; `X_train` has no label column.
        y_fold_train = y_train.iloc[train_fold]
        y_fold_val = y_train.iloc[val_fold]

        train_dataset = Pool(data=X_train.iloc[train_fold][text_cols],
                             label=y_fold_train,
                             text_features=text_cols)

        eval_dataset = Pool(data=X_train.iloc[val_fold][text_cols],
                            label=y_fold_val,
                            text_features=text_cols)

        fit_model = cat.fit(train_dataset,
                            eval_set=eval_dataset,
                            use_best_model=True,
                            plot=False)

        # Align this fold's probability columns to the fixed class order, then
        # add the fold's share of the average (positional add, row order = X_test).
        fold_proba = pd.DataFrame(fit_model.predict_proba(test_pool),
                                  columns=fit_model.classes_)
        fold_proba = fold_proba.reindex(columns=class_labels, fill_value=0.0)
        cat_text_preds += fold_proba.to_numpy() / N_SPLITS

        # Predict the validation part once; flatten because multiclass
        # predictions may come back as a column vector.
        val_pred = np.asarray(fit_model.predict(eval_dataset)).ravel()
        per_class_f1 = f1_score(y_fold_val, val_pred, labels=class_labels, average=None)

        for class_label, class_f1 in zip(class_labels, per_class_f1):
            print(f'fold: {fold_no} f1 {class_label} score: {class_f1}')
        print()

    return cat_text_preds

In [15]:
def fit_model_cv(X_train, y_train, text_type):
    """Run 5-fold stratified CatBoost cross-validation on a single text column.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame containing the column named by `text_type`.
    y_train : array-like
        Target labels passed to the CatBoost `Pool`.
    text_type : str
        Name of the column of `X_train` used as the only (text) feature.

    Returns
    -------
    None
        The maximum of the mean test TotalF1 curve and its position are printed.
    """
    cv_params = {
        'iterations': 10000,
        'learning_rate': 0.05,
        'eval_metric': 'TotalF1',
        'task_type': 'GPU',
        'loss_function': 'MultiClass',
        'devices': [0, 1],
        'auto_class_weights': 'Balanced',
        'l2_leaf_reg': 11,
        'random_seed': 1,
    }

    # Only the selected text column is handed to CatBoost.
    text_pool = Pool(data=X_train[[text_type]], label=y_train, text_features=[text_type])

    cv_output = cv(
        pool=text_pool,
        params=cv_params,
        fold_count=5,
        stratified=True,
        early_stopping_rounds=200,
        return_models=True,
        verbose=200,
    )

    # With return_models=True the per-iteration metrics table is element 0.
    mean_test_f1 = cv_output[0]['test-TotalF1-mean']

    # Printed labels are Russian for "Best metric" / "Best iteration".
    print("Лучшая метрика:", mean_test_f1.max())
    print("Лучшая итерация:", np.argmax(mean_test_f1))

In [16]:
# Cross-validate on the `text` column.
fit_model_cv(X_train, y_train, text_type='text')

Training on fold [0/5]
0:	learn: 0.5099637	test: 0.4585734	best: 0.4585734 (0)	total: 16ms	remaining: 2m 40s
200:	learn: 0.8899296	test: 0.5686095	best: 0.5697143 (194)	total: 2.27s	remaining: 1m 50s
400:	learn: 0.9616037	test: 0.5501817	best: 0.5724553 (286)	total: 4.47s	remaining: 1m 47s
bestTest = 0.5724553017
bestIteration = 286
Training on fold [1/5]
0:	learn: 0.5453920	test: 0.5439223	best: 0.5439223 (0)	total: 17.1ms	remaining: 2m 51s
200:	learn: 0.8862911	test: 0.5654005	best: 0.5854475 (168)	total: 2.25s	remaining: 1m 49s
bestTest = 0.5854474593
bestIteration = 168
Training on fold [2/5]
0:	learn: 0.5190278	test: 0.4806253	best: 0.4806253 (0)	total: 17.5ms	remaining: 2m 54s
200:	learn: 0.8871758	test: 0.5308642	best: 0.5438507 (175)	total: 2.28s	remaining: 1m 51s
bestTest = 0.5438507278
bestIteration = 175
Training on fold [3/5]
0:	learn: 0.5297759	test: 0.5171473	best: 0.5171473 (0)	total: 18.4ms	remaining: 3m 3s
200:	learn: 0.8898671	test: 0.5434841	best: 0.5767220 (83)	tota

In [13]:
# Cross-validate on the `no_stop` column.
# NOTE(review): the metrics logged for this run match the `lemm` run digit for
# digit — the two columns may hold identical text; verify the preprocessing.
fit_model_cv(X_train, y_train, text_type='no_stop')

Training on fold [0/5]
0:	learn: 0.5539891	test: 0.5028208	best: 0.5028208 (0)	total: 18.5ms	remaining: 3m 4s
200:	learn: 0.8776290	test: 0.5369169	best: 0.5479881 (148)	total: 2.28s	remaining: 1m 51s
bestTest = 0.5479880834
bestIteration = 148
Training on fold [1/5]
0:	learn: 0.5536109	test: 0.5056391	best: 0.5056391 (0)	total: 17.5ms	remaining: 2m 54s
200:	learn: 0.8813434	test: 0.5646822	best: 0.5831916 (105)	total: 2.31s	remaining: 1m 52s
bestTest = 0.5831915905
bestIteration = 105
Training on fold [2/5]
0:	learn: 0.5452097	test: 0.4567324	best: 0.4567324 (0)	total: 17.3ms	remaining: 2m 52s
200:	learn: 0.8820308	test: 0.5255613	best: 0.5308645 (124)	total: 2.26s	remaining: 1m 50s
400:	learn: 0.9636607	test: 0.5468354	best: 0.5526283 (383)	total: 4.5s	remaining: 1m 47s
bestTest = 0.5526283272
bestIteration = 383
Training on fold [3/5]
0:	learn: 0.5237357	test: 0.5272818	best: 0.5272818 (0)	total: 16.8ms	remaining: 2m 48s
200:	learn: 0.8788244	test: 0.5516576	best: 0.5867393 (2)	tota

In [14]:
# Cross-validate on the `stemmer` column.
fit_model_cv(X_train, y_train, text_type='stemmer')

Training on fold [0/5]
0:	learn: 0.5771473	test: 0.4978142	best: 0.4978142 (0)	total: 18.3ms	remaining: 3m 2s
200:	learn: 0.8612893	test: 0.5209445	best: 0.5447671 (19)	total: 2.33s	remaining: 1m 53s
bestTest = 0.5447671046
bestIteration = 19
Training on fold [1/5]
0:	learn: 0.5489092	test: 0.5164864	best: 0.5164864 (0)	total: 18.3ms	remaining: 3m 3s
200:	learn: 0.8657672	test: 0.5386677	best: 0.5815575 (80)	total: 2.32s	remaining: 1m 53s
bestTest = 0.5815575184
bestIteration = 80
Training on fold [2/5]
0:	learn: 0.5410596	test: 0.4889278	best: 0.4889278 (0)	total: 18.5ms	remaining: 3m 5s
200:	learn: 0.8621097	test: 0.5233212	best: 0.5399565 (109)	total: 2.27s	remaining: 1m 50s
bestTest = 0.5399565447
bestIteration = 109
Training on fold [3/5]
0:	learn: 0.5592966	test: 0.5464583	best: 0.5464583 (0)	total: 17.3ms	remaining: 2m 52s
200:	learn: 0.8703126	test: 0.5601667	best: 0.6076001 (55)	total: 2.35s	remaining: 1m 54s
bestTest = 0.6076000659
bestIteration = 55
Training on fold [4/5]
0:

In [15]:
# Cross-validate on the `lemm` column.
# NOTE(review): the metrics logged for this run match the `no_stop` run digit
# for digit — the two columns may hold identical text; verify the preprocessing.
fit_model_cv(X_train, y_train, text_type='lemm')

Training on fold [0/5]
0:	learn: 0.5539891	test: 0.5028208	best: 0.5028208 (0)	total: 17.9ms	remaining: 2m 58s
200:	learn: 0.8776290	test: 0.5369169	best: 0.5479881 (148)	total: 2.37s	remaining: 1m 55s
bestTest = 0.5479880834
bestIteration = 148
Training on fold [1/5]
0:	learn: 0.5536109	test: 0.5056391	best: 0.5056391 (0)	total: 18.2ms	remaining: 3m 1s
200:	learn: 0.8813434	test: 0.5646822	best: 0.5831916 (105)	total: 2.25s	remaining: 1m 49s
bestTest = 0.5831915905
bestIteration = 105
Training on fold [2/5]
0:	learn: 0.5452097	test: 0.4567324	best: 0.4567324 (0)	total: 18.4ms	remaining: 3m 4s
200:	learn: 0.8820308	test: 0.5255613	best: 0.5308645 (124)	total: 2.27s	remaining: 1m 50s
400:	learn: 0.9636607	test: 0.5468354	best: 0.5526283 (383)	total: 4.46s	remaining: 1m 46s
bestTest = 0.5526283272
bestIteration = 383
Training on fold [3/5]
0:	learn: 0.5237357	test: 0.5272818	best: 0.5272818 (0)	total: 16.2ms	remaining: 2m 42s
200:	learn: 0.8788244	test: 0.5516576	best: 0.5867393 (2)	tota

In [16]:
# Cross-validate on the `stop_lemm` column.
# NOTE(review): the metrics logged for this run match the raw `text` run digit
# for digit, and the sampled row shows the same preview for both columns —
# `stop_lemm` may be an unprocessed copy of `text`; verify the preprocessing.
fit_model_cv(X_train, y_train, text_type='stop_lemm')

Training on fold [0/5]
0:	learn: 0.5099637	test: 0.4585734	best: 0.4585734 (0)	total: 16ms	remaining: 2m 40s
200:	learn: 0.8899296	test: 0.5686095	best: 0.5697143 (194)	total: 2.3s	remaining: 1m 52s
400:	learn: 0.9616037	test: 0.5501817	best: 0.5724553 (286)	total: 4.51s	remaining: 1m 47s
bestTest = 0.5724553017
bestIteration = 286
Training on fold [1/5]
0:	learn: 0.5453920	test: 0.5439223	best: 0.5439223 (0)	total: 16ms	remaining: 2m 39s
200:	learn: 0.8862911	test: 0.5654005	best: 0.5854475 (168)	total: 2.23s	remaining: 1m 48s
bestTest = 0.5854474593
bestIteration = 168
Training on fold [2/5]
0:	learn: 0.5190278	test: 0.4806253	best: 0.4806253 (0)	total: 17.9ms	remaining: 2m 58s
200:	learn: 0.8871758	test: 0.5308642	best: 0.5438507 (175)	total: 2.28s	remaining: 1m 51s
bestTest = 0.5438507278
bestIteration = 175
Training on fold [3/5]
0:	learn: 0.5297759	test: 0.5171473	best: 0.5171473 (0)	total: 16.3ms	remaining: 2m 43s
200:	learn: 0.8898671	test: 0.5434841	best: 0.5767220 (83)	total: