In [2]:
import os
import pandas as pd
import numpy as np

import catboost
import optuna

RANDOM_SEED = 121

In [3]:
# Load the raw IMDB reviews and replace the textual 'sentiment' column with a
# binary 'label' column: 1 where sentiment == 'positive', 0 otherwise.
df = (
    pd.read_csv('data/raw/imdb.csv')
    .assign(label=lambda reviews: (reviews['sentiment'] == 'positive').astype(int))
    .drop(columns=['sentiment'])
)
df.head()

Unnamed: 0,review,label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [4]:
from catboost import Pool
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; 'label' is the target and every
# remaining column is handed to CatBoost as a feature.
# NOTE(review): RANDOM_SEED is defined at the top of the notebook, but this
# split uses random_state=0 -- confirm which seed is intended.
train_df, test_df = train_test_split(df, train_size=0.8, random_state=0)

X_train = train_df.drop(columns=['label'])
y_train = train_df['label']
X_test = test_df.drop(columns=['label'])
y_test = test_df['label']

# The 'review' column is declared as a text feature when building the pools.
train_pool = Pool(data=X_train, label=y_train, text_features=['review'])
test_pool = Pool(data=X_test, label=y_test, text_features=['review'])

print(f'Train dataset shape: {train_pool.shape}\n')

Train dataset shape: (40000, 1)



In [5]:
from catboost import CatBoostClassifier

def fit_model(train_pool, test_pool, **kwargs):
    """Fit a CatBoostClassifier on ``train_pool`` using ``test_pool`` as eval set.

    The classifier is built with these defaults, any of which can be
    overridden through ``kwargs``:

    * ``iterations=1000``
    * ``learning_rate=0.05``
    * ``eval_metric='AUC'``

    Args:
        train_pool: Data passed as the first argument to ``model.fit``.
        test_pool: Data passed to ``model.fit`` as ``eval_set``.
        **kwargs: Extra keyword arguments for the ``CatBoostClassifier``
            constructor (e.g. ``task_type='GPU'``). Keys that match one of
            the defaults above replace that default.

    Returns:
        Whatever ``model.fit(...)`` returns (callers in this notebook bind
        it to ``model``).

    NOTE(review): the same pool is used for evaluation during training and
    is named ``test_pool`` by the caller -- confirm a separate hold-out set
    is not needed for final reporting.
    """
    params = {
        'iterations': 1000,
        'learning_rate': 0.05,
        'eval_metric': 'AUC',
    }
    # Caller-supplied values win. Previously the defaults were passed as
    # explicit keywords next to **kwargs, so overriding e.g. `iterations`
    # raised "TypeError: got multiple values for keyword argument".
    params.update(kwargs)

    model = CatBoostClassifier(**params)

    # Progress is logged every 100 iterations.
    return model.fit(
        train_pool,
        eval_set=test_pool,
        verbose=100,
    )

# Baseline fit with fit_model's built-in hyper-parameters; task_type='GPU' is
# forwarded to the CatBoostClassifier constructor through **kwargs.
model = fit_model(
    train_pool,
    test_pool,
    task_type='GPU',
)

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.9086630	best: 0.9086630 (0)	total: 35.5ms	remaining: 35.4s
100:	test: 0.9391778	best: 0.9391778 (100)	total: 3.19s	remaining: 28.4s
200:	test: 0.9465489	best: 0.9465489 (200)	total: 6.26s	remaining: 24.9s
300:	test: 0.9502428	best: 0.9502428 (300)	total: 9.28s	remaining: 21.6s
400:	test: 0.9525813	best: 0.9525813 (400)	total: 13.2s	remaining: 19.7s
500:	test: 0.9541728	best: 0.9541728 (500)	total: 18.6s	remaining: 18.6s
600:	test: 0.9553502	best: 0.9553502 (600)	total: 24.1s	remaining: 16s
700:	test: 0.9562135	best: 0.9562135 (700)	total: 27.9s	remaining: 11.9s
800:	test: 0.9569889	best: 0.9569889 (800)	total: 31.2s	remaining: 7.75s
900:	test: 0.9576154	best: 0.9576154 (900)	total: 34.5s	remaining: 3.79s
999:	test: 0.9580704	best: 0.9580704 (999)	total: 38.4s	remaining: 0us
bestTest = 0.9580703974
bestIteration = 999


In [None]:
## Remove stop words

In [None]:
# Load the interim text dataset and keep only the 'text' and 'level' columns.
# NOTE(review): this rebinds `df`, replacing the IMDB frame loaded earlier, and
# reads from '../data/...' while the earlier load reads from 'data/...' --
# confirm both paths resolve from the same working directory.
df = pd.read_csv('../data/interim/df_text.csv')[['text', 'level']]

In [None]:
# 80/20 split of the text dataset; 'level' is the target and the remaining
# column(s) are the features.
# NOTE(review): the split uses random_state=0 rather than the notebook-level
# RANDOM_SEED -- confirm which seed is intended.
train_df, test_df = train_test_split(df, train_size=0.8, random_state=0)

X_train = train_df.drop(columns=['level'])
y_train = train_df['level']
X_test = test_df.drop(columns=['level'])
y_test = test_df['level']

# The 'text' column is declared as a text feature when building the pools.
train_pool = Pool(data=X_train, label=y_train, text_features=['text'])
test_pool = Pool(data=X_test, label=y_test, text_features=['text'])