In [2]:
! pip install catboost
! pip install gensim

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB

In [1]:
import numpy as np
import pandas as pd
import re
import torch
import numba as nb
from numba import njit, prange,jit
import cupy as cp
from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from transformers import BertTokenizer, BertModel
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import time
from tqdm import tqdm

# Device selection: use CUDA when PyTorch reports it as available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Используемое устройство: {device}")

# Configuration
MAX_LEN = 128     # passed as max_length to the BERT tokenizer
BATCH_SIZE = 32   # number of sequences per BERT forward pass

Используемое устройство: cuda


In [2]:
# Mount Google Drive at /content/drive so the dataset path used below resolves.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Функция для анализа текста с CatBoost

def analyze_text_with_catboost(texts, vectorizer, model):
    """Return the positive-class probability for every text.

    Args:
        texts: Sized sequence of texts accepted by ``vectorizer.transform``.
        vectorizer: Fitted object exposing ``transform(texts)``.
        model: Fitted classifier exposing ``predict_proba(X)`` whose result
            has one row per sample and the positive class in column 1.

    Returns:
        1-D ``np.float32`` array with one probability per input text
        (empty when ``texts`` is empty).
    """
    if len(texts) == 0:
        return np.empty(0, dtype=np.float32)

    X = vectorizer.transform(texts)
    # One batched predict_proba call replaces the former per-row loop, which
    # iterated with numba's ``prange`` although nothing here is numba-compiled.
    probabilities = np.asarray(model.predict_proba(X))
    return probabilities[:, 1].astype(np.float32)


# Загрузка данных
#@njit
def load_data(path):
    """Read the CSV found at ``path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(path)
    return frame

print("Загрузка данных...")
# Read the toxic-comments CSV from the mounted Google Drive (absolute Colab path).
df = load_data('/content/drive/MyDrive/data/toxic_comments.csv')

# Явное приведение к строке

def convert_to_string(obj):
    """Return ``str(obj)`` after a UTF-8 encode/decode round trip.

    The round trip leaves any UTF-8-encodable text unchanged.
    """
    as_text = str(obj)
    utf8_bytes = as_text.encode('utf-8')
    return utf8_bytes.decode('utf-8')

#df['text'] = convert_to_string(df['text'])

Загрузка данных...


In [4]:
# Предобработка текста

def preprocess_text_gpu(texts):
    """Lower-case each text and drop every character that is not an ASCII letter or whitespace.

    Despite the name, this is plain Python running on the CPU.

    Args:
        texts: Iterable of items; each one is converted with ``str()`` first.

    Returns:
        List of cleaned strings, in input order.
    """
    # Iterate directly rather than indexing through numba's ``prange``: nothing
    # here is numba-compiled, and positional indexing breaks for generators
    # and for pandas Series with a non-default index.
    non_letters = re.compile(r'[^a-zA-Z\s]')
    return [non_letters.sub('', str(text).lower()) for text in texts]

print("Предобработка текста...")
# Adds the cleaned text as a new column on df (in-place mutation of the loaded frame).
df['text_clean'] = preprocess_text_gpu(df['text'].values)
# Target labels taken from the 'toxic' column.
y = df['toxic'].values

Предобработка текста...


In [5]:
# Preview the first rows to compare the raw 'text' column with 'text_clean'.
df.head()

Unnamed: 0.1,Unnamed: 0,text,toxic,text_clean
0,0,Explanation\nWhy the edits made under my usern...,0,explanation\nwhy the edits made under my usern...
1,1,D'aww! He matches this background colour I'm s...,0,daww he matches this background colour im seem...
2,2,"Hey man, I'm really not trying to edit war. It...",0,hey man im really not trying to edit war its j...
3,3,"""\nMore\nI can't make any real suggestions on ...",0,\nmore\ni cant make any real suggestions on im...
4,4,"You, sir, are my hero. Any chance you remember...",0,you sir are my hero any chance you remember wh...


In [None]:
# 1. TF-IDF vectorisation: up to 5000 features over 1- to 5-grams, English stop words removed.
print("TF-IDF токенизация...")
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 5),
    stop_words='english',
)
X_tfidf = tfidf.fit_transform(df['text_clean'])
X_tfidf


TF-IDF токенизация...


In [None]:
# 2. Word2Vec Токенизация

def tokenize_texts(texts):
    """Tokenise every text with gensim's ``simple_preprocess`` (``deacc=True``).

    Args:
        texts: Iterable of items; each one is converted with ``str()`` first.

    Returns:
        List with one token list per input text, in input order.
    """
    # Direct iteration replaces the ``prange`` index loop: nothing here is
    # numba-compiled, and positional indexing fails for generators and for
    # pandas Series with a non-default index.
    return [simple_preprocess(str(text), deacc=True) for text in texts]

print("Word2Vec токенизация...")
sentences = tokenize_texts(df['text_clean'].values)

# Train a 100-dimensional Word2Vec model on the tokenised comments.
print("Обучение Word2Vec модели...")
w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)


def text_to_vector_gpu(texts, model):
    """Embed each text as the mean of its in-vocabulary word vectors.

    Despite the name, this is plain Python/NumPy running on the CPU.

    Args:
        texts: Sized iterable of items; each one is converted with ``str()``.
        model: Object exposing ``vector_size`` and a ``wv`` mapping that
            supports ``token in model.wv`` and ``model.wv[token]``.

    Returns:
        ``np.float32`` array of shape ``(len(texts), model.vector_size)``.
        Rows for texts without any in-vocabulary token are all zeros.
    """
    # Zero-initialised so rows without known tokens need no extra allocation.
    vectors = np.zeros((len(texts), model.vector_size), dtype=np.float32)
    # enumerate() replaces the ``prange`` index loop: nothing here is
    # numba-compiled, and positional indexing fails for pandas Series with a
    # non-default index.
    for row, text in enumerate(texts):
        tokens = simple_preprocess(str(text), deacc=True)
        known_vectors = [model.wv[token] for token in tokens if token in model.wv]
        if known_vectors:
            vectors[row] = np.mean(known_vectors, axis=0)
    return vectors

print("Создание Word2Vec векторов...")
# One averaged Word2Vec vector per cleaned comment.
X_w2v = text_to_vector_gpu(df['text_clean'].values, w2v_model)

In [None]:
# 3. BERT tokenisation and embedding extraction
print("Инициализация BERT...")
# Alternative checkpoint previously tried: 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained('unitary/toxic-bert')
bert_model = BertModel.from_pretrained('unitary/toxic-bert')
bert_model = bert_model.to(device)


def bert_tokenize(texts):
    """Tokenise texts for BERT, padding/truncating every sequence to ``MAX_LEN``.

    Uses the module-level ``bert_tokenizer`` and ``MAX_LEN``.

    Args:
        texts: Iterable of strings.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of tensors with one row per
        text. For empty input both tensors have shape ``(0, MAX_LEN)``.
    """
    texts = list(texts)
    if not texts:
        # The former per-text loop ended in torch.cat([]), which raises.
        empty = torch.empty((0, MAX_LEN), dtype=torch.long)
        return empty, empty.clone()

    # A single batched tokenizer call replaces per-text encode_plus + torch.cat.
    encoded = bert_tokenizer(
        texts,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']

print("BERT токенизация...")
# Tokenise every cleaned comment; both results have one row per comment.
input_ids, attention_masks = bert_tokenize(df['text_clean'].tolist())


def get_bert_embeddings(input_ids, attention_masks):
    """Run ``bert_model`` over the inputs in batches and collect position-0 hidden states.

    Uses the module-level ``bert_model``, ``device`` and ``BATCH_SIZE``.

    Args:
        input_ids: Tensor of token ids, sliced along its first dimension.
        attention_masks: Tensor of attention masks, sliced the same way.

    Returns:
        NumPy array formed by concatenating, along axis 0, the
        ``last_hidden_state[:, 0, :]`` slice of every batch (presumably the
        [CLS] token when the inputs come from ``bert_tokenize`` — confirm).
    """
    bert_model.eval()
    cls_chunks = []
    batch_starts = range(0, len(input_ids), BATCH_SIZE)

    # Gradients are not needed for feature extraction.
    with torch.no_grad():
        for start in tqdm(batch_starts):
            stop = start + BATCH_SIZE
            ids_on_device = input_ids[start:stop].to(device)
            masks_on_device = attention_masks[start:stop].to(device)

            model_out = bert_model(ids_on_device, attention_mask=masks_on_device)
            first_token_state = model_out.last_hidden_state[:, 0, :]
            cls_chunks.append(first_token_state.cpu().numpy())

    return np.concatenate(cls_chunks, axis=0)

print("Получение BERT эмбеддингов...")
# Feature matrix built from the BERT position-0 hidden states, one row per comment.
X_bert = get_bert_embeddings(input_ids, attention_masks)

In [None]:
# Release cached CUDA memory held by PyTorch before the boosting models run.
# NOTE(review): torch is already imported in the notebook's import cell; this re-import is redundant.
import torch
torch.cuda.empty_cache()

In [None]:
# GPU-enabled models
models = {
    'CatBoost': CatBoostClassifier(task_type='GPU', iterations=500, verbose=0),
    # XGBoost 2.0 deprecated tree_method='gpu_hist' and gpu_id; the equivalent
    # configuration is the 'hist' tree method on an explicit CUDA device.
    'XGBoost': XGBClassifier(tree_method='hist', device='cuda:0'),
    'LightGBM': LGBMClassifier(device='gpu')
}

# Кросс-валидация
def evaluate_model(X, y, model, n_splits=3, random_state=None):
    """Estimate the F1 score of ``model`` with shuffled k-fold cross-validation.

    Args:
        X: Feature matrix exposing ``shape[0]`` and supporting row selection
            with an integer index array.
        y: Labels, one per row of ``X``; converted with ``np.asarray``.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. The same
            instance is refitted on every fold.
        n_splits: Number of folds (at least 2, at most the number of samples).
        random_state: Optional seed for the shuffle. ``None`` keeps the
            original behaviour of drawing from NumPy's global random state.

    Returns:
        Tuple ``(mean_f1, std_f1)`` over the folds.

    Raises:
        ValueError: If ``n_splits`` is below 2 or exceeds the sample count.
    """
    n_samples = X.shape[0]
    if n_splits < 2 or n_splits > n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of samples ({n_samples}), got {n_splits}"
        )

    y = np.asarray(y)
    if random_state is None:
        order = np.random.permutation(n_samples)
    else:
        order = np.random.default_rng(random_state).permutation(n_samples)

    # array_split spreads the remainder over the folds, so every sample is
    # tested exactly once; slicing by ``n_samples // n_splits`` never tested
    # the last ``n_samples % n_splits`` samples.
    folds = np.array_split(order, n_splits)

    scores = []
    for fold_no, test_idx in enumerate(folds):
        train_idx = np.concatenate(
            [fold for other_no, fold in enumerate(folds) if other_no != fold_no]
        )
        model.fit(X[train_idx], y[train_idx])
        y_pred = model.predict(X[test_idx])
        scores.append(f1_score(y[test_idx], y_pred))

    return np.mean(scores), np.std(scores)


In [None]:

# Evaluate every (model, feature representation) combination
results = []

feature_sets = (
    ('TF-IDF', X_tfidf),
    ('Word2Vec', X_w2v),
    ('BERT', X_bert),
)

print("\nОценка моделей:")
for name, model in models.items():
    for tokenization, features in feature_sets:
        mean_f1, std_f1 = evaluate_model(features, y, model)
        results.append({'Модель': name, 'Токенизация': tokenization, 'F1': mean_f1, 'Std': std_f1})

# Results table: one row per model/representation pair
results_df = pd.DataFrame(results)
print("\nРезультаты кросс-валидации:")
print(results_df)

# CatBoost analysis: fit on the full TF-IDF matrix, then score every comment.
# NOTE(review): the probabilities are computed on the same texts the model was fitted on.
print("\nАнализ текста с CatBoost...")
best_model = CatBoostClassifier(task_type='GPU', iterations=500, verbose=0)
best_model.fit(X_tfidf, y)
text_analysis = analyze_text_with_catboost(df['text_clean'].values, tfidf, best_model)

# Visualisation: probability histogram (left) and F1 comparison (right)
fig, (ax_hist, ax_bar) = plt.subplots(1, 2, figsize=(15, 6))

ax_hist.hist(text_analysis, bins=50, alpha=0.7)
ax_hist.set_title('Распределение вероятностей токсичности')
ax_hist.set_xlabel('Вероятность токсичности')
ax_hist.set_ylabel('Количество комментариев')

# Pass ``ax`` explicitly: DataFrame.plot() without it opens a new figure,
# which left the right-hand panel of this figure empty.
f1_table = results_df.groupby(['Модель', 'Токенизация'])['F1'].mean().unstack()
f1_table.plot(kind='bar', ax=ax_bar)
ax_bar.set_title('Сравнение методов токенизации')
ax_bar.set_ylabel('F1 Score')
ax_bar.tick_params(axis='x', labelrotation=45)
fig.tight_layout()

plt.show()


Оценка моделей:




In [None]:
# CNN модель
def build_cnn_model(vocab_size=5000, max_len=100):
    """Build and compile a small 1-D convolutional binary classifier.

    Layers, in order: Embedding(vocab_size, 64), Conv1D(64, 5, relu),
    GlobalMaxPooling1D, Dense(64, relu), Dropout(0.5), Dense(1, sigmoid).
    Compiled with the Adam optimizer, binary cross-entropy loss and an
    accuracy metric.

    Args:
        vocab_size: First argument of the Embedding layer.
        max_len: Passed to the Embedding layer as ``input_length``.

    Returns:
        The compiled Sequential model.
    """
    # NOTE(review): Sequential, Embedding, Conv1D, GlobalMaxPooling1D, Dense and
    # Dropout are not imported in the visible cells — confirm where they come from.
    layer_stack = [
        Embedding(vocab_size, 64, input_length=max_len),
        Conv1D(64, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    cnn = Sequential(layer_stack)
    cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return cnn

# Кросс-валидация
def evaluate_models(X, y, vectorizers, models):
    """Cross-validate every vectorizer/model pair and tabulate the F1 scores.

    Each pair is wrapped in a two-step Pipeline and scored with 3-fold
    stratified, shuffled cross-validation (``random_state=42``,
    ``scoring='f1'``).

    Args:
        X: Raw inputs handed to the pipeline.
        y: Target labels.
        vectorizers: Mapping of name -> vectorizer used as the first step.
        models: Mapping of name -> estimator used as the final step.

    Returns:
        DataFrame with columns ``vectorizer``, ``model``, ``f1_mean`` and
        ``f1_std``; one row per pair, vectorizers in the outer loop.
    """
    # Imported locally because the notebook's import cell does not provide
    # these names, so calling this function raised NameError.
    from sklearn.model_selection import StratifiedKFold, cross_val_score
    from sklearn.pipeline import Pipeline

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    records = []

    for vec_name, vec in vectorizers.items():
        for model_name, model in models.items():
            pipeline = Pipeline([('vectorizer', vec), ('model', model)])
            scores = cross_val_score(pipeline, X, y, cv=skf, scoring='f1', n_jobs=-1)
            records.append({
                'vectorizer': vec_name,
                'model': model_name,
                'f1_mean': np.mean(scores),
                'f1_std': np.std(scores)
            })

    # Explicit columns keep the schema stable when there is nothing to evaluate.
    return pd.DataFrame(records, columns=['vectorizer', 'model', 'f1_mean', 'f1_std'])

In [None]:
# Evaluate every vectorizer/model pair for each tokenization variant.
# NOTE(review): `X` (with 'text_simple', 'text_lemmatized', 'text_pos') and
# `vectorizers` are not defined in the visible cells — confirm where they come from.
results_simple = evaluate_models(X['text_simple'], y, vectorizers, models)
results_lemmatized = evaluate_models(X['text_lemmatized'], y, vectorizers, models)
results_pos = evaluate_models(X['text_pos'], y, vectorizers, models)
torch.cuda.empty_cache()
# Combine the results, tagging each table with its tokenization variant.
results_all = pd.concat([
    results_simple.assign(tokenization='simple'),
    results_lemmatized.assign(tokenization='lemmatized'),
    results_pos.assign(tokenization='pos_tags')
])

# CNN оценка
def evaluate_cnn(X_texts, y, tokenization_type):
    """Cross-validate the CNN from ``build_cnn_model`` on integer-encoded texts.

    Texts are encoded with a 5000-word Tokenizer and padded to length 100.
    A freshly built model is trained for 3 epochs on each of 3 stratified,
    shuffled folds (``random_state=42``), and predictions are thresholded
    at 0.5 before computing F1.

    Args:
        X_texts: Texts to encode.
        y: Labels, indexable with the integer arrays produced by the splitter.
        tokenization_type: Label stored in the result under ``'tokenization'``.

    Returns:
        Dict with keys ``vectorizer``, ``model``, ``f1_mean``, ``f1_std``
        and ``tokenization``.
    """
    # NOTE(review): Tokenizer, pad_sequences and StratifiedKFold are not
    # imported in the visible cells — confirm where they come from.
    word_index = Tokenizer(num_words=5000)
    word_index.fit_on_texts(X_texts)
    encoded = word_index.texts_to_sequences(X_texts)
    padded = pad_sequences(encoded, maxlen=100)

    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    fold_f1 = []

    for train_idx, test_idx in splitter.split(padded, y):
        cnn = build_cnn_model()
        cnn.fit(padded[train_idx], y[train_idx], epochs=3, batch_size=64, verbose=0)
        probabilities = cnn.predict(padded[test_idx])
        predicted = (probabilities > 0.5).astype(int)
        fold_f1.append(f1_score(y[test_idx], predicted))

    summary = {
        'vectorizer': 'embedding',
        'model': 'CNN',
        'f1_mean': np.mean(fold_f1),
        'f1_std': np.std(fold_f1),
        'tokenization': tokenization_type
    }
    return summary

# Append the CNN results, one evaluation per tokenization variant.
cnn_results = [
    evaluate_cnn(X[column], y, label)
    for column, label in (
        ('text_simple', 'simple'),
        ('text_lemmatized', 'lemmatized'),
        ('text_pos', 'pos_tags'),
    )
]

results_all = pd.concat([results_all, pd.DataFrame(cnn_results)])

In [None]:
# Result analysis: print the top-3 rows by mean F1 within each grouping.
top3_reports = (
    ("Лучшие комбинации по типу токенизации:", 'tokenization'),
    ("\nЛучшие комбинации по модели:", 'model'),
    ("\nЛучшие комбинации по векторйзеру:", 'vectorizer'),
)
for header, group_key in top3_reports:
    print(header)
    print(results_all.groupby(group_key).apply(lambda x: x.nlargest(3, 'f1_mean')))

# Visualisation: mean F1 per model with std error bars, one series per tokenization.
plt.figure(figsize=(15, 8))
for token_type in results_all['tokenization'].unique():
    is_current = results_all['tokenization'] == token_type
    subset = results_all[is_current]
    plt.errorbar(
        subset['model'],
        subset['f1_mean'],
        yerr=subset['f1_std'],
        fmt='o',
        label=token_type,
        capsize=5,
    )
plt.xticks(rotation=45)
plt.ylabel('F1 Score')
plt.title('Сравнение моделей по типам токенизации')
plt.legend()
plt.tight_layout()
plt.show()