In [None]:
!pip install razdel
!pip install nltk
!pip install pymorphy2

In [None]:
import nltk

nltk.download('stopwords')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import re
from tqdm import tqdm
import matplotlib.pyplot as plt

path = '/kaggle/input/otbrniy/train.csv/train.csv'

train_data = pd.read_csv(path)

**Часть 1. Предобработка текста и решение при помощи бустинга**

In [None]:
train_data.head()

Удалим из текста все символы, которые не являются буквами русского или английского алфавита, а также заменим последовательности пробелов на один пробел.

In [None]:
# Normalise the titles: lower-case them, drop every character that is not a
# Latin/Cyrillic letter or a space, then squeeze runs of spaces into one.
# NOTE(review): "ё"/"Ё" are outside the А-я range, so they are removed as
# well (e.g. "всё" becomes "вс"). The test set is cleaned with the same
# pattern, so change both places together if this is not intended.
strip_pattern = re.compile('[^A-Za-zА-Яа-я ]')
squeeze_pattern = re.compile(' +')

cleaned_text = []
for raw_title in train_data['title']:
    letters_only = strip_pattern.sub('', raw_title.lower())
    cleaned_text.append(squeeze_pattern.sub(' ', letters_only))

train_data['title'] = cleaned_text

In [None]:
len(train_data)

In [None]:
len(train_data[train_data['title'] == ' '])

232 строки, состоящие только из пробелов.

In [None]:
len(train_data[train_data['title'] == ''])

552 пустые строки.

In [None]:
# Count the titles that are shorter than each length threshold
# (5, 10, ..., 30 characters) and show the counts as a bar chart.
thresholds = list(range(5, 35, 5))
title_lengths = train_data['title'].str.len()
y = [int((title_lengths < limit).sum()) for limit in thresholds]

x = list(range(len(y)))

plt.bar(x, y)
# Ticks belong at the bar positions and should name the threshold of each
# bar; the previous plt.xticks(y) placed ticks at the *counts* instead.
plt.xticks(x, [f'< {limit}' for limit in thresholds])
plt.xlabel('Длина title (символов)')
plt.ylabel('Количество строк')
plt.show()

Удалим все строки, у которых длина title < 10 символов, включая пустые строки и строки, состоящие только из пробелов.

In [None]:
# Drop every title shorter than 10 characters (this covers empty and
# whitespace-only titles). ">= 10" keeps titles of exactly 10 characters,
# which the previous "> 10" comparison removed as well.
# .copy() detaches the result so later column assignments do not write into
# a slice of the original frame.
train_data = train_data.loc[train_data['title'].str.len() >= 10].copy()

28188 текстов содержат текст не на русском языке (возможно, стоит перевести их на русский, но будет тяжело выделить все языки, которые есть в датасете: как минимум встречались записи на украинском).

In [None]:
import re

# Select the titles that contain at least one Latin letter.
# The previous version used re.match with '[^А-Яа-я]', which only inspects
# the FIRST character and therefore also selected every title that merely
# starts with a space. After cleaning, titles consist only of Latin letters,
# Cyrillic letters and spaces, so a Latin letter anywhere in the string is
# the reliable marker of foreign-language text.
# NOTE: the resulting count can differ from figures recorded in earlier runs.
has_latin = train_data['title'].str.contains('[A-Za-z]', regex=True, na=False)

english = train_data[has_latin]

len(english)

In [None]:
english

Проверим распределение по классам среди текстов с иностранными словами.

In [None]:
import matplotlib.pyplot as plt

# Number of rows per class among the titles selected above, ordered by label.
# value_counts replaces the manual counting loop; the unused `label = [0, 1]`
# list (immediately shadowed by the loop variable) is gone.
class_counts = english['label'].value_counts().sort_index()

lbls = class_counts.index.to_numpy()
cntr = class_counts.to_list()

plt.bar(lbls, cntr)
plt.xticks(lbls)
# plt.plot() with no arguments draws nothing; show the figure explicitly.
plt.show()

Попробуем перевести все тексты с английского на русский без выделения именованных сущностей (названий компаний, имён, фамилий).

**Выполним токенизацию**

Токенизируем наши предложения по словам, удалим стоп-слова и приведем слова в начальную форму.

In [None]:
from razdel import tokenize
from tqdm import tqdm
from nltk.corpus import stopwords
import pymorphy2

morph = pymorphy2.MorphAnalyzer()

# Build the combined stop-word set ONCE. The previous loop called
# stopwords.words('russian') and stopwords.words('english') for every single
# token and then scanned the returned lists linearly; a set gives the same
# membership answer with a constant-time lookup.
stop_words = set(stopwords.words('russian')) | set(stopwords.words('english'))

r_russian = re.compile('[А-Яа-я]')
r_english = re.compile('[A-Za-z]')


# For every title: split into tokens, drop stop words, keep only tokens that
# start with a Cyrillic or Latin letter and reduce them to their normal form.
tokenize_texts = []
for item in tqdm(train_data['title']):

    tokens = []

    for token in (part.text for part in tokenize(item)):
        if token in stop_words:
            continue

        # re.match anchors at the start, so only the first character decides.
        if r_russian.match(token) or r_english.match(token):
            tokens.append(morph.parse(token)[0].normal_form)

    tokenize_texts.append(tokens)

In [None]:
train_data['title'] = tokenize_texts

In [None]:
train_data

In [None]:
len(train_data)

In [None]:
train_data[train_data['title'].str.len() < 5]

In [None]:
train_data

25000 записей имеют длину title менее 5 слов. Пока что фильтровать их не будем, так как для определения порнографического контента в теории может быть достаточно 1–2 слов.

Разобьём url по точкам, чтобы получить слова или что-то похожее на них. Переводить смысла нет, так как многие ссылки выглядят как русские слова, написанные транслитом.

In [None]:
# Break every URL into its dot-separated parts and store the resulting
# lists back in the 'url' column.
splitted_url = []
for address in train_data['url']:
    splitted_url.append(address.split('.'))

train_data['url'] = splitted_url

In [None]:
from sklearn.model_selection import train_test_split

# Target vector and feature frame (the identifier column carries no signal).
y = train_data['label']
X = train_data.drop(columns=['label', 'ID'])

# Hold out 33% of the rows; stratifying on y keeps the class ratio the same
# in both parts, and the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
train_data[train_data['label'] == 1]

In [None]:
train_data[train_data['label'] == 0]

Наблюдается сильный дисбаланс классов: примеров класса 1 в 7 раз меньше, чем примеров класса 0. Попробуем использовать CatBoost в данной ситуации, а при разделении X, y на трейн и тест будем использовать стратификацию, чтобы сохранить соотношение классов в каждой из выборок.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the lemmatised titles; min_df / max_df bound the document
# frequency of the terms that enter the vocabulary.
vectorizer = TfidfVectorizer(min_df=10, max_df=0.5)

# The vectorizer expects plain strings, so glue each token list back together.
X_title = X_train['title'].map(' '.join).to_list()

# Learn the vocabulary on the training part only and vectorise it.
X_title_vectorized = vectorizer.fit_transform(X_title)

X_title_vectorized

In [None]:
!pip install catboost

In [None]:
# Vectorise the hold-out titles with the vocabulary fitted on the training
# part (transform only, no re-fitting).
X_title_eval = X_test['title'].map(' '.join).to_list()

X_title_test = vectorizer.transform(X_title_eval)

In [None]:
from catboost import CatBoostClassifier

# CatBoost classifier trained on the TF-IDF title features.
# No evaluation set is passed to fit() below, and use_best_model is switched
# off explicitly.
model = CatBoostClassifier(use_best_model=False,
                           depth=6,
                           iterations=3000)

# Train on the vectorised training titles and their labels.
model.fit(X_title_vectorized, y_train)

In [None]:
from catboost.utils import get_roc_curve
import sklearn
from sklearn import metrics
from catboost import Pool

# Wrap the hold-out TF-IDF matrix together with its labels for CatBoost.
eval_pool = Pool(X_title_test, y_test)
# ROC curve of the trained model on the hold-out pool.
curve = get_roc_curve(model, eval_pool)
# The curve is unpacked into false-positive rates, true-positive rates and
# the thresholds that produce them.
(fpr, tpr, thresholds) = curve
# Area under the ROC curve computed from the (fpr, tpr) points.
roc_auc = sklearn.metrics.auc(fpr, tpr)

In [None]:
roc_auc

In [None]:
from sklearn.metrics import classification_report

# Hard class predictions for the hold-out part and the per-class
# precision / recall / F1 summary.
pred = model.predict(X_title_test)

report = classification_report(y_true=y_test, y_pred=pred)
print(report)

In [None]:
path = '/kaggle/input/otbrniy/test.csv/test.csv'

test_data = pd.read_csv(path)

In [None]:
# --- Clean the submission titles exactly like the training titles ----------
# Lower-case, keep only Latin/Cyrillic letters and spaces, squeeze spaces.
r = re.compile('[^A-Za-zА-Яа-я ]')

cleaned_text = [re.sub(r, '', text.lower()) for text in test_data['title']]

r = re.compile(' +')
cleaned_text = [re.sub(r, ' ', text) for text in cleaned_text]

test_data['title'] = cleaned_text

# --- Tokenise, drop stop words, lemmatise ----------------------------------
morph = pymorphy2.MorphAnalyzer()

# Build the stop-word set once instead of calling stopwords.words() twice
# per token inside the loop (same membership result, constant-time lookup).
stop_words = set(stopwords.words('russian')) | set(stopwords.words('english'))

r_russian = re.compile('[А-Яа-я]')
r_english = re.compile('[A-Za-z]')


tokenize_texts = []
for item in tqdm(test_data['title']):

    tokens = []

    for token in (part.text for part in tokenize(item)):
        if token in stop_words:
            continue

        # Keep only tokens whose first character is a Cyrillic or Latin letter.
        if r_russian.match(token) or r_english.match(token):
            tokens.append(morph.parse(token)[0].normal_form)

    tokenize_texts.append(tokens)

test_data['title'] = tokenize_texts

# --- Vectorise with the fitted TF-IDF vocabulary and predict ---------------
X_title_test = [' '.join(title) for title in test_data['title']]

X_title_test = vectorizer.transform(X_title_test)

pred = model.predict(X_title_test)

test_data['label'] = pred

# Submission file: one row per ID with the predicted label.
test_data[['ID', 'label']].to_csv('ml_kirichenko_catboost.csv', index=False)

In [None]:
# NOTE(review): this cell repeats, statement for statement, the vectorise /
# predict / save steps that close the previous cell, so it rewrites the same
# submission file. It also rebinds X_title_test and pred, which the
# validation cells above pair with y_test.
X_title_test = [' '.join(title) for title in test_data['title']]

X_title_test = vectorizer.transform(X_title_test)

pred = model.predict(X_title_test)

test_data['label'] = pred

test_data[['ID', 'label']].to_csv('ml_kirichenko_catboost.csv', index=False)

Попробуем подобрать параметры через GridSearch для CatBoost

In [None]:
from catboost import CatBoostClassifier

# Fresh classifier with default settings; this rebinds `model`, replacing the
# classifier trained earlier in the notebook.
model = CatBoostClassifier()

# model.fit(X_title_vectorized, y_train)

# Candidate values: 2 learning rates x 2 depths x 3 L2 regularisation
# strengths = 12 combinations.
grid = {'learning_rate': [0.03, 0.1],
        'depth': [4, 6],
        'l2_leaf_reg': [1, 3, 5]}

# Search the grid on the vectorised training titles; plot=True asks CatBoost
# to draw its training plot.
grid_search_result = model.grid_search(grid,
                                       X=X_title_vectorized,
                                       y=y_train,
                                       plot=True)

In [None]:
grid_search_result

Посчитаем самые популярные слова для 1-й категории в датасете

In [None]:
# Most frequent words among class-1 titles.
# 'label' was dropped from X before the split, so the class mask has to come
# from y_train (it shares X_train's index). Each title is a list of tokens:
# explode() turns it into one row per word so value_counts() counts words
# rather than whole titles.
X_train.loc[y_train == 1, 'title'].explode().value_counts()

**Решение через BERT**

In [None]:
!pip install transformers
!pip install datasets

In [None]:
!git clone https://huggingface.co/DeepPavlov/rubert-base-cased-sentence

In [None]:
import pandas as pd
import numpy as np
import random
import torch
import transformers
import torch.nn as nn
from transformers import AutoModel, BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import load_metric, Dataset
from sklearn.metrics import classification_report, f1_score
from torch.utils.data import Dataset
import torch

# Use the GPU when one is present and fall back to the CPU otherwise, so the
# cell no longer fails on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Two-class sequence classifier and its tokenizer, both loaded from the
# locally cloned checkpoint directory.
model = BertForSequenceClassification.from_pretrained('/kaggle/working/rubert-base-cased-sentence', num_labels=2).to(device)
tokenizer = BertTokenizer.from_pretrained('/kaggle/working/rubert-base-cased-sentence')

In [None]:
path = '/kaggle/input/otbrniy/train.csv/train.csv'

train_data = pd.read_csv(path)

In [None]:
# Replace each URL string by the list of its dot-separated parts.
splitted_url = list(map(lambda address: address.split('.'), train_data['url']))
train_data['url'] = splitted_url

In [None]:
def get_marked_text(train_data, sep='[sep]', min_word_len=4):
    """Build one model input string per row from the URL parts and the title.

    Each result has the form ``<sep><url words joined by spaces><sep><title>``.

    Args:
        train_data: DataFrame with a 'url' column holding an iterable of
            strings per row (the split URL) and a 'title' column. Titles are
            passed through ``str()``, so non-string values are rendered as
            text.
        sep: Marker placed before the URL words and between them and the
            title. NOTE(review): the default is the lower-case literal
            '[sep]'; whether the tokenizer treats it as its separator token
            is not established here - confirm.
        min_word_len: URL parts shorter than this are dropped. The default
            of 4 reproduces the original ``len(word) > 3`` rule.

    Returns:
        list[str]: one marked string per row, in row order.
    """
    marked_text = []
    # zip over the two columns instead of DataFrame.iterrows(), which builds
    # a Series for every row and is much slower on large frames.
    for url_parts, title in zip(train_data['url'], train_data['title']):
        words_in_url = [word for word in url_parts if len(word) >= min_word_len]
        marked_text.append(sep + ' '.join(words_in_url) + sep + str(title))

    return marked_text

In [None]:
from sklearn.model_selection import train_test_split

y = train_data['label']
X = train_data.drop(['label', 'ID'], axis=1)

# Stratified 67/33 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=42, stratify=y)

# Replace 'title' by the marked "url words + title" text and remove 'url'.
# DataFrame.drop returns a NEW frame; the previous code discarded that
# result, so the 'url' column was never actually removed. assign() also
# avoids writing into the frames returned by the split.
X_train = X_train.assign(title=get_marked_text(X_train)).drop(['url'], axis=1)
X_test = X_test.assign(title=get_marked_text(X_test)).drop(['url'], axis=1)

In [None]:
train_text = X_train['title'].to_list()

In [None]:
test_text = X_test['title'].to_list()

In [None]:
y_train = y_train.to_list()
y_test = y_test.to_list()

In [None]:
max_seq_len = 512

In [None]:
# Encode the training texts in one call. Every sequence is padded to
# max_seq_len and longer ones are truncated to it.
tokens_train = tokenizer.batch_encode_plus(
    train_text,
    max_length = max_seq_len,
    padding = 'max_length',
    truncation = True
)
# Same encoding settings for the hold-out texts.
tokens_test = tokenizer.batch_encode_plus(
    test_text,
    max_length = max_seq_len,
    padding = 'max_length',
    truncation = True
)

In [None]:
class Data(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    ``encodings`` is a mapping from field name to an indexable collection of
    per-sample values; ``labels`` is an indexable collection of labels whose
    length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field for the requested sample.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is wrapped in a list, so it becomes a tensor of shape (1,).
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

train_dataset = Data(tokens_train, y_train)
test_dataset = Data(tokens_test, y_test)

In [None]:
from sklearn.metrics import f1_score
def compute_metrics(pred):
    """Return the weighted F1 score for a prediction object.

    ``pred`` must expose ``label_ids`` (the reference labels) and
    ``predictions`` (per-class scores); the predicted class is the argmax
    over the last axis of ``predictions``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    score = f1_score(pred.label_ids, predicted_classes, average='weighted')
    return {'F1': score}
# metric = load_metric('precision', average='weighted')
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

In [None]:
!pip install transformers[torch]

In [None]:
import accelerate
accelerate.__version__

In [None]:
# Hyper-parameters and bookkeeping options for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir = './results', # Output directory
    num_train_epochs = 2, # Number of training epochs
    per_device_train_batch_size = 8, # Batch size per device during training
    per_device_eval_batch_size = 8, # Batch size per device during evaluation
    weight_decay = 0.01, # Weight decay
    logging_dir = './logs', # Directory for the logs
    load_best_model_at_end = True, # Whether to load the best model once training ends
    learning_rate = 1e-5, # Learning rate
    evaluation_strategy ='epoch', # Evaluate after every epoch (could instead be every N steps)
    logging_strategy = 'epoch', # Log after every epoch
    save_strategy = 'epoch', # Save a checkpoint after every epoch
    save_total_limit = 1,
    seed=21)

In [None]:
# Evaluate on the held-out split. The previous version passed train_dataset
# as eval_dataset, so the reported F1 (and the "best model" chosen by
# load_best_model_at_end) was measured on the data the model was trained on.
trainer = Trainer(model=model,
                  tokenizer = tokenizer,
                  args = training_args,
                  train_dataset = train_dataset,
                  eval_dataset = test_dataset,
                  compute_metrics = compute_metrics)

In [None]:
# y_train was converted to a plain list above (to_list()), and a list has no
# .iloc accessor; positional indexing is the equivalent lookup.
y_train[11318]

In [None]:
trainer.train()

In [None]:
!zip -r file.zip /kaggle/working/results

In [None]:
!zip -r rubert-base-cased-sentence /kaggle/working/rubert-base-cased-sentence

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

# Checkpoint written by the Trainer run above.
# NOTE(review): the step number in the directory name depends on the
# training run - confirm it exists before executing this cell.
checkpoint_dir = "/kaggle/working/results/checkpoint-22666"

model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
# Run inference on the GPU when available; the loop below moves its inputs
# to model.device, so it follows this placement automatically.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
# The duplicated "tokenizer = tokenizer = ..." assignment is removed.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

In [None]:
from tqdm import tqdm
# Score the competition test set. The previous loop iterated over test_text
# (the validation part of train.csv), yet the following cell stores the
# result in test_data, which has a different number of rows and whose titles
# had been replaced by token lists in the CatBoost part. Reload the raw test
# file and build the inputs the same way as for training.
# NOTE(review): assumes test.csv has 'url' and 'title' columns like train.csv.
test_data = pd.read_csv('/kaggle/input/otbrniy/test.csv/test.csv')
test_data['url'] = [url.split('.') for url in test_data['url']]
submission_text = get_marked_text(test_data)

model.eval()
with torch.no_grad():
    # Kept under its original name because the next cell reads it: one
    # class-1 probability per test row.
    variance = []
    for text in tqdm(submission_text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
        # The head has two logits trained with cross-entropy, so class
        # probabilities come from a softmax over both; a per-logit sigmoid
        # does not yield a probability for class 1.
        proba = torch.softmax(model(**inputs).logits, dim=-1)
        variance.append(proba[0, 1].item())


In [None]:
# Decision threshold applied to the class-1 score of every row.
POSITIVE_THRESHOLD = 0.7

pred = [1 if item > POSITIVE_THRESHOLD else 0 for item in variance]

test_data['label'] = pred

# Write to a file of its own: the previous name 'ml_kirichenko_catboost.csv'
# silently overwrote the CatBoost submission with BERT predictions.
test_data[['ID', 'label']].to_csv('ml_kirichenko_bert.csv', index=False)

**RubertTiny**

In [None]:
!pip install transformers

In [None]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
  """Dataset that tokenizes one text per item on the fly.

  Args:
    texts: indexable collection of texts (each is passed through ``str``).
    targets: indexable collection of integer class labels, aligned with
      ``texts``.
    tokenizer: object exposing ``encode_plus`` that returns a mapping with
      'input_ids' and 'attention_mask' tensors.
    max_len: length every encoded sequence is padded / truncated to.
  """

  def __init__(self, texts, targets, tokenizer, max_len=512):
    self.texts = texts
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    target = self.targets[idx]

    encoding = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        # Without truncation a text longer than max_len yields a longer
        # tensor than the padded ones, which breaks batching and exceeds the
        # length the rest of the notebook works with; the notebook's
        # predict() code already truncates.
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    return {
      'text': text,
      # encode_plus returns tensors with a leading batch axis; flatten it.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [None]:
!git clone https://huggingface.co/cointegrated/rubert-tiny2

In [None]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
# Identifiers passed to from_pretrained() for the tokenizer and the model.
# NOTE(review): the preceding cell clones "rubert-tiny2", while these
# identifiers name "cointegrated/rubert-tiny" - confirm which checkpoint is
# intended.
tokenizer_path = 'cointegrated/rubert-tiny'
model_path = 'cointegrated/rubert-tiny'

In [None]:
# The first line was missing the `import` keyword, which is a SyntaxError
# that prevented the whole cell from running.
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score

In [None]:
class BertClassifier:
    """Fine-tuning wrapper around ``BertForSequenceClassification``.

    Typical use: construct, call ``preparation`` with the train/validation
    data, call ``train``, then ``predict`` single texts.

    Args:
        model_path: identifier or path given to
            ``BertForSequenceClassification.from_pretrained``.
        tokenizer_path: identifier or path given to
            ``BertTokenizer.from_pretrained``.
        n_classes: size of the replacement classification head.
        epochs: number of training epochs run by ``train``.
        model_save_path: file that receives the weights of the best epoch.
    """

    def __init__(self, model_path, tokenizer_path, n_classes=2, epochs=1, model_save_path='/content/bert.pt'):
        self.model = BertForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model_save_path = model_save_path
        self.max_len = 512
        self.epochs = epochs
        self.n_classes = n_classes
        # Width of the encoder output, read from an encoder layer, used as
        # the input size of the fresh classification head.
        self.out_features = self.model.bert.encoder.layer[1].output.dense.out_features
        self.model.classifier = torch.nn.Linear(self.out_features, n_classes)
        self.model.to(self.device)

    def preparation(self, X_train, y_train, X_valid, y_valid):
        """Create datasets, loaders, optimizer, scheduler and loss.

        ``X_*`` / ``y_*`` are handed to ``CustomDataset`` unchanged, so they
        must support the indexing that class performs.
        """
        # create datasets
        self.train_set = CustomDataset(X_train, y_train, self.tokenizer, max_len=self.max_len)
        self.valid_set = CustomDataset(X_valid, y_valid, self.tokenizer, max_len=self.max_len)

        # create data loaders (validation order does not matter, so the
        # validation loader is no longer shuffled)
        self.train_loader = DataLoader(self.train_set, batch_size=2, shuffle=True)
        self.valid_loader = DataLoader(self.valid_set, batch_size=2, shuffle=False)

        # helpers initialization
        self.optimizer = AdamW(self.model.parameters(), lr=2e-5, correct_bias=False)
        self.scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=0,
                num_training_steps=len(self.train_loader) * self.epochs
            )
        self.loss_fn = torch.nn.CrossEntropyLoss().to(self.device)

    def _run_epoch(self, loader, training):
        """Run one pass over ``loader`` and return ``(f1, mean_loss)``.

        When ``training`` is true, every batch is followed by a backward
        pass, gradient clipping and optimizer / scheduler steps.
        """
        losses = []
        all_preds = []
        all_targets = []

        for data in loader:
            input_ids = data["input_ids"].to(self.device)
            attention_mask = data["attention_mask"].to(self.device)
            targets = data["targets"].to(self.device)

            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask
                )

            preds = torch.argmax(outputs.logits, dim=1)
            loss = self.loss_fn(outputs.logits, targets)

            losses.append(loss.item())
            # F1 needs the label and prediction of every sample, not just
            # the number of correct predictions.
            all_preds.extend(preds.detach().cpu().tolist())
            all_targets.extend(targets.detach().cpu().tolist())

            if training:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()

        # Binary F1 for the two-class case; weighted average otherwise.
        average = 'binary' if self.n_classes == 2 else 'weighted'
        f1 = f1_score(all_targets, all_preds, average=average)
        return f1, np.mean(losses)

    def fit(self):
        """Train for one epoch; return ``(train_f1, train_loss)``."""
        self.model = self.model.train()
        train_f1, train_loss = self._run_epoch(self.train_loader, training=True)
        return train_f1, train_loss

    def eval(self):
        """Evaluate on the validation loader; return ``(val_f1, val_loss)``."""
        self.model = self.model.eval()
        with torch.no_grad():
            val_f1, val_loss = self._run_epoch(self.valid_loader, training=False)
        return val_f1, val_loss

    def train(self):
        """Run ``epochs`` rounds of fit + eval and restore the best weights.

        The weights of the epoch with the highest validation F1 are written
        to ``model_save_path`` as a state dict and loaded back at the end.
        """
        # -inf guarantees that the first epoch is saved even when its F1 is 0,
        # so the reload below always has a file to read.
        best_f1 = float('-inf')
        for epoch in range(self.epochs):
            print(f'Epoch {epoch + 1}/{self.epochs}')
            train_f1, train_loss = self.fit()
            print(f'Train loss {train_loss} f1 {train_f1}')

            val_f1, val_loss = self.eval()
            print(f'Val loss {val_loss} f1 {val_f1}')
            print('-' * 10)

            if val_f1 > best_f1:
                # Save only the weights: a state dict reloads without
                # unpickling the whole module object.
                torch.save(self.model.state_dict(), self.model_save_path)
                best_f1 = val_f1

        # Nothing was saved when epochs == 0; keep the current weights then.
        if best_f1 != float('-inf'):
            state = torch.load(self.model_save_path, map_location=self.device)
            self.model.load_state_dict(state)

    def predict(self, text):
        """Return the predicted class index for a single text."""
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids = encoding['input_ids'].flatten().to(self.device)
        attention_mask = encoding['attention_mask'].flatten().to(self.device)

        # Inference: switch off dropout and gradient tracking.
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids.unsqueeze(0),
                attention_mask=attention_mask.unsqueeze(0)
            )

        prediction = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]

        return prediction