In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored on Drive can be reached through ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the project folder on the mounted Drive so that the relative paths
# used later (e.g. 'df.csv', 'saveModels') resolve inside it.
%cd /content/drive/MyDrive/spamDestroyer2

/content/drive/MyDrive/spamDestroyer2


In [None]:
import pandas as pd

# Read the message table from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='df.csv')
df.head(n=5)

Unnamed: 0,text,is_spam
0,Идите до Джуронг-Пойнт с ума сойти Доступно т...,0
1,Ладно лар Шучу с тобой чувак,0
2,Ты не можешь так рано говорить хо Ты можешь уж...,0
3,Нет я не думаю что он учится в USF он живет гд...,0
4,Привет дорогая прошло уже 3 недели а ответа н...,1


In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
%%capture
!pip install --upgrade transformers

In [None]:
### Split the data into train / validation / test: roughly 70% / 20% / 10%
from sklearn.model_selection import train_test_split

X, y = df['text'], df['is_spam']

# Step 1: hold out 30% of the rows for validation + test.
X_train, x_valtest, y_train, y_valtest = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=8,
)

# Step 2: give 33% of that hold-out (about 10% of all rows) to the test set;
# the remainder becomes the validation set.
X_val, X_test, y_val, y_test = train_test_split(
    x_valtest,
    y_valtest,
    test_size=0.33,
    random_state=8,
)

In [None]:
from transformers import AutoModel, AutoTokenizer, BertForSequenceClassification

# Name of the pretrained checkpoint handed to from_pretrained()
model_path = "bert-base-multilingual-cased"

# Load the matching tokenizer, then the BERT sequence classifier configured
# for two output labels.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
from torch.utils.data import Dataset

class SpamDataset(Dataset):
    """Map-style dataset pairing message texts with their spam labels.

    Args:
        texts: Indexable collection of message strings addressed by integer
            position (e.g. a list, or a pandas Series with a 0..n-1 index).
        labels: Indexable collection of integer class labels aligned with
            ``texts``.
        tokenizer: Optional callable following the Hugging Face tokenizer call
            convention (``text, max_length=..., padding=..., truncation=...,
            return_tensors=...``). When omitted, items carry the raw text
            instead of token ids.
        max_len: Length every tokenised sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer=None, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as a dict.

        With a tokenizer the dict holds 1-D ``input_ids`` and
        ``attention_mask`` tensors plus a scalar ``labels`` tensor (int64).
        Without one it holds the raw ``text`` and the ``labels`` tensor.
        """
        text = self.texts[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        # Consult the tokenizer given to THIS dataset. The previous check read a
        # module-level `tokenizer` name, which ignored the constructor argument
        # and made the method return None whenever the branch was skipped.
        if self.tokenizer is None:
            return {'text': text, 'labels': label}

        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch dimension of 1; flatten it
        # away so the DataLoader can stack samples into (batch, max_len).
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
        }

# Wrap each split in a SpamDataset. The shuffled index left by the split is
# replaced with a fresh 0..n-1 index so integer positions are valid lookup keys.
train_dataset = SpamDataset(
    X_train.reset_index(drop=True),
    y_train.reset_index(drop=True),
    tokenizer,
)
val_dataset = SpamDataset(
    X_val.reset_index(drop=True),
    y_val.reset_index(drop=True),
    tokenizer,
)
test_dataset = SpamDataset(
    X_test.reset_index(drop=True),
    y_test.reset_index(drop=True),
    tokenizer,
)

In [None]:
from torch.utils.data import DataLoader

# Parameters
batch_size = 64

# Build the DataLoaders. Only the training data is reshuffled every epoch;
# validation is read in a fixed order so that evaluation batches are repeatable
# (the aggregated metrics do not depend on sample order).
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

# Loss function: cross-entropy
loss_fn = CrossEntropyLoss()

# Optimizer: AdamW over all model parameters, learning rate 2e-5
optimizer = AdamW(
    params=model.parameters(),
    lr=2e-5,
)

In [None]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
print(device)

# Move the model's weights to the chosen device.
model.to(device)

cuda:0


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [None]:
# Folder (relative to the current working directory) that receives the model
# checkpoints written during training.
save_dir = 'saveModels'

In [None]:
# Training loop with validation after every epoch
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score

import os

epochs = 10
save_interval = 2  # a checkpoint is written every `save_interval` epochs

# Create the checkpoint folder up front: torch.save() does not create missing
# directories, and finding that out only at the first checkpoint would waste
# the epochs already trained.
os.makedirs(save_dir, exist_ok=True)


for epoch in range(epochs):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    # Training
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs} [Train]", unit="batch")
    for batch in train_loader_tqdm:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Passing `labels` makes the model compute the loss itself (outputs.loss).
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Collect predictions and targets to compute epoch-level metrics.
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

        total_loss += loss.item()
        train_loader_tqdm.set_postfix(loss=loss.item())

    # Metrics on the training set (predictions gathered while weights were updating)
    train_accuracy = accuracy_score(all_labels, all_preds)
    train_precision = precision_score(all_labels, all_preds)
    train_recall = recall_score(all_labels, all_preds)

    # Validation
    model.eval()
    val_preds = []
    val_labels = []
    with torch.no_grad():
        val_loader_tqdm = tqdm(val_loader, desc=f"Epoch {epoch + 1}/{epochs} [Val]", unit="batch")
        for batch in val_loader_tqdm:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # No labels are passed here: only the logits are needed.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            val_preds.extend(preds)
            val_labels.extend(labels.cpu().numpy())

    # Metrics on the validation set
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_precision = precision_score(val_labels, val_preds)
    val_recall = recall_score(val_labels, val_preds)

    # Report metrics; the train loss is the mean of the per-batch losses.
    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {total_loss / len(train_loader)}")
    print(f"Train Accuracy: {train_accuracy:.4f}, Precision: {train_precision:.4f}, Recall: {train_recall:.4f}")
    print(f"Val Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}")

    # Periodic checkpoint: only the model weights are stored (no optimizer state).
    if (epoch + 1) % save_interval == 0:
        save_path = os.path.join(save_dir, f"model_epoch_{epoch + 1}.pt")
        torch.save(model.state_dict(), save_path)
        print(f"Модель сохранена в {save_path}")

Epoch 1/10 [Train]: 100%|██████████| 724/724 [24:52<00:00,  2.06s/batch, loss=0.00655]
Epoch 1/10 [Val]: 100%|██████████| 208/208 [04:09<00:00,  1.20s/batch]


Epoch 1/10, Train Loss: 0.11596203173410394
Train Accuracy: 0.9559, Precision: 0.9511, Recall: 0.9105
Val Accuracy: 0.9828, Precision: 0.9783, Recall: 0.9694


Epoch 2/10 [Train]: 100%|██████████| 724/724 [25:18<00:00,  2.10s/batch, loss=0.00824]
Epoch 2/10 [Val]: 100%|██████████| 208/208 [04:08<00:00,  1.19s/batch]


Epoch 2/10, Train Loss: 0.03769610985278703
Train Accuracy: 0.9877, Precision: 0.9828, Recall: 0.9790
Val Accuracy: 0.9831, Precision: 0.9894, Recall: 0.9591
Модель сохранена в saveModels/model_epoch_2.pt


Epoch 3/10 [Train]: 100%|██████████| 724/724 [25:10<00:00,  2.09s/batch, loss=0.611]
Epoch 3/10 [Val]: 100%|██████████| 208/208 [04:07<00:00,  1.19s/batch]


Epoch 3/10, Train Loss: 0.018897813359860433
Train Accuracy: 0.9944, Precision: 0.9929, Recall: 0.9898
Val Accuracy: 0.9863, Precision: 0.9857, Recall: 0.9726


Epoch 4/10 [Train]:  48%|████▊     | 347/724 [12:02<13:40,  2.18s/batch, loss=0.0363]