In [1]:
!pip install pymorphy2 
!pip install datasets

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl.metadata (3.6 kB)
Collecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl.metadata (2.1 kB)
Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844


In [2]:
import wandb

wandb.login(key='b123af3ff1bc7e54569d0976c6405a5b3b6d2902')

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from transformers import BertTokenizer, BertModel

from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop_words = stopwords.words("russian")

import re

import pymorphy2
morph = pymorphy2.MorphAnalyzer()

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import wandb
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

from tqdm import tqdm

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def clean_text(text):
    # Remove URLs, numbers, and special characters
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # URLs
    text = re.sub(r'\d+', '', text)  # Numbers
    text = re.sub(r'[^\w\s]', '', text)  # Special characters
    return text

def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    # remove stop words
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    # lemmatize
    def lemmatize(tokens):
        return [morph.parse(word)[0].normal_form for word in tokens]
    tokens = lemmatize(tokens)
    return ' '.join(tokens)

def all_preprocessing(df):
    df['text'] = df['text'].apply(clean_text)
    df['text'] = df['text'].apply(preprocess_text)
    return df

In [5]:
df = pd.read_csv('/kaggle/input/wb_winter_24/train.csv', index_col='ID')
df = all_preprocessing(df)
df.head()

Unnamed: 0_level_0,text,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,брюки отличный качественный сожаление коротко ...,0
1,отличный аппарат комплект кабель работетта сет...,0
2,супер спасибо большой,0
3,получить быстро дать наушник наушник прийти пр...,0
4,всё дойти целость сохранность,0


In [6]:
# Define the CNN model
class CNNModel(nn.Module):
    def __init__(self, embedding_dim, num_filters=100, kernel_sizes=[3, 4, 5], num_classes=1):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv2d(1, num_filters, (kernel_sizes[0], embedding_dim))
        self.conv2 = nn.Conv2d(1, num_filters, (kernel_sizes[1], embedding_dim))
        self.conv3 = nn.Conv2d(1, num_filters, (kernel_sizes[2], embedding_dim))
        
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension for CNN (batch_size, 1, seq_len, embedding_dim)
        conv1_out = torch.relu(self.conv1(x)).squeeze(3)
        conv2_out = torch.relu(self.conv2(x)).squeeze(3)
        conv3_out = torch.relu(self.conv3(x)).squeeze(3)

        pooled1 = torch.max(conv1_out, dim=2)[0]
        pooled2 = torch.max(conv2_out, dim=2)[0]
        pooled3 = torch.max(conv3_out, dim=2)[0]

        out = torch.cat((pooled1, pooled2, pooled3), dim=1)
        out = self.fc(out)
        out = self.sigmoid(out)
        return out


class Trainer:
    def __init__(self, model, train_loader, test_loader, loss_fn, optimizer, epochs=3, device='cpu'):
        self.model = model
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.epochs = epochs
        self.device = device

    def train_epoch(self, epoch):
        self.model.train()
        running_loss = 0.0
        correct_predictions = 0
        total_samples = 0
        
        progress_bar = tqdm(self.train_loader, desc=f"Epoch {epoch + 1}/{self.epochs}", unit="batch")
        for batch in progress_bar:
            texts, labels = batch
            texts, labels = texts.to(self.device), labels.to(self.device)
            self.optimizer.zero_grad()
            outputs = self.model(texts).squeeze()
            loss = self.loss_fn(outputs, labels)
            loss.backward()
            self.optimizer.step()

            running_loss += loss.item()
            correct_predictions += ((outputs > 0.4) == labels).sum().item()
            total_samples += labels.size(0)

        avg_loss = running_loss / len(self.train_loader)
        accuracy = correct_predictions / total_samples
        return avg_loss, accuracy

    def evaluate(self):
        self.model.eval()
        running_loss = 0.0
        correct_predictions = 0
        total_samples = 0
        all_preds = []
        all_labels = []
        
        with torch.no_grad():
            for batch in self.test_loader:
                texts, labels = batch
                texts, labels = texts.to(self.device), labels.to(self.device)
                outputs = self.model(texts).squeeze()
                loss = self.loss_fn(outputs, labels)

                running_loss += loss.item()
                correct_predictions += ((outputs > 0.4) == labels).sum().item()
                total_samples += labels.size(0)

                # Store predictions and true labels for F1 calculation
                preds = (outputs > 0.4).float()
                all_preds.append(preds.cpu().numpy())
                all_labels.append(labels.cpu().numpy())

        avg_loss = running_loss / len(self.test_loader)
        accuracy = correct_predictions / total_samples
        
        # Concatenate predictions and labels for F1 calculation
        all_preds = np.concatenate(all_preds)
        all_labels = np.concatenate(all_labels)

        # Calculate Precision, Recall, and F1 Score
        precision = precision_score(all_labels, all_preds)
        recall = recall_score(all_labels, all_preds)
        f1 = f1_score(all_labels, all_preds)

        return avg_loss, accuracy, precision, recall, f1

    def train(self):
        for epoch in range(self.epochs):
            train_loss, train_accuracy = self.train_epoch(epoch)
            val_loss, val_accuracy, val_precision, val_recall, val_f1 = self.evaluate()

            # Log metrics to WandB
            wandb.log({
                'train_loss': train_loss,
                'train_accuracy': train_accuracy,
                'val_loss': val_loss,
                'val_accuracy': val_accuracy,
                'val_precision': val_precision,
                'val_recall': val_recall,
                'val_f1_score': val_f1,
                'epoch': epoch
            })

            print(f'Epoch [{epoch+1}/{self.epochs}], '
                  f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, '
                  f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, '
                  f'Val F1 Score: {val_f1:.4f}')

        test_loss, test_accuracy, test_precision, test_recall, test_f1 = self.evaluate()
        print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, '
              f'Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, '
              f'Test F1 Score: {test_f1:.4f}')
        wandb.log({'test_loss': test_loss, 'test_accuracy': test_accuracy,
                   'test_precision': test_precision, 'test_recall': test_recall, 'test_f1_score': test_f1})
        wandb.finish()

    def generate_inference(self, data_loader):
        """Generate predictions on the test dataset and calculate F1, Precision, Recall."""
        self.model.eval()
        all_preds = []
        all_labels = []
        
        with torch.no_grad():
            for batch in data_loader:
                texts, labels = batch
                texts, labels = texts.to(self.device), labels.to(self.device)

                # Get model output (probabilities)
                outputs = self.model(texts).squeeze()
                
                # Convert probabilities to binary predictions (0 or 1)
                preds = (outputs > 0.4).float()

                all_preds.append(preds.cpu().numpy())
                all_labels.append(labels.cpu().numpy())

        # Concatenate predictions and labels for evaluation
        all_preds = np.concatenate(all_preds)
        all_labels = np.concatenate(all_labels)

        # Calculate accuracy
        accuracy = np.mean(all_preds == all_labels)
        
        # Calculate Precision, Recall, and F1 Score
        precision = precision_score(all_labels, all_preds)
        recall = recall_score(all_labels, all_preds)
        f1 = f1_score(all_labels, all_preds)

        # Log the metrics to WandB
        wandb.log({
            'f1_score': f1,
            'precision': precision,
            'recall': recall,
            'accuracy': accuracy
        })

        return all_preds, accuracy, precision, recall, f1


In [7]:
class RuBERTProcessor:
    def __init__(self, max_length=128, device='cpu'):
        self.tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
        self.model = BertModel.from_pretrained('DeepPavlov/rubert-base-cased').to(device)

        self.device = device
        self.max_length = max_length

    def encode(self, texts):
        """Tokenize and encode texts using RuBERT tokenizer."""
        encodings = self.tokenizer(texts, padding=True, truncation=True, max_length=self.max_length, return_tensors='pt')
        return encodings['input_ids'], encodings['attention_mask']

    def get_embeddings(self, input_ids, attention_mask):
        """Get embeddings from the RuBERT model."""
        
        with torch.no_grad():
            outputs = self.model(input_ids.to(self.device), attention_mask=attention_mask.to(self.device))
        return outputs.last_hidden_state # Use mean of token embeddings        

# Custom dataset to handle on-the-fly embedding generation
class TextDataset(Dataset):
    def __init__(self, texts, labels, processor):
        self.texts = texts
        self.labels = labels
        self.processor = processor

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Get the embeddings on-the-fly when needed
        input_ids, attention_mask = self.processor.encode(text)
        embeddings = self.processor.get_embeddings(input_ids, attention_mask)

        return embeddings, torch.tensor(label).float()

def collate_fn(batch):
    # batch is a list of (embedding, label) tuples
    embeddings, labels = zip(*batch)
    embeddings = [emb.squeeze(0) for emb in embeddings]

    pad_token_value = 0  # Replace with your specific pad token if different
    
    # Convert to tensor and pad sequences
    embeddings = torch.nn.utils.rnn.pad_sequence(
        embeddings, batch_first=True, padding_value=pad_token_value
    )

    # Convert labels to a tensor (modify as necessary if labels need specific formatting)
    labels = torch.tensor(labels)
    
    return embeddings, labels
    
def load_data(train_texts, val_texts, train_labels, val_labels, device='cpu'):
    processor = RuBERTProcessor(max_length=128, device=device)
    
    train_dataset = TextDataset(train_texts, train_labels, processor)
    val_dataset = TextDataset(val_texts, val_labels, processor)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
    return train_loader, val_loader, processor



In [8]:
wandb.init(project='wb-cnn-rubert-emb-text-classification', entity='luezzka')


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Split dataset into training and validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), 
    df['label'].tolist(), 
    test_size=0.05,
    shuffle=True,
    stratify=df['label'].tolist(),
    random_state=42
)

train_loader, val_loader, processor = load_data(train_texts, val_texts, train_labels, val_labels, device)
# Instantiate model
model = CNNModel(embedding_dim=768, num_filters=1000, kernel_sizes=[3, 4, 5], num_classes=1)
# Loss function and optimizer
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Move model to GPU if available
model.to(device)

# Train the model
trainer = Trainer(model, train_loader, val_loader, loss_fn, optimizer, epochs=5, device=device)
trainer.train()

# Perform inference on the test dataset
preds, accuracy, precision, recall, f1 = trainer.generate_inference(val_loader)

[34m[1mwandb[0m: Currently logged in as: [33mluizanigogosova[0m ([33mluezzka[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241114_142614-xowgx8vi[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mbreezy-waterfall-36[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/luezzka/wb-cnn-rubert-emb-text-classification[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/luezzka/wb-cnn-rubert-emb-text-classification/runs/xowgx8vi[0m


tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 1/5: 100%|██████████| 3565/3565 [37:15<00:00,  1.59batch/s]


Epoch [1/5], Train Loss: 0.0907, Train Accuracy: 0.9698, Val Loss: 0.0680, Val Accuracy: 0.9778, Val F1 Score: 0.9094


Epoch 2/5: 100%|██████████| 3565/3565 [36:43<00:00,  1.62batch/s]


Epoch [2/5], Train Loss: 0.0613, Train Accuracy: 0.9809, Val Loss: 0.0698, Val Accuracy: 0.9785, Val F1 Score: 0.9084


Epoch 3/5: 100%|██████████| 3565/3565 [36:33<00:00,  1.63batch/s]


Epoch [3/5], Train Loss: 0.0492, Train Accuracy: 0.9843, Val Loss: 0.0655, Val Accuracy: 0.9790, Val F1 Score: 0.9151


Epoch 4/5: 100%|██████████| 3565/3565 [35:22<00:00,  1.68batch/s]


Epoch [4/5], Train Loss: 0.0391, Train Accuracy: 0.9876, Val Loss: 0.0707, Val Accuracy: 0.9804, Val F1 Score: 0.9179


Epoch 5/5: 100%|██████████| 3565/3565 [34:50<00:00,  1.71batch/s]


Epoch [5/5], Train Loss: 0.0310, Train Accuracy: 0.9902, Val Loss: 0.0689, Val Accuracy: 0.9807, Val F1 Score: 0.9206
Test Loss: 0.0689, Test Accuracy: 0.9807, Test Precision: 0.9360, Test Recall: 0.9057, Test F1 Score: 0.9206


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:          epoch ▁▃▅▆█
[34m[1mwandb[0m:  test_accuracy ▁
[34m[1mwandb[0m:  test_f1_score ▁
[34m[1mwandb[0m:      test_loss ▁
[34m[1mwandb[0m: test_precision ▁
[34m[1mwandb[0m:    test_recall ▁
[34m[1mwandb[0m: train_accuracy ▁▅▆▇█
[34m[1mwandb[0m:     train_loss █▅▃▂▁
[34m[1mwandb[0m:   val_accuracy ▁▃▄▇█
[34m[1mwandb[0m:   val_f1_score ▂▁▅▆█
[34m[1mwandb[0m:       val_loss ▄▇▁█▆
[34m[1mwandb[0m:  val_precision ▁█▁▇▄
[34m[1mwandb[0m:     val_recall ▆▁█▄▇
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:          epoch 4
[34m[1mwandb[0m:  test_accuracy 0.98068
[34m[1mwandb[0m:  test_f1_score 0.9206
[34m[1mwandb[0m:      test_loss 0.06893
[34m[1mwandb[0m: test_precision 0.93598
[34m[1mwandb[0m:    test_recall 0.90572
[34m[1mwandb[0m: train_ac

Error: You must call wandb.init() before wandb.log()

In [None]:
torch.save(model.state_dict(), '/kaggle/working/out_model.pth')

In [None]:
test_df = pd.read_csv('/kaggle/input/wb_winter_24/test.csv', index_col='ID')
test_df = all_preprocessing(test_df)

In [None]:
model = CNNModel(embedding_dim=768, num_filters=1000, kernel_sizes=[3, 4, 5], num_classes=1).to(device)
model.load_state_dict(torch.load('/kaggle/working/out_model.pth'))
model.eval()  # Set the model to evaluation mode

def predict(texts):
    all_preds = []

    with torch.no_grad():
        for text in texts:
            input_ids, attention_mask = self.processor.encode(text)
            embeddings = self.processor.get_embeddings(input_ids, attention_mask).to(device)
            
            min_length = 5
            # Ensure sequences are padded to at least `min_length`
            if embeddings.size(1) < min_length:
                pad_size = min_length - padded_sequences.size(1)
                padding = torch.full((0, pad_size, 0), 0)
                embeddings = torch.cat((embeddings, padding), dim=1)
                print(embeddings.shape)

            # Get model output (probabilities)
            outputs = self.model(embeddings).squeeze()

            # Convert probabilities to binary predictions (0 or 1)
            preds = (outputs > 0.4).float()

            all_preds.append(preds.cpu().numpy())

    return all_preds

test_df['label'] = predict(test_df['text'].tolist())
test_df = test_df.drop(columns='text')
test_df.to_csv('/kaggle/working/test_predicted.csv')