In [1]:
!pip install pymorphy2 
!pip install --upgrade datasets
!pip install peft

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl.metadata (3.6 kB)
Collecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl.metadata (2.1 kB)
Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844
Collecting datasets
  Down

In [2]:
import pandas as pd

import torch
from transformers import AutoModelForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from peft import LoraConfig

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import datasets

from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop_words = stopwords.words("russian")

import re

import pymorphy2
morph = pymorphy2.MorphAnalyzer()

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def clean_text(text):
    # Remove URLs, numbers, and special characters
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # URLs
    text = re.sub(r'\d+', '', text)  # Numbers
    text = re.sub(r'[^\w\s]', '', text)  # Special characters
    return text

def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    # remove stop words
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    # lemmatize
    def lemmatize(tokens):
        return [morph.parse(word)[0].normal_form for word in tokens]
    tokens = lemmatize(tokens)
    return ' '.join(tokens)

def all_preprocessing(df):
    df['text'] = df['text'].apply(clean_text)
    df['text'] = df['text'].apply(preprocess_text)
    return df

In [4]:
import wandb

wandb.login(key='b123af3ff1bc7e54569d0976c6405a5b3b6d2902')

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
df = pd.read_csv('/kaggle/input/wb_winter_24/train.csv', index_col='ID')
#df = all_preprocessing(df)
df.head()

Unnamed: 0_level_0,text,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"Брюки отличные, качественные, но к сожалению к...",0
1,"Отличный аппарат, в комплекте кабель и работет...",0
2,Супер 👍 спасибо большое,0
3,Получил быстро данные наушники! К наушникам пр...,0
4,Всё дошло в целости и сохранности),0


# Prepare dataset

In [6]:
# Split dataset into training and validation
train_texts, train_labels = df['text'].tolist(), df['label'].tolist()
tokenizer = BertTokenizerFast.from_pretrained('blanchefort/rubert-base-cased-sentiment')

# Create a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Prepare datasets
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)

# Define compute metrics function
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(p.label_ids, preds),
        'f1': f1_score(p.label_ids, preds, average='weighted'),
        'precision': precision_score(p.label_ids, preds, average='weighted'),
        'recall': recall_score(p.label_ids, preds, average='weighted')
    }

tokenizer_config.json:   0%|          | 0.00/499 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/943 [00:00<?, ?B/s]

# BERT + LoRA Trainer

In [7]:
train_dataset = datasets.Dataset.from_pandas(df)

# Tokenization (use your tokenizer)
def tokenize_function(examples):
    tokenized = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)
    return tokenized

# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/240159 [00:00<?, ? examples/s]

In [8]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = AutoModelForSequenceClassification.from_pretrained(
    'blanchefort/rubert-base-cased-sentiment', num_labels=2, ignore_mismatched_sizes=True  # Assuming binary classification
)

lora_config = LoraConfig(
    r=64,  # LoRA rank
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["query", "value"]  # LoRA will only apply to these modules in the model
)

# Wrap the model with LoRA
model.add_adapter(lora_config, adapter_name='adapter')
model.to(device)
model.train()

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at blanchefort/rubert-base-cased-sentiment and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total parameters: 180214274
Trainable parameters: 2359296


In [9]:
# Set training arguments
training_args = TrainingArguments(
    output_dir='/kaggle/working/results',
    max_steps=10000,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_ratio=0.1,
    weight_decay=0.01,
    learning_rate=1e-4,
    eval_strategy="steps",
    save_strategy='steps',
    eval_steps=2000,
    save_steps=2000,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('/kaggle/working/fine_tuned_model')
tokenizer.save_pretrained('/kaggle/working/fine_tuned_model')


max_steps is given, it will override any value given in num_train_epochs
[34m[1mwandb[0m: Currently logged in as: [33mluizanigogosova[0m ([33mluezzka[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241107_101445-ttufive8[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m/kaggle/working/results[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/luezzka/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/luezzka/huggingface/runs/ttufive8[0m


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
2000,0.2591,0.246362,0.886026,0.89469,0.910635,0.886026
4000,0.1926,0.186554,0.928252,0.931474,0.937317,0.928252
6000,0.1555,0.146074,0.95844,0.957669,0.957331,0.95844
8000,0.1454,0.129756,0.96357,0.962981,0.962729,0.96357
10000,0.1311,0.127065,0.964553,0.964046,0.963805,0.964553




('/kaggle/working/fine_tuned_model/tokenizer_config.json',
 '/kaggle/working/fine_tuned_model/special_tokens_map.json',
 '/kaggle/working/fine_tuned_model/vocab.txt',
 '/kaggle/working/fine_tuned_model/added_tokens.json',
 '/kaggle/working/fine_tuned_model/tokenizer.json')

In [10]:
test_df = pd.read_csv('/kaggle/input/wb_winter_24/test.csv', index_col='ID')
#test_df = all_preprocessing(test_df)

model.eval()  # Set model to evaluation mode

def predict(text):
    # Tokenize input text
    inputs = tokenizer(text, padding=True, truncation=True, max_length=256, return_tensors="pt")
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

    return predictions.item()  # Return predicted label (0 or 1 for binary classification)

test_df['label'] = test_df['text'].apply(predict)
test_df = test_df.drop(columns='text')
test_df.to_csv('/kaggle/working/test_predicted.csv')

# BERT embeddings + CNN