In [1]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

In [2]:
# Read both tab-separated splits of the Kaggle fake-news-ru dataset.
train_data, test_data = (
    pd.read_csv(f'/kaggle/input/fake-news-ru-dataset/{split}.tsv', sep='\t')
    for split in ('train', 'test')
)

In [3]:
# Stack the two frames row-wise into a single frame with a fresh 0..n-1 index.
# NOTE(review): this folds test.tsv into the pool that is later split into
# train/validation, so no untouched hold-out set remains. It also assumes that
# test.tsv carries genuine `is_fake` labels rather than placeholders — TODO confirm.
df = pd.concat((train_data, test_data), ignore_index=True)

In [18]:
# Summary statistics of the label column; for a 0/1 label the mean is the
# fraction of rows marked as fake.
df.loc[:, "is_fake"].describe()

count    6758.000000
mean        0.426014
std         0.494532
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: is_fake, dtype: float64

In [4]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
titles = df["title"].tolist()
fake_flags = df["is_fake"].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    titles,
    fake_flags,
    test_size=0.2,
    random_state=42,
)

In [5]:
!export WANDB_DISABLED=true

In [6]:
import os

# Keep Weights & Biases from syncing or prompting during training: run it in
# offline mode and flag the integration as disabled.
os.environ.update(
    {
        "WANDB_MODE": "offline",
        "WANDB_DISABLED": "true",
    }
)


In [7]:
# Tokenizer of the RuBERT checkpoint that is fine-tuned later in the notebook.
tokenizer_checkpoint = "DeepPavlov/rubert-base-cased"
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, lazily on access.

    Args:
        texts: Sequence of input texts, indexable by integer position.
        labels: Sequence of class labels aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors carrying a leading
            batch dimension of size 1.
        max_length: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail fast here: a mismatch would otherwise surface as an IndexError
        # in the middle of training, or silently drop the surplus labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return the tensors for one sample.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (batch dimension
            removed) and ``'labels'`` as a ``torch.long`` scalar tensor.
        """
        encodings = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            # squeeze(0) drops only the batch dimension; a bare squeeze() would
            # also collapse the sequence dimension when max_length == 1.
            'input_ids': encodings['input_ids'].squeeze(0),
            'attention_mask': encodings['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Build the training and validation datasets from the split texts and labels,
# both sharing the same tokenizer and the default max_length.
train_dataset, val_dataset = (
    FakeNewsDataset(texts, labels, tokenizer)
    for texts, labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

In [8]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [9]:
# Baseline setup, kept for reference only (commented out, never executed).
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
    
#     num_train_epochs=3,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     logging_steps=10,
#     load_best_model_at_end=True
# )

In [39]:
# Load the pretrained RuBERT encoder with a two-class classification head.
# The head is not part of the checkpoint, so it starts from newly initialised
# weights and has to be trained.
model = BertForSequenceClassification.from_pretrained(
    "DeepPavlov/rubert-base-cased",
    num_labels=2,
)

# Training configuration for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results/rubert-fake-news-classification",
    # Evaluate and checkpoint once per epoch. The two strategies are kept in
    # step because load_best_model_at_end needs a checkpoint for every
    # evaluation it compares.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy`; use whichever the installed version accepts.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    num_train_epochs=1,
    weight_decay=0.01,
    learning_rate=1e-5,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    # Switch off all logging integrations (W&B etc.) explicitly. The
    # WANDB_DISABLED environment variable is deprecated, as the Trainer's own
    # warning states, and `report_to` is the supported replacement.
    report_to="none",
)

def compute_metrics(pred):
    """Turn a Trainer evaluation prediction into classification metrics.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (ground-truth class ids).

    Returns:
        dict with ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1-score"``; the last three are computed with ``average="binary"``.
    """
    y_true = pred.label_ids
    # The highest-scoring class along axis 1 is the predicted label.
    y_pred = np.argmax(pred.predictions, axis=1)

    accuracy = accuracy_score(y_true, y_pred)
    prf_support = precision_recall_fscore_support(y_true, y_pred, average="binary")

    # prf_support is (precision, recall, f-score, support); support is unused.
    return {
        "accuracy": accuracy,
        "precision": prf_support[0],
        "recall": prf_support[1],
        "f1-score": prf_support[2],
    }

# Wire model, configuration, data and metric callback into a single Trainer.
# val_dataset is what every evaluation pass is scored against.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [40]:
# Train the model. The call's return value (a TrainOutput) is the cell's
# displayed result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1-score
1,0.6126,0.383399,0.857988,0.81761,0.872483,0.844156


TrainOutput(global_step=676, training_loss=0.41696186143265673, metrics={'train_runtime': 106.2748, 'train_samples_per_second': 50.868, 'train_steps_per_second': 6.361, 'total_flos': 355594591319040.0, 'train_loss': 0.41696186143265673, 'epoch': 1.0})

In [41]:
# Run one more evaluation pass on the Trainer's eval_dataset (val_dataset) and
# print the resulting metric dictionary.
results = trainer.evaluate()
print("Model Evaluation:", results)

Model Evaluation: {'eval_loss': 0.38339897990226746, 'eval_accuracy': 0.8579881656804734, 'eval_precision': 0.8176100628930818, 'eval_recall': 0.87248322147651, 'eval_f1-score': 0.8441558441558441, 'eval_runtime': 5.9687, 'eval_samples_per_second': 226.514, 'eval_steps_per_second': 28.314, 'epoch': 1.0}


In [42]:
!pip install huggingface_hub



In [43]:
from huggingface_hub import notebook_login
# Show the Hugging Face login widget in the cell output so credentials are
# entered interactively rather than written into the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [44]:
from huggingface_hub import HfApi

# Repository id on the Hugging Face Hub; it is not used within this cell.
repo_name = "tellowit/rubert-fake-news-classification"

# Write the fine-tuned weights and the tokenizer files into one directory so
# the model can be reloaded from a single path.
final_model_dir = "./final_model"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/vocab.txt',
 './final_model/added_tokens.json')