In [None]:
# %pip (unlike !pip, which runs in a subshell) installs into the environment
# of the kernel that is executing this notebook.
%pip install torch transformers



In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW

def fine_tune_roberta_for_rumor_detection(train_texts, train_labels, val_texts, val_labels, epochs=3, batch_size=8, learning_rate=2e-5, device=None):
    """Fine-tune ``roberta-base`` as a two-class classifier and return the model.

    The model is trained for ``epochs`` passes over the training split. After
    every pass the mean validation loss and the validation accuracy are printed.

    Args:
        train_texts: Sequence of training strings.
        train_labels: Integer class ids (0 or 1, the classifier head has two
            outputs) aligned with ``train_texts``.
        val_texts: Sequence of validation strings.
        val_labels: Integer class ids (0 or 1) aligned with ``val_texts``.
        epochs: Number of passes over the training split (at least 1).
        batch_size: Examples per batch for both data loaders.
        learning_rate: Peak learning rate of the one-cycle schedule.
        device: Device to train on, e.g. ``'cuda'`` or ``'cpu'``. ``None``
            selects CUDA when it is available and falls back to the CPU.

    Returns:
        The fine-tuned ``RobertaForSequenceClassification`` instance, left on
        ``device`` and in evaluation mode.

    Raises:
        ValueError: If a split is empty, if texts and labels of a split differ
            in length, or if ``epochs`` is smaller than 1.
    """
    # Validate cheaply before downloading weights or touching the device.
    if len(train_texts) != len(train_labels) or len(val_texts) != len(val_labels):
        raise ValueError('texts and labels must have the same length within each split')
    if len(train_texts) == 0 or len(val_texts) == 0:
        raise ValueError('training and validation splits must both be non-empty')
    if epochs < 1:
        raise ValueError('epochs must be at least 1')

    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    # Initialize RoBERTa tokenizer and model
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
    model.to(device)

    # Tokenize each split in one call: sequences are padded to the longest one
    # in the split and truncated to the tokenizer's maximum length.
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors='pt')

    # Integer (long) labels make the model use its single-label
    # classification loss.
    train_label_tensor = torch.tensor(train_labels, dtype=torch.long)
    val_label_tensor = torch.tensor(val_labels, dtype=torch.long)

    # The datasets stay on the CPU; only the current batch is moved to the
    # device, so device memory does not grow with the size of the corpus.
    train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_label_tensor)
    val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_label_tensor)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
    # weight_decay are set to the deprecated class's defaults.
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
    # The scheduler is stepped once per batch, so its horizon is the total
    # number of optimizer steps across all epochs.
    total_steps = len(train_loader) * epochs
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=learning_rate, total_steps=total_steps)

    for epoch in range(epochs):
        # Training pass
        model.train()
        for batch in train_loader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

        # Validation pass
        model.eval()
        val_loss = 0.0
        correct_predictions = 0
        total_samples = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()
                predictions = torch.argmax(outputs.logits, dim=1)
                correct_predictions += (predictions == labels).sum().item()
                total_samples += labels.size(0)

        # Loss is averaged over batches, accuracy over individual samples.
        average_val_loss = val_loss / len(val_loader)
        accuracy = correct_predictions / total_samples

        print(f'Epoch {epoch + 1}/{epochs}, Val Loss: {average_val_loss}, Accuracy: {accuracy}')

    return model

In [None]:
# Install the Kaggle API token: copy kaggle.json from the working directory
# into ~/.kaggle/, creating that directory if needed.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the bjoernjostein/fake-news-data-set archive from Kaggle into the
# working directory (saved as fake-news-data-set.zip).
!kaggle datasets download -d bjoernjostein/fake-news-data-set

Downloading fake-news-data-set.zip to /content
 76% 41.0M/53.8M [00:00<00:00, 49.8MB/s]
100% 53.8M/53.8M [00:00<00:00, 56.8MB/s]


In [None]:
# Extract the archive into the working directory. -o overwrites existing files
# without asking, so re-running this cell does not stall on an interactive
# "replace?" prompt that a notebook cell cannot answer.
!unzip -o fake-news-data-set.zip

Archive:  fake-news-data-set.zip
  inflating: sample_submission.csv   
  inflating: test/test.csv           
  inflating: train/train.csv         
  inflating: val/val.csv             


In [None]:
import pandas as pd

# Load the training split. The archive was extracted relative to the working
# directory, so a relative path keeps the notebook runnable outside Colab's
# /content directory as well.
file_path = 'train/train.csv'
train_df = pd.read_csv(file_path)
print(train_df.head())

                                                text label
0  The court granted by a 5-4 vote a request made...  real
1  " Pennsylvania was a crucial swing state in th...  real
2  The company today is rolling out an update to ...  fake
3  When it comes to trade policy, Hillary Clinton...  real
4  S. stocks had their worst April start since 19...  real


In [None]:
# Load the validation split. The archive was extracted relative to the working
# directory, so a relative path keeps the notebook runnable outside Colab's
# /content directory as well.
file_path = 'val/val.csv'
val_df = pd.read_csv(file_path)
print(val_df.head())

                                                text label
0  Police investigating Saturday's fatal shooting...  real
1  A car bomb in Turkey's eastern province of Ela...  real
2  Democratic presidential candidate Hillary Clin...  real
3  Generally, each party gets two turns to decide...  fake
4  Securitas CEO Alf Goransson told Reuters: * Se...  real


In [None]:
# Split each frame into plain Python lists by column position:
# column 0 holds the article text, column 1 the label.
train_text, train_labels = (train_df.iloc[:, col].to_list() for col in (0, 1))
val_text, val_labels = (val_df.iloc[:, col].to_list() for col in (0, 1))

In [None]:
# Encode the string labels as class ids: 'real' -> 0, anything else -> 1.
# The lists are rebuilt from the DataFrame label column (column 1) instead of
# being rewritten in place; an in-place rewrite turns every label into 1 when
# the cell is run a second time, because the ints no longer equal 'real'.
train_labels = [0 if label == 'real' else 1 for label in train_df.iloc[:, 1]]
val_labels = [0 if label == 'real' else 1 for label in val_df.iloc[:, 1]]

In [None]:
# Fine-tune on the first 1000 examples of each split, keeping the function's
# default epochs, batch size and learning rate.
model = fine_tune_roberta_for_rumor_detection(
    train_texts=train_text[:1000],
    train_labels=train_labels[:1000],
    val_texts=val_text[:1000],
    val_labels=val_labels[:1000],
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Val Loss: 0.06791392429172993, Accuracy: 0.98
Epoch 2/3, Val Loss: 0.03762195019610226, Accuracy: 0.988
Epoch 3/3, Val Loss: 0.07925761113222689, Accuracy: 0.98
