In [49]:
import os
import re
import torch
import evaluate
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [50]:
# paths
filename = os.path.join("data", "combined_data.csv")

In [51]:
# load data
df = pd.read_csv(filename)

In [52]:
# cleaning
df['combined_text'] = df['title'] + ' ' + df['text']

# Clean up unused columns
df = df.drop(columns=['date', 'title', 'text'])

# remove duplicates
df = df.drop_duplicates()

In [53]:
# lowercase
df["combined_text"] = df["combined_text"].str.lower()

In [55]:
# split data in train and test
texts = df["combined_text"]
labels = df["label"]
train_df, test_df = train_test_split(
                                    df, 
                                    test_size=0.2,     # create an 80/20 split
                                    random_state=42, 
                                    stratify=df['label']    # keeps label balance in both splits
)

In [64]:
# fit tokenizer and classification model
model_name = "roberta-base"  # or the lighter "bert-base-uncased" / "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)

train_dataset = train_df.map(tokenize, batched=True)
test_dataset = test_df.map(tokenize, batched=True)

train_dataset = train_dataset.remove_columns(["text"]) # removing raw text column bc the model doesn’t need it
test_dataset = test_dataset.remove_columns(["text"])

# Set the format for PyTorch
train_dataset.set_format("torch")
test_dataset.set_format("torch")

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs"
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: tokenize() got an unexpected keyword argument 'batched'

In [None]:
# predict and evaluate
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, preds),
            "f1": f1_score(labels, preds)}

In [None]:
# train and evaluate
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.evaluate()