In [None]:
import pandas as pd
import re
import spacy
import nltk
import torch
import string
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from torch.nn import CrossEntropyLoss
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Read the article CSV into a DataFrame, report its (rows, columns) shape,
# and preview the first rows as the cell output.
df = pd.read_csv(
    "../HomeWork1/nyt.csv",
)
print(df.shape)
df.head(n=5)

In [None]:
def label_to_number(label):
    """Map a category name to its integer class id.

    Args:
        label: One of "business", "politics" or "sports", or an integer that
            is already a valid class id (0, 1 or 2).

    Returns:
        The class id (0, 1 or 2), or -1 when ``label`` is not recognised.
    """
    mapping = {"business": 0, "politics": 1, "sports": 2}
    # Already-encoded ids pass through unchanged, so applying this function a
    # second time (e.g. re-running the encoding cell) does not turn every
    # label into -1.
    if (
        isinstance(label, (int, np.integer))
        and not isinstance(label, bool)
        and label in mapping.values()
    ):
        return int(label)
    return mapping.get(label, -1)

# Encode into a temporary Series first so the original values are still
# available for the error message when something fails to map.
encoded_labels = df["label"].apply(label_to_number)
unmapped_rows = encoded_labels == -1
if unmapped_rows.any():
    # -1 is the "unknown" sentinel of label_to_number; it is not a valid class
    # id for the 3-class model, so fail here rather than during training.
    raise ValueError(
        "Unmapped label values (unexpected category, or the column was "
        "already encoded): "
        f"{sorted(df.loc[unmapped_rows, 'label'].astype(str).unique())}"
    )
df["label"] = encoded_labels

In [None]:
# Load the small English spaCy pipeline. It is installed only when missing,
# and through the kernel's own interpreter (sys.executable) so the package
# lands in the environment this notebook is actually running in.
try:
    sp = spacy.load("en_core_web_sm")
except OSError:
    import importlib
    import subprocess
    import sys

    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    # Make the freshly installed package visible to the running process.
    importlib.invalidate_caches()
    sp = spacy.load("en_core_web_sm")

# NLTK resources: the stop-word list and the tokenizer data.
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")

# English stop words consulted by clean() when stop_w=True.
nltk_st = stopwords.words("english")

In [None]:
def clean(text, http=True, punc=True, lem=True, stop_w=True):
    """Normalise a raw text string.

    Steps, in order:
      1. optionally remove ``t.co`` short links (``http``);
      2. optionally drop words found in ``nltk_st`` (``stop_w``);
      3. optionally replace each token by its spaCy lemma (``lem``);
      4. lower-case and expand common English contractions;
      5. optionally strip ASCII punctuation (``punc``);
      6. turn remaining non-word characters into spaces, collapse runs of
         whitespace and trim both ends.

    Contractions are expanded *before* punctuation is stripped; otherwise the
    apostrophes the expansion rules rely on would already be gone.

    Args:
        text: Input string.
        http: Remove ``http(s)://t.co/...`` links when true.
        punc: Remove every character in ``string.punctuation`` when true.
        lem: Lemmatise with the module-level spaCy pipeline ``sp`` when true.
        stop_w: Remove stop words (module-level ``nltk_st``) when true.

    Returns:
        The cleaned, lower-cased string.
    """
    if http:
        # Raw string with an escaped dot so only the literal host "t.co" matches.
        text = re.sub(r"https?://t\.co/[A-Za-z0-9]*", "", text)
    if stop_w:
        kept_words = [word for word in word_tokenize(text) if word.lower() not in nltk_st]
        text = " ".join(kept_words)
    if lem:
        text = " ".join(token.lemma_ for token in sp(text))

    text = text.lower()

    # Contraction expansion. Specific forms come before the generic suffix
    # rules that would otherwise consume them first.
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"'scuse", " excuse", text)  # must precede the generic 's rule
    text = re.sub(r"'s", " ", text)
    text = re.sub(r"'ve", " have", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"i'm", "i am", text)
    # Whole word only: a bare "im" would also rewrite "time", "important", ...
    text = re.sub(r"\bim\b", "i am", text)
    text = re.sub(r"'re", " are", text)
    text = re.sub(r"'d", " would", text)
    text = re.sub(r"'ll", " will", text)

    if punc:
        text = text.translate(str.maketrans("", "", string.punctuation))

    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [None]:
%time
DO_PREPROCESS = True
if DO_PREPROCESS:
    df["cleaned_text"] = df["text"].apply(lambda text: clean(text, http=True, punc=False, lem=False, stop_w=False))
else:
    df["cleaned_text"] = df["text"]
df.drop(columns=["text"], axis=1, inplace=True)
df.head()

In [None]:
# Hold out 20% of the rows (stratified by label), then split that hold-out in
# half (again stratified) into validation and test sets.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df["label"],
)

In [None]:
# Wrap each pandas split in a Hugging Face Dataset (train, validation, test).
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_df, val_df, test_df)
)

# Tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(batch):
    """Tokenize the ``cleaned_text`` entries of a batch.

    Every sequence is truncated and padded to exactly 64 tokens using the
    module-level ``tokenizer``; the tokenizer's output is returned as is.
    """
    texts = batch["cleaned_text"]
    return tokenizer(
        texts,
        max_length=64,
        padding="max_length",
        truncation=True,
    )

# Tokenize all three splits in batched mode, in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

print(train_dataset)

In [None]:
# Per-class loss weights keyed by label id: ids 0 and 1 count twice as much
# as id 2.
label_weights = {0: 2.0, 1: 2.0, 2: 1.0}

# BERT encoder with a 3-way sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
)

In [None]:
def compute_metrics(p):
    """Compute accuracy, macro-F1 and micro-F1 for an evaluation pass.

    Args:
        p: A pair ``(scores, labels)``; the predicted class of each row is the
            arg-max of ``scores`` along axis 1.

    Returns:
        A dict with the keys "accuracy", "macro_f1" and "micro_f1".
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)
    return {
        "accuracy": accuracy_score(labels, predicted),
        "macro_f1": f1_score(labels, predicted, average="macro"),
        "micro_f1": f1_score(labels, predicted, average="micro"),
    }

In [None]:
class DataCollator:
    """Callable that turns a list of tokenized examples into one padded batch.

    Only ``input_ids``, ``attention_mask`` and the example's ``label`` (renamed
    to ``labels``) are kept; padding is delegated to the module-level
    ``tokenizer``.
    """

    def __call__(self, features):
        # Keep just the fields forwarded to the model, renaming label -> labels.
        records = []
        for example in features:
            records.append(
                {
                    "input_ids": example["input_ids"],
                    "attention_mask": example["attention_mask"],
                    "labels": example["label"],
                }
            )
        # Pad to a fixed length of 64 and return PyTorch tensors.
        return tokenizer.pad(
            records,
            padding="max_length",
            max_length=64,
            pad_to_multiple_of=4,
            return_tensors="pt",
        )

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    Args:
        label_weights: Optional mapping ``{class_id: weight}`` with ids
            ``0 .. n-1``. When omitted, the cross-entropy is unweighted.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, label_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.label_weights = label_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and return the (weighted) cross-entropy.

        Args:
            model: The model to run; may be a wrapped version of ``self.model``.
            inputs: Batch dict; its "labels" entry is removed before the
                forward pass so the model does not compute its own loss.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted because recent transformers releases
                pass it as a keyword; unused here, since the loss below is a
                weighted mean over the batch.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        class_weights = None
        if self.label_weights is not None:
            # Placed on the logits' device: a wrapped model (e.g. DataParallel)
            # does not expose a `.device` attribute.
            class_weights = torch.tensor(
                [self.label_weights[i] for i in range(len(self.label_weights))],
                device=logits.device,
            )

        loss_fct = CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Mixed precision is requested only when a CUDA device is present: bf16 when
# the GPU supports it, fp16 otherwise. Without CUDA both flags stay off so
# TrainingArguments does not reject the configuration on CPU-only machines.
use_cuda = torch.cuda.is_available()
use_bf16 = use_cuda and torch.cuda.is_bf16_supported()
use_fp16 = use_cuda and not use_bf16

training_args = TrainingArguments(
    output_dir="output",
    bf16=use_bf16,
    fp16=use_fp16,
    learning_rate=4e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    remove_unused_columns=True,
    warmup_ratio=0.1,
    num_train_epochs=3,
    weight_decay=0.001,
    do_eval=True,
    # Evaluate and checkpoint on the same 100-step cadence.
    eval_strategy="steps",
    eval_steps=100,
    save_total_limit=1,
    save_strategy="steps",
    save_steps=100,
    logging_steps=100,
    # Reload the checkpoint with the highest validation accuracy at the end.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_only_model=True,
    lr_scheduler_type="cosine",
    report_to="none"
)

# Assemble the weighted-loss trainer from the objects built in earlier cells.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollator(),
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    label_weights=label_weights,
)

# Fine-tune, then show the validation metrics as the cell output.
trainer.train()
trainer.evaluate()

In [None]:
# Score the held-out test split and print the three reported metrics,
# one per line.
predictions = trainer.predict(test_dataset).metrics
print(
    f"Accuracy Score: {predictions['test_accuracy']: .3f}",
    f"Macro F1-score: {predictions['test_macro_f1']: .3f}",
    f"Micro F1-score: {predictions['test_micro_f1']: .3f}",
    sep="\n",
)