# Dependencies

In [4]:
!pip install pandas
!pip install transformers
!pip install --upgrade jupyter ipywidgets
!pip install torch torchvision torchaudio
!pip install tensorflow
!pip install flax
!pip install evaluate

Collecting transformers
  Downloading transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
     ---------------------------------------- 0.0/138.0 kB ? eta -:--:--
     -- ------------------------------------- 10.2/138.0 kB ? eta -:--:--
     -------------------------------------- 138.0/138.0 kB 2.1 MB/s eta 0:00:00
Collecting filelock (from transformers)
  Downloading filelock-3.14.0-py3-none-any.whl.metadata (2.8 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Using cached huggingface_hub-0.22.2-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp312-none-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp312-none-win_amd64.whl.metadata (3.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.19.3->transformers)
  Using cached fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
Downloading transformers-4.40.1-py3-none-any.

# Import data

In [5]:
import pandas as pd

# Read the labelled text sample; the path is relative to the notebook's working directory.
data = pd.read_csv(
    filepath_or_buffer="../assign1/datasets/small_data_sample.csv",
)
# Keep the frame as the cell's last expression so Jupyter renders it.
data

Unnamed: 0,text,label
0,"Explosion Rocks Baghdad Neighborhood BAGHDAD, ...",0
1,BBC reporters' log BBC correspondents record e...,0
2,Israel welcomes Rice nomination; Palestinians ...,0
3,Medical Journal Calls for a New Drug Watchdog ...,0
4,Militants Kidnap Relatives of Iraqi Minister-T...,0
...,...,...
39995,Microsoft Tests MSN Messenger 7.0 Updated chat...,3
39996,Vonage talks of 911 advancements In Rhode Isla...,3
39997,Exploring Venus: The Hothouse Planet Before sp...,3
39998,PayPal to Impose Fines for Breaking Bans - On...,3


---

In [6]:
from datasets import Dataset

# Wrap the pandas frame in a Hugging Face `Dataset` so it can be split and tokenized below.
dataset_hf = Dataset.from_pandas(df=data)

In [7]:
from datasets import DatasetDict

# Fixed seed so the shuffled splits are identical on every "Restart & Run All".
SPLIT_SEED = 42

# 90% train, 10% held out for test + validation
train_test = dataset_hf.train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Halve the held-out 10%: one half becomes validation, the other half test
valid_test = train_test['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

train_valid_test_dataset = DatasetDict({
    'train': train_test['train'],
    'test': valid_test['test'],
    'validation': valid_test['train']
})

---

| Model Label | Model Label Value | -> | Data Label | Data Label Value |
| -------- | -------- | -------- | -------- | -------- |
| 0 | Sports | -> | 1 | Sports |
| 1 | Arts, Culture, and Entertainment | -> | 0 | World |
| 2 | Business and Finance | -> | 2 | Business |
| 3 | Health and Wellness | -> | 0 | World |
| 4 | Lifestyle and Fashion | -> | 0 | World |
| 5 | Science and Technology | -> | 3 | Sci/Tech |
| 6 | Politics | -> | 0 | World |
| 7 | Crime | -> | 0 | World |

In [8]:
# Dataset label id -> human-readable category name.
label_dict = dict(enumerate(('World', 'Sports', 'Business', 'Sci/Tech')))

In [9]:
# Hugging Face Hub checkpoint used throughout this notebook.
model_name = "dstefa/roberta-base_topic_classification_nyt_news"

# Model label name -> dataset label id. Model categories without a direct
# dataset counterpart are all folded into id 0.
predictions_dict = dict([
    ("Sports", 1),
    ("Arts, Culture, and Entertainment", 0),
    ("Business and Finance", 2),
    ("Health and Wellness", 0),
    ("Lifestyle and Fashion", 0),
    ("Science and Technology", 3),
    ("Politics", 0),
    ("Crime", 0),
])

In [10]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

def print_metrics(y_test, y_pred):
    """Print a classification report for ``y_pred`` against ``y_test``.

    Output, in order: the confusion matrix, the accuracy, then the
    macro-averaged precision, recall and F1 score.
    """
    print(confusion_matrix(y_test, y_pred))
    print('Accuracy: ', accuracy_score(y_test, y_pred))

    # The remaining scores share one call shape, so drive them from a table.
    macro_scorers = (
        ('Precision: ', precision_score),
        ('Recall: ', recall_score),
        ('F1: ', f1_score),
    )
    for heading, scorer in macro_scorers:
        print(heading, scorer(y_test, y_pred, average='macro'))

# Transformer

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TextClassificationPipeline

# Fetch the pretrained classifier and its matching tokenizer for `model_name`.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Bundle both into a text-classification pipeline.
pipe = TextClassificationPipeline(tokenizer=tokenizer, model=model)

In [12]:
# Classify all texts in one pipeline call instead of one call per row.
# `truncation=True` clips inputs longer than the model's maximum sequence
# length, which would otherwise make the forward pass fail.
pipeline_outputs = pipe(data['text'].tolist(), truncation=True)
predictions = pd.Series(
    [output['label'] for output in pipeline_outputs],
    index=data.index,
)

# Fail loudly (as the previous per-row dictionary lookup did) if the model
# emits a label that has no entry in predictions_dict.
unmapped_labels = sorted(set(predictions) - set(predictions_dict))
if unmapped_labels:
    raise KeyError(f"No dataset label mapped for model labels: {unmapped_labels}")

# Translate model label names into dataset label ids using predictions_dict
data['predictions_label'] = predictions.map(predictions_dict)

In [None]:
# Score the mapped model predictions against the dataset's own labels
print_metrics(y_test=data['label'], y_pred=data['predictions_label'])

[[2 0 2 0]
 [0 4 0 0]
 [0 0 5 0]
 [1 0 4 2]]
Accuracy:  0.65
Precision:  0.7803030303030303
Recall:  0.6964285714285714
F1:  0.660218253968254


# Fine-tuning 

## Tokenize dataset

In [None]:
def preprocess_function(sample):
    """Tokenize the ``"text"`` field of ``sample`` with the notebook's tokenizer.

    Truncation is enabled; no padding is requested here.
    """
    texts = sample["text"]
    return tokenizer(texts, truncation=True)

In [None]:
# Tokenize every split; batched=True hands preprocess_function a batch of rows per call.
tokenized_dataset = train_valid_test_dataset.map(
    function=preprocess_function,
    batched=True,
)
tokenized_dataset

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 9000
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 500
    })
})

## Train the model

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from evaluate import load
import numpy as np

# Accuracy metric from the `evaluate` library, used by compute_metrics below.
metric = load(path="accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each row is the argmax of its logits along the last axis; the result of
    ``metric.compute`` is returned unchanged.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_ids)

# Hyper-parameters and checkpointing policy for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # validate and checkpoint once per epoch, then reload the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# The tokenized dataset is unpadded (preprocess_function only truncates), so
# padding is applied per batch by this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): `model` is the same object wrapped by `pipe`, so training
# updates the pipeline's weights too.
# NOTE(review): predictions_dict lists 8 model labels whose ids follow a
# different order than the 4 dataset label ids used as training targets here —
# confirm that fine-tuning the existing head on those ids is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# A trailing `DataLoaderConfiguration(dispatch_batches=None)` assignment was
# removed: the name was never imported (NameError on a fresh kernel) and the
# resulting object was not passed to anything.


In [None]:
# Run the fine-tuning loop on the train split configured in the Trainer.
trainer.train()

  0%|          | 0/1689 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.5553, 'learning_rate': 1.4079336885731204e-05, 'epoch': 0.89}


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.3791920244693756, 'eval_accuracy': 0.868, 'eval_runtime': 85.0673, 'eval_samples_per_second': 5.878, 'eval_steps_per_second': 0.376, 'epoch': 1.0}


In [None]:
# No dataset is passed, so the Trainer evaluates on the eval_dataset it was
# constructed with (the validation split) and reports compute_metrics' output.
trainer.evaluate()

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.0839295387268066,
 'eval_accuracy': 0.0,
 'eval_runtime': 0.237,
 'eval_samples_per_second': 4.219,
 'eval_steps_per_second': 4.219,
 'epoch': 3.0}

In [None]:
# Run inference on the held-out test split; the cell output shows predictions, label ids and metrics.
trainer.predict(tokenized_dataset["test"])

  0%|          | 0/1 [00:00<?, ?it/s]

[[ 8.268792   -0.4930358   0.16729748 -2.335821   -0.8009019  -2.5005388
  -1.7868569  -0.05294482]] [1]


PredictionOutput(predictions=array([[ 8.268792  , -0.4930358 ,  0.16729748, -2.335821  , -0.8009019 ,
        -2.5005388 , -1.7868569 , -0.05294482]], dtype=float32), label_ids=array([1], dtype=int64), metrics={'test_loss': 8.762733459472656, 'test_accuracy': 0.0, 'test_runtime': 0.1645, 'test_samples_per_second': 6.08, 'test_steps_per_second': 6.08})

## Save the model

In [None]:
# No path is given, so the model is written to the Trainer's configured
# output_dir ("./results" in training_args).
trainer.save_model()