# Phase1: Prepare Data

In [3]:
# !pip install datasets
! pip install -U datasets huggingface_hub fsspec
# ! pip install --quiet --upgrade transformers huggingface_hub
# ! pip install -U datasets evaluate wandb
# ! pip install fsspec==2023.9.2

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [4]:
from datasets import load_dataset, Features, Value
import numpy as np
import pandas as pd
import os
# Seed NumPy's global RNG so the np.random.choice-based validation sampling is reproducible.
np.random.seed(42)
# Configure PyTorch's CUDA allocator through its environment variable.
# NOTE(review): presumably meant to reduce memory fragmentation during training, and
# presumably has to be set before CUDA is first initialised to take effect — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


## load and divide dataset

In [5]:
# Load DBpedia-14 and keep a fixed-size, seeded subset of each official split.
dataset = load_dataset("dbpedia_14")
train = dataset['train'].shuffle(seed=42).select(range(28000))
test_full = dataset['test'].shuffle(seed=42).select(range(28000))

# Read the label column once instead of once per class.
test_full_labels = list(test_full['label'])

# Group row positions by class in a single pass; positions stay in ascending
# order, exactly as a per-class scan over the column would produce them.
indices_by_label = {}
for position, label in enumerate(test_full_labels):
    indices_by_label.setdefault(label, []).append(position)

# Stratified validation split: up to 1000 rows per class, sampled without
# replacement from NumPy's global RNG. Classes are visited in sorted order so
# the RNG is consumed in a well-defined order.
validation_indexes = []
for label in sorted(indices_by_label):
    label_indices = indices_by_label[label]
    selected_indexes = np.random.choice(label_indices, min(1000, len(label_indices)), replace=False)
    validation_indexes.extend(selected_indexes.tolist())

val = test_full.select(validation_indexes)

# Everything not drawn for validation becomes the test set. Membership is
# checked against a set; checking against the list made this step quadratic.
validation_index_set = set(validation_indexes)
remained_test_indexes = [i for i in range(len(test_full)) if i not in validation_index_set]
test = test_full.select(remained_test_indexes)

# The two splits must partition test_full.
assert len(val) + len(test) == len(test_full)

print(len(train))
print(len(test))
print(len(val))
print(pd.Series(train['label']).value_counts())
print(pd.Series(test['label']).value_counts())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


28000
14000
14000
5     2081
3     2037
6     2032
1     2014
12    2012
11    2003
13    2000
9     1999
10    1998
0     1987
8     1986
2     1981
7     1965
4     1905
Name: count, dtype: int64
4     1075
13    1038
12    1028
8     1019
11    1017
9     1009
2     1006
0      986
10     982
7      978
5      978
6      971
1      968
3      945
Name: count, dtype: int64


## preprocess

In [6]:
# Class names declared by the dataset's 'label' feature, indexed by class id.
label_names = dataset['train'].features['label'].names
# class id -> class name; used to turn numeric labels into target text.
label_map = dict(enumerate(label_names))
label_map

{0: 'Company',
 1: 'EducationalInstitution',
 2: 'Artist',
 3: 'Athlete',
 4: 'OfficeHolder',
 5: 'MeanOfTransportation',
 6: 'Building',
 7: 'NaturalPlace',
 8: 'Village',
 9: 'Animal',
 10: 'Plant',
 11: 'Album',
 12: 'Film',
 13: 'WrittenWork'}

In [7]:
from transformers import T5Tokenizer
import re

In [8]:
# Tokenizer of the pretrained "t5-small" checkpoint; preprocess_data uses it for both inputs and targets.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def remove_space(text):
    """Collapse every run of whitespace in ``text`` into one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()

def preprocess_data_check(row):
    """Return a readable preview of one example after prompt/label formatting.

    Args:
        row: mapping with at least a ``'content'`` string and a numeric ``'label'``
            that is a key of ``label_map``.

    Returns:
        A new dict with the same keys as ``row`` where ``'label'`` holds the class
        name and ``'content'`` holds the prompt text. ``row`` itself is left
        untouched so the same example can be inspected again.

    Raises:
        KeyError: if ``row['label']`` is not a key of ``label_map``.
    """
    # Named `prompt` rather than `input` to avoid shadowing the builtin.
    prompt = f"what is the label for: {remove_space(row['content'])}"

    preview = dict(row)
    preview['label'] = label_map[row['label']]
    preview['content'] = prompt
    return preview


def preprocess_data(examples):
    """Tokenise a batch of examples into fixed-length model inputs and targets.

    Args:
        examples: batch mapping with ``'content'`` (texts) and ``'label'``
            (numeric class ids that are keys of ``label_map``).

    Returns:
        The tokenizer output for the prompts (padded/truncated to 512 tokens)
        with an extra ``"labels"`` entry holding the token ids of the class
        names (padded/truncated to 8 tokens).
    """
    prompts = []
    for text in examples['content']:
        prompts.append(f"what is the label for: {remove_space(text)}")

    # Numeric class ids -> class-name strings used as generation targets.
    target_texts = [label_map[class_id] for class_id in examples['label']]

    # Options shared by the input and the target tokenisation.
    shared_options = dict(padding="max_length", truncation=True, return_tensors="pt")

    encoded_inputs = tokenizer(prompts, max_length=512, **shared_options)
    encoded_targets = tokenizer(target_texts, max_length=8, **shared_options)

    # NOTE(review): padded target positions keep the tokenizer's pad id here, so
    # they presumably still count towards the training loss; replacing them with
    # -100 would need the evaluation cells, which decode these label ids
    # directly, to be adapted as well — confirm before changing.
    encoded_inputs["labels"] = encoded_targets["input_ids"]
    return encoded_inputs


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
# Print the first validation example after prompt/label formatting (no tokenisation involved).
print(preprocess_data_check(val[0]))


{'label': 'Company', 'title': 'Taishin Futures Co. Ltd.', 'content': 'what is the label for: Taishin Futures Co. Ltd. was founded in 1997 as a joint venture between Taishin Securities Co. Ltd. and Taishin International Bank and is a member of the Taishin Financial Holdings. It involves in futures including brokerage consultation management and dealer business. Taishin Futures Co. Ltd. is a member of Taiwan Futures Exchange.'}


In [10]:
# Tokenise each split in batches.
train_dataset = train.map(preprocess_data, batched=True)
val_dataset = val.map(preprocess_data, batched=True)
test_dataset = test.map(preprocess_data, batched=True)

# Expose only the model-facing columns, as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "labels"]
for tokenized_split in (train_dataset, val_dataset, test_dataset):
    tokenized_split.set_format("torch", columns=tensor_columns)



Map:   0%|          | 0/28000 [00:00<?, ? examples/s]

Map:   0%|          | 0/14000 [00:00<?, ? examples/s]

Map:   0%|          | 0/14000 [00:00<?, ? examples/s]

# Phase2: wandb

In [11]:
!pip install wandb -qU

In [12]:
import wandb
import random
import math

In [13]:
# Authenticate this session with Weights & Biases before a run is created.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mkiarash-astanboos[0m ([33mkiarash-astanboos-ferdowsi-university-of-mashhad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [14]:
# Open the W&B run that receives everything logged from this notebook.
# `project` groups runs together; `name` fixes the run name instead of letting
# W&B assign a random one.
wandb.init(project="Neural Network LLM HW5", name="final")

# Phase3: Train model

In [15]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorWithPadding


In [16]:
# Start from the pretrained "t5-small" checkpoint.
model = T5ForConditionalGeneration.from_pretrained("t5-small")
# Batches are assembled by the padding collator built on the notebook's tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Reference:
# https://www.kdnuggets.com/how-to-fine-tune-t5-for-question-answering-tasks-with-hugging-face-transformers
training_args = TrainingArguments(
    # Output / logging locations
    output_dir="./t5_dbpedia_results",
    logging_dir="./logs",
    logging_steps=500,
    report_to="wandb",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=2,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Mixed precision (alternatives kept for reference: fp16=False, bf16=True)
    fp16=True,
)

In [17]:
# Wire arguments, model, data and collator together; the validation split is
# the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune, then write the resulting model to its own directory.
trainer.train()
trainer.save_model("./t5_dbpedia_finetuned")


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0197,0.010587
2,0.014,0.009021
3,0.012,0.008901


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


# Phase4: Evaluation

In [18]:
!pip install python-Levenshtein



## Evaluation on the test set

In [19]:
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from scipy.special import softmax
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score

In [20]:
# Presumably frees cached GPU memory left over from training before the evaluation passes — confirm it is still needed.
torch.cuda.empty_cache()

In [21]:
# RAM problem: predicting a whole split in one call keeps every logit array in
# memory at once, so the split is scored chunk by chunk instead.
def evaluate_in_chunks(trainer, dataset, chunk_size=1000, return_logits=True):
    """Run ``trainer.predict`` over ``dataset`` in slices of ``chunk_size`` rows.

    Args:
        trainer: object exposing ``predict(dataset)``. The result must provide
            ``predictions`` (a sequence whose first element holds the logits)
            and ``label_ids``.
        dataset: object supporting ``len()`` and ``select(indices)``.
        chunk_size: number of rows passed to each ``predict`` call.
        return_logits: when True (the default, matching the original behaviour)
            the logits of every chunk are kept and concatenated. Pass False when
            only the arg-max ids are needed; the accumulated logits are by far
            the largest object this function builds.

    Returns:
        ``(pred_ids, true_ids, logits)`` where ``pred_ids`` is a list with one
        arg-max id array per row, ``true_ids`` is a list with one label-id array
        per row, and ``logits`` is the concatenation of all chunk logits along
        axis 0 (an empty array when the dataset is empty or ``return_logits`` is
        False).

    NOTE(review): the ids are the arg-max of the logits returned by
    ``trainer.predict``; with a plain ``Trainer`` these are presumably computed
    with the gold labels as decoder input (teacher forcing) rather than by free
    generation — confirm this is the intended evaluation protocol.
    """
    all_pred_ids = []
    all_true_ids = []
    all_logits = []

    total_rows = len(dataset)
    for start in range(0, total_rows, chunk_size):
        stop = min(start + chunk_size, total_rows)
        chunk = dataset.select(range(start, stop))
        predictions = trainer.predict(chunk)

        # First element of the prediction tuple holds the logits.
        chunk_logits = predictions.predictions[0]
        all_pred_ids.extend(np.argmax(chunk_logits, axis=-1))
        all_true_ids.extend(predictions.label_ids)
        if return_logits:
            all_logits.append(chunk_logits)

        # Drop this chunk's references before the next predict call so two
        # chunks of logits are not alive at the same time.
        del predictions, chunk_logits
        torch.cuda.empty_cache()

    concatenated_logits = np.concatenate(all_logits, axis=0) if all_logits else np.array([])
    return all_pred_ids, all_true_ids, concatenated_logits

In [None]:
# Score the test split chunk by chunk; `logits` is returned alongside the ids.
pred_ids, true_ids, logits = evaluate_in_chunks(trainer, test_dataset, chunk_size=1000)

# Turn predicted and reference token ids back into normalised (trimmed, lower-case) text.
decoded_preds = [text.strip().lower() for text in tokenizer.batch_decode(pred_ids, skip_special_tokens=True)]
decoded_true = [text.strip().lower() for text in tokenizer.batch_decode(true_ids, skip_special_tokens=True)]

def levenshtein_distance(a, b):
    """Return the Levenshtein (edit) distance between strings ``a`` and ``b``.

    Pure-Python dynamic programme over two rows: the minimum number of
    single-character insertions, deletions and substitutions turning ``a``
    into ``b``.
    """
    if a == b:
        return 0
    # Keep the inner row as short as possible.
    if len(a) < len(b):
        a, b = b, a
    previous_row = list(range(len(b) + 1))
    for i, char_a in enumerate(a, start=1):
        current_row = [i]
        for j, char_b in enumerate(b, start=1):
            substitution = previous_row[j - 1] + (0 if char_a == char_b else 1)
            current_row.append(min(previous_row[j] + 1, current_row[j - 1] + 1, substitution))
        previous_row = current_row
    return previous_row[-1]


def map_to_closest_label(decoded, label_names):
    """Map a decoded model output onto one of the known class names.

    Args:
        decoded: text produced by decoding token ids.
        label_names: non-empty sequence of candidate class names.

    Returns:
        The element of ``label_names`` that equals ``decoded`` ignoring case and
        surrounding whitespace; otherwise the element with the smallest edit
        distance to it (the first such element on ties).

    Raises:
        ValueError: if ``label_names`` is empty and no exact match exists.
    """
    decoded = decoded.strip().lower()
    # Exact match
    for label in label_names:
        if decoded == label.lower():
            return label
    # Fall back to the nearest label by Levenshtein distance
    return min(label_names, key=lambda x: levenshtein_distance(decoded, x.lower()))

# Snap every decoded string onto one of the known class names.
pred_labels = [map_to_closest_label(pred, label_names) for pred in decoded_preds]
true_labels = [map_to_closest_label(true, label_names) for true in decoded_true]

# No `logger` object is created anywhere in this notebook, so the class
# distributions are reported with print, like the data-preparation cell does.
print(f"Test set true label counts: {pd.Series(true_labels).value_counts()}")
print(f"Test set predicted label counts: {pd.Series(pred_labels).value_counts()}")

In [None]:
# Per-class and aggregate scores restricted to the known class names.
report = classification_report(true_labels, pred_labels, labels=label_names, output_dict=True)

accuracy = report['accuracy']
# Support-weighted averages over all classes.
weighted_avg = report['weighted avg']
precision = weighted_avg['precision']
recall = weighted_avg['recall']
f1 = weighted_avg['f1-score']

test_summary = {
    "test_accuracy": accuracy,
    "test_precision": precision,
    "test_recall": recall,
    "test_f1": f1,
}
wandb.log(test_summary)

In [None]:
# Rows are true classes, columns are predicted classes, both in label_names order.
cm = confusion_matrix(true_labels, pred_labels, labels=label_names)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=label_names, yticklabels=label_names, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()

# Save to disk, attach the image to the W&B run, then release the figure.
fig.savefig("confusion_matrix.png")
wandb.log({"confusion_matrix": wandb.Image("confusion_matrix.png")})
plt.close(fig)


In [1]:
# Aggregate test-set scores as a two-column W&B table.
metrics_table = wandb.Table(columns=["Metric", "Value"])
test_metric_rows = (
    ("Accuracy test", accuracy),
    ("Precision test", precision),
    ("Recall test", recall),
    ("F1-Score test", f1),
)
for metric_name, metric_value in test_metric_rows:
    metrics_table.add_data(metric_name, metric_value)
wandb.log({"metrics_table": metrics_table})

NameError: name 'wandb' is not defined

In [None]:
# One row per class that appears in the classification report.
class_metrics_table = wandb.Table(columns=["Class", "Precision", "Recall", "F1-Score"])
for label in label_names:
    if label not in report:
        continue
    class_scores = report[label]
    class_metrics_table.add_data(
        label,
        class_scores['precision'],
        class_scores['recall'],
        class_scores['f1-score'],
    )
wandb.log({"class_metrics": class_metrics_table})

In [None]:
# One-vs-rest accuracy per class: each class is scored as a binary
# "is this class / is not this class" problem with sklearn's accuracy_score.
# NOTE(review): this counts correctly rejected samples of all other classes
# too, so values are presumably close to 1 for every class; per-class recall
# may be the more informative quantity — confirm which one is intended.
per_class_accuracy = []
for label in label_names:
    true_binary = [1 if true == label else 0 for true in true_labels]
    pred_binary = [1 if pred == label else 0 for pred in pred_labels]
    if sum(true_binary) > 0:
        per_class_accuracy.append(accuracy_score(true_binary, pred_binary))
    else:
        # Class absent from the ground truth: report 0 instead of a score
        # computed on negatives only.
        per_class_accuracy.append(0)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(label_names, per_class_accuracy, color='skyblue')
ax.set_title("Per-Class Accuracy on Test Set")
ax.set_xlabel("Class")
ax.set_ylabel("Accuracy")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(True, axis='y')
fig.tight_layout()

# Save to disk, attach the image to the W&B run, then release the figure.
fig.savefig("per_class_accuracy.png")
wandb.log({"per_class_accuracy": wandb.Image("per_class_accuracy.png")})
plt.close(fig)


## Evaluation on the training set

In [None]:
# Presumably frees cached GPU memory before the training split is scored — confirm it is still needed.
torch.cuda.empty_cache()

In [None]:
# Score the training split chunk by chunk; `logits` is returned alongside the ids.
pred_ids, true_ids, logits = evaluate_in_chunks(trainer, train_dataset, chunk_size=1000)

# Turn predicted and reference token ids back into normalised (trimmed, lower-case) text.
decoded_preds = [text.strip().lower() for text in tokenizer.batch_decode(pred_ids, skip_special_tokens=True)]
decoded_true = [text.strip().lower() for text in tokenizer.batch_decode(true_ids, skip_special_tokens=True)]

def levenshtein_distance(a, b):
    """Return the Levenshtein (edit) distance between strings ``a`` and ``b``.

    Pure-Python dynamic programme over two rows: the minimum number of
    single-character insertions, deletions and substitutions turning ``a``
    into ``b``.
    """
    if a == b:
        return 0
    # Keep the inner row as short as possible.
    if len(a) < len(b):
        a, b = b, a
    previous_row = list(range(len(b) + 1))
    for i, char_a in enumerate(a, start=1):
        current_row = [i]
        for j, char_b in enumerate(b, start=1):
            substitution = previous_row[j - 1] + (0 if char_a == char_b else 1)
            current_row.append(min(previous_row[j] + 1, current_row[j - 1] + 1, substitution))
        previous_row = current_row
    return previous_row[-1]


def map_to_closest_label(decoded, label_names):
    """Map a decoded model output onto one of the known class names.

    Args:
        decoded: text produced by decoding token ids.
        label_names: non-empty sequence of candidate class names.

    Returns:
        The element of ``label_names`` that equals ``decoded`` ignoring case and
        surrounding whitespace; otherwise the element with the smallest edit
        distance to it (the first such element on ties).

    Raises:
        ValueError: if ``label_names`` is empty and no exact match exists.
    """
    decoded = decoded.strip().lower()
    # Exact match
    for label in label_names:
        if decoded == label.lower():
            return label
    # Fall back to the nearest label by Levenshtein distance
    return min(label_names, key=lambda x: levenshtein_distance(decoded, x.lower()))

# Snap every decoded string onto one of the known class names.
pred_labels = [map_to_closest_label(pred, label_names) for pred in decoded_preds]
true_labels = [map_to_closest_label(true, label_names) for true in decoded_true]

# No `logger` object is created anywhere in this notebook, so the class
# distributions are reported with print, like the data-preparation cell does.
print(f"Train set true label counts: {pd.Series(true_labels).value_counts()}")
print(f"Train set predicted label counts: {pd.Series(pred_labels).value_counts()}")

In [None]:
# Per-class and aggregate scores restricted to the known class names.
report = classification_report(true_labels, pred_labels, labels=label_names, output_dict=True)

accuracy = report['accuracy']
# Support-weighted averages over all classes.
weighted_avg = report['weighted avg']
precision = weighted_avg['precision']
recall = weighted_avg['recall']
f1 = weighted_avg['f1-score']

train_summary = {
    "train_accuracy": accuracy,
    "train_precision": precision,
    "train_recall": recall,
    "train_f1": f1,
}
wandb.log(train_summary)

In [None]:
# Rows are true classes, columns are predicted classes, both in label_names order.
cm = confusion_matrix(true_labels, pred_labels, labels=label_names)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=label_names, yticklabels=label_names, ax=ax)
ax.set_title("Confusion Matrix Train")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()

fig.savefig("confusion_matrix_train.png")
# Logged under a train-specific key (matching the file name and the
# train_*/test_* scalar keys) so it does not share the "confusion_matrix" key
# used for the test-set figure.
wandb.log({"confusion_matrix_train": wandb.Image("confusion_matrix_train.png")})
plt.close(fig)


In [None]:
# Aggregate training-set scores as a two-column W&B table.
metrics_table = wandb.Table(columns=["Metric", "Value"])
train_metric_rows = (
    ("Accuracy train", accuracy),
    ("Precision train", precision),
    ("Recall train", recall),
    ("F1-Score train", f1),
)
for metric_name, metric_value in train_metric_rows:
    metrics_table.add_data(metric_name, metric_value)
# Train-specific key so this table does not share the "metrics_table" key used
# for the test-set table.
wandb.log({"metrics_table_train": metrics_table})

In [None]:
# One row per class that appears in the classification report.
class_metrics_table = wandb.Table(columns=["Class", "Precision", "Recall", "F1-Score"])
for label in label_names:
    if label not in report:
        continue
    class_scores = report[label]
    class_metrics_table.add_data(
        label,
        class_scores['precision'],
        class_scores['recall'],
        class_scores['f1-score'],
    )
# Train-specific key so this table does not share the "class_metrics" key used
# for the test-set table.
wandb.log({"class_metrics_train": class_metrics_table})

In [None]:
# One-vs-rest accuracy per class: each class is scored as a binary
# "is this class / is not this class" problem with sklearn's accuracy_score.
# NOTE(review): this counts correctly rejected samples of all other classes
# too, so values are presumably close to 1 for every class; per-class recall
# may be the more informative quantity — confirm which one is intended.
per_class_accuracy = []
for label in label_names:
    true_binary = [1 if true == label else 0 for true in true_labels]
    pred_binary = [1 if pred == label else 0 for pred in pred_labels]
    if sum(true_binary) > 0:
        per_class_accuracy.append(accuracy_score(true_binary, pred_binary))
    else:
        # Class absent from the ground truth: report 0 instead of a score
        # computed on negatives only.
        per_class_accuracy.append(0)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(label_names, per_class_accuracy, color='skyblue')
ax.set_title("Per-Class Accuracy on train Set")
ax.set_xlabel("Class")
ax.set_ylabel("Accuracy")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(True, axis='y')
fig.tight_layout()

# Train-specific file name and W&B key, mirroring "confusion_matrix_train.png",
# so the test-set figure "per_class_accuracy.png" is not overwritten on disk
# and the two figures do not share one W&B key.
fig.savefig("per_class_accuracy_train.png")
wandb.log({"per_class_accuracy_train": wandb.Image("per_class_accuracy_train.png")})
plt.close(fig)


In [None]:
# Close the W&B run opened earlier with wandb.init.
wandb.finish()