In [1]:
%run local_functions.py
from local_functions import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re


from datasets import load_dataset
from datasets import Dataset, DatasetDict
from transformers import BertConfig, BertModel


from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
from transformers import TrainingArguments, Trainer


from sklearn.model_selection import train_test_split

plt.style.use("dark_background")

pd.set_option("display.max_columns", 2500)
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_colwidth", 50)


plt.style.use("dark_background")

%load_ext lab_black

2023-08-31 11:09:11.960667: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-31 11:09:11.981243: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_json("News_Category_Dataset_v3.json", lines=True)
df["text"] = df["headline"] + " " + df["short_description"]
df = df[["text", "category"]]
df["text"] = df["text"].apply(text_normalization_3)

SAMPLE_SIZE = 25000

df = df.sample(SAMPLE_SIZE, random_state=42).reset_index(drop=True)

In [3]:
training_df, testing_df = train_test_split(df, test_size=0.25, random_state=42)

In [4]:
encoded_df = pd.get_dummies(
    training_df["category"], columns=["category"], prefix="", prefix_sep=""
)
encoded_df = encoded_df.astype(bool)
encoded_df_con = pd.concat([training_df["text"], encoded_df], axis=1)

# Dataset

In [26]:
train_df, temp_df = train_test_split(encoded_df_con, test_size=0.3, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [27]:
# Convert the split DataFrames into Datasets
train = Dataset.from_pandas(train_df, split="train")
valid = Dataset.from_pandas(valid_df, split="validation")
test = Dataset.from_pandas(test_df, split="test")

dataset = DatasetDict({"train": train, "validation": valid, "test": test})

In [30]:
labels = [
    label
    for label in dataset["train"].features.keys()
    if label not in ["text", "__index_level_0__"]
]
id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

# Tokenize

In [36]:
LM = "bert-base-uncased"

In [37]:
tokenizer = AutoTokenizer.from_pretrained(LM)

In [38]:
def preprocess_data(examples):
    # take a batch of texts
    text = examples["text"]
    # encode them
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=256)
    # add labels
    labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
    # create numpy array of shape (batch_size, num_labels)
    labels_matrix = np.zeros((len(text), len(labels)))
    # fill numpy array
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]

    encoding["labels"] = labels_matrix.tolist()

    return encoding


encoded_dataset = dataset.map(
    preprocess_data, batched=True, remove_columns=dataset["train"].column_names
)

encoded_dataset.set_format("torch")

Map:   0%|          | 0/13125 [00:00<?, ? examples/s]

Map:   0%|          | 0/2812 [00:00<?, ? examples/s]

Map:   0%|          | 0/2813 [00:00<?, ? examples/s]

In [39]:
model = AutoModelForSequenceClassification.from_pretrained(
    LM,
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
batch_size = 8
metric_name = "f1"

In [41]:
args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # push_to_hub=True,
)

In [42]:
def multi_label_metrics(
    predictions, labels, threshold=0.5
):  # threshold = confidence threshold, important. 0.5 doesnt always work
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average="micro")
    roc_auc = roc_auc_score(y_true, y_pred, average="micro")
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {"f1": f1_micro_average, "roc_auc": roc_auc, "accuracy": accuracy}
    return metrics


def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result

In [43]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [44]:
trainer.train()



  0%|          | 0/8205 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.1717, 'learning_rate': 1.87812309567337e-05, 'epoch': 0.3}
{'loss': 0.0944, 'learning_rate': 1.7562461913467398e-05, 'epoch': 0.61}
{'loss': 0.0832, 'learning_rate': 1.6343692870201096e-05, 'epoch': 0.91}


  0%|          | 0/352 [00:00<?, ?it/s]

{'eval_loss': 0.07569011300802231, 'eval_f1': 0.35810446957458264, 'eval_roc_auc': 0.6172154182423758, 'eval_accuracy': 0.23648648648648649, 'eval_runtime': 16.6632, 'eval_samples_per_second': 168.755, 'eval_steps_per_second': 21.124, 'epoch': 1.0}
{'loss': 0.0714, 'learning_rate': 1.5124923826934796e-05, 'epoch': 1.22}
{'loss': 0.0649, 'learning_rate': 1.3906154783668494e-05, 'epoch': 1.52}
{'loss': 0.0619, 'learning_rate': 1.2687385740402194e-05, 'epoch': 1.83}


  0%|          | 0/352 [00:00<?, ?it/s]

{'eval_loss': 0.060076575726270676, 'eval_f1': 0.5273446590644636, 'eval_roc_auc': 0.6999314783332755, 'eval_accuracy': 0.40291607396870555, 'eval_runtime': 16.6423, 'eval_samples_per_second': 168.967, 'eval_steps_per_second': 21.151, 'epoch': 2.0}
{'loss': 0.0559, 'learning_rate': 1.1468616697135894e-05, 'epoch': 2.13}
{'loss': 0.0501, 'learning_rate': 1.0249847653869594e-05, 'epoch': 2.44}
{'loss': 0.048, 'learning_rate': 9.03107861060329e-06, 'epoch': 2.74}


  0%|          | 0/352 [00:00<?, ?it/s]

{'eval_loss': 0.055335044860839844, 'eval_f1': 0.565286263860602, 'eval_roc_auc': 0.7205313464941193, 'eval_accuracy': 0.4441678520625889, 'eval_runtime': 15.7393, 'eval_samples_per_second': 178.661, 'eval_steps_per_second': 22.364, 'epoch': 3.0}
{'loss': 0.0465, 'learning_rate': 7.81230956733699e-06, 'epoch': 3.05}
{'loss': 0.04, 'learning_rate': 6.59354052407069e-06, 'epoch': 3.35}
{'loss': 0.0403, 'learning_rate': 5.374771480804388e-06, 'epoch': 3.66}
{'loss': 0.0393, 'learning_rate': 4.156002437538087e-06, 'epoch': 3.96}


  0%|          | 0/352 [00:00<?, ?it/s]

{'eval_loss': 0.05379441753029823, 'eval_f1': 0.5949913644214162, 'eval_roc_auc': 0.7431044651840545, 'eval_accuracy': 0.4900426742532006, 'eval_runtime': 15.7378, 'eval_samples_per_second': 178.678, 'eval_steps_per_second': 22.367, 'epoch': 4.0}
{'loss': 0.0352, 'learning_rate': 2.9372333942717856e-06, 'epoch': 4.27}
{'loss': 0.0339, 'learning_rate': 1.7184643510054846e-06, 'epoch': 4.57}
{'loss': 0.0346, 'learning_rate': 4.996953077391835e-07, 'epoch': 4.88}


  0%|          | 0/352 [00:00<?, ?it/s]

{'eval_loss': 0.05378809571266174, 'eval_f1': 0.5902134080620823, 'eval_roc_auc': 0.7414347916594386, 'eval_accuracy': 0.4868421052631579, 'eval_runtime': 15.7379, 'eval_samples_per_second': 178.677, 'eval_steps_per_second': 22.366, 'epoch': 5.0}
{'train_runtime': 1420.1854, 'train_samples_per_second': 46.209, 'train_steps_per_second': 5.777, 'train_loss': 0.060044924011323456, 'epoch': 5.0}


TrainOutput(global_step=8205, training_loss=0.060044924011323456, metrics={'train_runtime': 1420.1854, 'train_samples_per_second': 46.209, 'train_steps_per_second': 5.777, 'train_loss': 0.060044924011323456, 'epoch': 5.0})

In [45]:
trainer.evaluate()

  0%|          | 0/352 [00:00<?, ?it/s]

{'eval_loss': 0.05379441753029823,
 'eval_f1': 0.5949913644214162,
 'eval_roc_auc': 0.7431044651840545,
 'eval_accuracy': 0.4900426742532006,
 'eval_runtime': 16.5024,
 'eval_samples_per_second': 170.399,
 'eval_steps_per_second': 21.33,
 'epoch': 5.0}

In [46]:
#trainer.save_model("bert-finetuned-news-headlines")

# Eval - Training 23 mins

In [48]:
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-finetuned-news-headlines"
)
trainer = Trainer(model=model, tokenizer=tokenizer)

In [57]:
df_test = testing_df.reset_index(drop=True)

In [58]:
df_test

Unnamed: 0,text,category
0,the mistake that leads to better brownies most...,FOOD & DRINK
1,after the shooting at spu desolation consolati...,RELIGION
2,gordon ramsay sued over unpaid wages by employ...,BUSINESS
3,fire breaks out at mexico s top refinery 9 peo...,WORLD NEWS
4,20 ways to make your marriage stronger grooms ...,WEDDINGS
...,...,...
6245,scott weiland died from toxic mix of drugs acc...,ENTERTAINMENT
6246,15 reasons why october is the best time to vis...,TRAVEL
6247,teen allegedly held captive is feeling a lot b...,CRIME
6248,take a look at the best animal photos of the week,GREEN


In [88]:
ROWS_TO_EVALUATE = len(df_test)

CONFIDENCE_THRESHOLD = 0.65  # 0.1 works well for low number of non-responses

TOP_N_PREDS = 5  # number of top predictions to return


# make predictions
df_test["predicted_class"] = df_test["text"][0:ROWS_TO_EVALUATE].apply(
    predict_class, args=(tokenizer, trainer, id2label, CONFIDENCE_THRESHOLD)
)  # args: text, tokenizer, trainer, id2label, CONFIDENCE_THRESHOLD=0.5

# calculate if prediction is correct
df_test["correct"] = df_test[0:ROWS_TO_EVALUATE].apply(
    lambda row: int(row["category"] in row["predicted_class"]), axis=1
)
# calculate score (including penalty for guessing multiple categories) used to help find optimal confidence threshold
df_test["correct_w_discount"] = df_test[0:ROWS_TO_EVALUATE].apply(
    multi_cat_guess_penalty,
    axis=1,
    args=(0.9,),  # muli_cat_guess_penalty (somewhere around 0.85 works well)
)

df_test["correct_w_non_preds"] = df_test.apply(multi_positive_outcome, axis=1)

# get top n predictions
df_test["top_n_preds"] = df_test["text"][0:ROWS_TO_EVALUATE].apply(
    n_most_likely_classes, args=(tokenizer, trainer, id2label, TOP_N_PREDS)
)

print(
    f"Percent of predictions that include correct class: {round((df_test.correct.sum() / ROWS_TO_EVALUATE)*100, 2)}%"
)
print(
    f"Multi_guess discount score: {round((df_test.correct_w_discount.sum() / ROWS_TO_EVALUATE)*100, 2)}%"
)
print(
    f"Multi_positive_outcome discount score: {round((df_test.correct_w_non_preds.sum() / ROWS_TO_EVALUATE)*100, 2)}%"
)

print(
    f"Percent of non-preds: {round((df_test.correct_w_non_preds.sum() - df_test.correct.sum()) / ROWS_TO_EVALUATE*100, 2)}% "
)
print(
    f"Percent of wrong preds: {round((1-(df_test.correct_w_non_preds.sum() / ROWS_TO_EVALUATE))*100,2)}%"
)

Percent of predictions that include correct class: 41.36%
Multi_guess discount score: 41.36%
Multi_positive_outcome discount score: 89.79%
Percent of non-preds: 48.43% 
Percent of wrong preds: 10.21%


In [89]:
df_test

Unnamed: 0,text,category,predicted_class,correct,correct_w_discount,correct_w_non_preds,top_n_preds
0,the mistake that leads to better brownies most...,FOOD & DRINK,[FOOD & DRINK],1,1,1,"{'FOOD & DRINK': 0.73, 'TASTE': 0.185, 'HEALTH..."
1,after the shooting at spu desolation consolati...,RELIGION,[],0,0,1,"{'HEALTHY LIVING': 0.124, 'IMPACT': 0.12, 'REL..."
2,gordon ramsay sued over unpaid wages by employ...,BUSINESS,[],0,0,1,"{'BUSINESS': 0.386, 'ENTERTAINMENT': 0.039, 'M..."
3,fire breaks out at mexico s top refinery 9 peo...,WORLD NEWS,[],0,0,1,"{'WORLD NEWS': 0.202, 'THE WORLDPOST': 0.135, ..."
4,20 ways to make your marriage stronger grooms ...,WEDDINGS,[WEDDINGS],1,1,1,"{'WEDDINGS': 0.703, 'DIVORCE': 0.121, 'STYLE &..."
...,...,...,...,...,...,...,...
6245,scott weiland died from toxic mix of drugs acc...,ENTERTAINMENT,[],0,0,1,"{'HEALTHY LIVING': 0.225, 'CRIME': 0.049, 'WEL..."
6246,15 reasons why october is the best time to vis...,TRAVEL,[TRAVEL],1,1,1,"{'TRAVEL': 0.891, 'HOME & LIVING': 0.016, 'WEL..."
6247,teen allegedly held captive is feeling a lot b...,CRIME,[],0,0,1,"{'CRIME': 0.548, 'BLACK VOICES': 0.054, 'WEIRD..."
6248,take a look at the best animal photos of the week,GREEN,[],0,0,1,"{'GREEN': 0.197, 'WEIRD NEWS': 0.134, 'ENVIRON..."
