## 4.1 Preprocessing

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import roc_auc_score, roc_curve
import torch
from transformers import BertweetTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments 
#Roberta model, Auto Tokenizer
from transformers import DataCollatorWithPadding, EarlyStoppingCallback
from datasets import Dataset
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset and print a summary of its columns and dtypes.
# The rest of this notebook reads the "text" and "label" columns.
df = pd.read_csv('hydrogen_small.csv')
df.info()

In [None]:
# List the distinct label values present in the raw data.
df["label"].unique()

In [None]:
# List the distinct raw messages.
df["text"].unique()

In [None]:
# Look at a single raw message (row position 18) before cleaning.
df["text"].iloc[18]

In [None]:
def clean_message(text):
    """Strip URLs, @mentions and non-ASCII characters from a message.

    Args:
        text: Raw message. Non-string values (for example ``None`` or the
            float NaN that pandas uses for missing cells) are treated as an
            empty message.

    Returns:
        The cleaned message, with runs of whitespace collapsed to a single
        space and leading/trailing whitespace removed.
    """
    # A missing cell arrives as None/NaN, which re.sub would reject with a
    # TypeError; treat it as an empty message instead.
    if not isinstance(text, str):
        return ""
    # "http\S+" already covers "https...", so no separate alternative is
    # needed. The \b stops "www" from matching inside words such as "awww".
    text = re.sub(r"http\S+|\bwww\S+", "", text)
    # Remove @mentions.
    text = re.sub(r"@\w+", "", text)
    # Remove every run of non-ASCII characters.
    text = re.sub(r"[^\x00-\x7F]+", "", text)
    # The removals above can leave gaps; collapse them.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [None]:
# Clean every message in place using clean_message.
df["text"] = df["text"].apply(clean_message)

In [None]:
# Look at the message at row position 18 again to see the effect of cleaning.
df["text"].iloc[18]

In [None]:
# List the distinct messages after cleaning.
df["text"].unique()

In [None]:
# Encode the string labels as integers:
# Irrelevant -> 0 (negative class), Relevant -> 1 (positive class).
label_to_id = {
    'Irrelevant': 0,
    'Relevant': 1,
}
encoded_labels = df["label"].map(label_to_id)
# Series.map turns every value that is not a key of the dict into NaN. That
# happens for unexpected labels and also when this cell is run a second time
# (the labels are then already 0/1). Fail loudly instead of carrying NaN
# labels into the train/test split.
if encoded_labels.isna().any():
    unexpected = df.loc[encoded_labels.isna(), "label"].unique().tolist()
    raise ValueError(
        f"Unexpected label values {unexpected}; expected one of "
        f"{list(label_to_id)}. If this cell was already run, reload the CSV first."
    )
df["label"] = encoded_labels.astype("int64")


In [None]:
# Check the label values after encoding.
df["label"].unique()

In [None]:
# Show the first five rows whose label is 0.
df[df["label"] == 0].head(5)

In [None]:
# Count the rows per label to see the class balance.
df["label"].value_counts()

## 4.2 Two pre-trained BERT models

In [None]:
# Inputs are the message texts, targets are the labels.
X = df["text"].values
y = df["label"].values
random_state = 42

# 70/30 split, stratified on the labels so both partitions keep the same
# class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=random_state,
)
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

In [None]:
# Wrap each partition in a DataFrame and convert it to a `datasets.Dataset`
# with a "text" and a "label" column.
train_df = pd.DataFrame({"text": X_train, "label": y_train})
train_ds = Dataset.from_pandas(train_df)

test_df = pd.DataFrame({"text": X_test, "label": y_test})
test_ds = Dataset.from_pandas(test_df)

print(f"Train dataset: {train_ds}")
print(f"Test dataset: {test_ds}")


## 4.2.1 BERTweet Model (VinAI)

In [None]:
# Checkpoint name for BERTweet and the matching tokenizer.
model_name1 = "vinai/bertweet-base"
tokenizer1 = BertweetTokenizer.from_pretrained(model_name1)

In [None]:
# Function that is applied to all samples in the dataset.

def tokenize_bertweet(batch):
    """Tokenize a batch of messages with the BERTweet tokenizer.

    Args:
        batch: Mapping with a ``"text"`` entry holding the messages to encode.

    Returns:
        Whatever ``tokenizer1`` returns for those messages.
    """
    # truncation=True cuts off messages that are too long for the model.
    # NOTE: Not all models require this, you may get a warning indicating
    # that it has no effect.
    # No padding is applied here: the Trainers in this notebook are given a
    # DataCollatorWithPadding, which pads each mini-batch to its own longest
    # sequence. Padding at tokenization time as well would only make every
    # stored sequence (and so every mini-batch) longer than necessary.
    return tokenizer1(batch['text'], truncation=True)
# Tokenize both the training and the testing dataset with tokenize_bertweet.
# batched=True hands the function a batch of examples per call (it indexes
# batch['text']). The original author observed that on their machine this did
# not scale to a greater number of threads.
train_ds_bertweet = train_ds.map(tokenize_bertweet, batched=True)
test_ds_bertweet = test_ds.map(tokenize_bertweet, batched=True)

In [None]:
# Display the untokenized training dataset (train_ds itself is not modified by .map above).
train_ds

In [None]:
# Release any BERTweet model left over from a previous run of this cell, so
# its memory can be reclaimed before a new copy is loaded. The model in this
# section is bound to `model1`, so that is the name to drop.
globals().pop("model1", None)
# Download/load the base model. We use the "vinai/bertweet-base" model here.
# Set the number of labels to the number of unique labels in the dataframe, which is 2.
# Set the problem type to single label classification, since we want one class for each sample.
# NOTE(review): this copy is never handed to a Trainer in this notebook; the
# fine-tuned BERTweet model is EarlyStopping_model1.
model1 = RobertaForSequenceClassification.from_pretrained(
    model_name1,
    num_labels=df["label"].nunique(),
    problem_type="single_label_classification",
)

In [None]:
def compute_metrics(pred):
    """Compute classification metrics for a set of predictions.

    Args:
        pred: Object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is used as the predicted class).

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 are the binary scores with class 1 as the
        positive class.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_hat, average="binary", pos_label=1
    )
    return dict(
        accuracy=accuracy_score(y_true, y_hat),
        precision=precision,
        recall=recall,
        f1=f1,
    )

In [None]:
# Fine-tune BERTweet with early stopping.
#
# Early stopping and load_best_model_at_end both choose a checkpoint by its
# evaluation loss. Hold out part of the training data for that purpose so the
# test set plays no part in model selection and the test metrics reported
# later are not optimistically biased.
bertweet_split = train_ds_bertweet.train_test_split(test_size=0.15, seed=random_state)

EarlyStopping_model1 = RobertaForSequenceClassification.from_pretrained(
    model_name1,
    num_labels=df["label"].nunique(),
    problem_type="single_label_classification",
)
EarlyStopping_training_args = TrainingArguments(
    # Model-specific directories keep these checkpoints and logs apart from
    # those of the other training run in this notebook.
    output_dir="./results/bertweet",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_dir="./logs/bertweet",
    logging_steps=10,
    # Added for early stopping.
    metric_for_best_model="loss",
    load_best_model_at_end=True,
)
EarlyStopping_trainer1 = Trainer(
    model=EarlyStopping_model1,
    args=EarlyStopping_training_args,
    train_dataset=bertweet_split["train"],
    # The "test" part of the split above is the validation set.
    eval_dataset=bertweet_split["test"],
    processing_class=tokenizer1,
    data_collator=DataCollatorWithPadding(tokenizer1),
    compute_metrics=compute_metrics,
    # Stop once the validation loss has not improved for 3 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
EarlyStopping_trainer1.train()


In [None]:
# Switch the model to evaluation mode, disabling dropout etc layers.
# The trainer holds EarlyStopping_model1 (the fine-tuned model), not the
# separately loaded model1, so that is the model to switch.
EarlyStopping_model1.eval()
# Evaluate the datasets.
train_results_bertweet = EarlyStopping_trainer1.evaluate(train_ds_bertweet)
test_results_bertweet = EarlyStopping_trainer1.evaluate(test_ds_bertweet)

In [None]:
def display_evaluation(setname_bertweet, results_bertweet):
    """Print accuracy, precision, recall and F1 for one dataset.

    Args:
        setname_bertweet: Name of the dataset, used as the prefix of each line.
        results_bertweet: Mapping with the keys ``eval_accuracy``,
            ``eval_precision``, ``eval_recall`` and ``eval_f1``.
    """
    metric_rows = (
        ("Accuracy", "eval_accuracy"),
        ("Precision", "eval_precision"),
        ("Recall", "eval_recall"),
        ("F1 score", "eval_f1"),
    )
    for caption, key in metric_rows:
        # Values are rounded to three decimals for display only.
        print(f"{setname_bertweet} Set {caption}:", round(results_bertweet[key], 3))
# Print the BERTweet metrics for the training set and for the testing set.
display_evaluation("Training", train_results_bertweet)
display_evaluation("Testing", test_results_bertweet)

In [None]:
# Rebuild the BERTweet train/test datasets from X and y with the same
# stratify / test_size / random_state settings as the first split, then
# tokenize them.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=random_state,
)
train_df = pd.DataFrame({"text": X_train, "label": y_train})
test_df = pd.DataFrame({"text": X_test, "label": y_test})
train_ds_bertweet = Dataset.from_pandas(train_df).map(tokenize_bertweet, batched=True)
test_ds_bertweet = Dataset.from_pandas(test_df).map(tokenize_bertweet, batched=True)
print(f"Training set size: {len(train_df)}")
print(f"Testing set size: {len(test_df)}")

## 4.2.2 RoBERTa Model

In [None]:
# Checkpoint name for RoBERTa and the matching tokenizer.
model_name2  = 'roberta-base'
tokenizer2 = AutoTokenizer.from_pretrained(model_name2)

In [None]:
def tokenize_roberta(batch):
    """Tokenize a batch of messages with the RoBERTa tokenizer.

    Args:
        batch: Mapping with a ``"text"`` entry holding the messages to encode.

    Returns:
        Whatever ``tokenizer2`` returns for those messages.
    """
    # truncation=True cuts off messages that are too long for the model.
    # No padding is applied here: the Trainers in this notebook are given a
    # DataCollatorWithPadding, which pads each mini-batch to its own longest
    # sequence. Padding at tokenization time as well would only make every
    # stored sequence (and so every mini-batch) longer than necessary.
    return tokenizer2(batch['text'], truncation=True)

# Tokenize both the training and the testing dataset with tokenize_roberta,
# one batch of examples per call.
train_ds_roberta = train_ds.map(tokenize_roberta, batched=True)
test_ds_roberta = test_ds.map(tokenize_roberta, batched=True)

In [None]:
# Display the untokenized training dataset.
train_ds

In [None]:
# Release any RoBERTa model left over from a previous run of this notebook,
# so its memory can be reclaimed before a new copy is loaded. The RoBERTa
# model in this section is bound to `model2`, so that is the name to drop;
# pop() with a default is a no-op when the name does not exist yet.
globals().pop("model2", None)

In [None]:
# Load the base RoBERTa checkpoint with a classification head sized to the
# number of distinct labels in the dataframe.
# NOTE(review): this copy is never handed to a Trainer in this notebook; the
# fine-tuned RoBERTa model is EarlyStopping_model2.
model2 = AutoModelForSequenceClassification.from_pretrained(
    model_name2,
    num_labels=df["label"].nunique(),
    problem_type="single_label_classification")

In [None]:
# Fine-tune RoBERTa with early stopping.
#
# Early stopping and load_best_model_at_end both choose a checkpoint by its
# evaluation loss. Hold out part of the training data for that purpose so the
# test set plays no part in model selection and the test metrics reported
# later are not optimistically biased.
roberta_split = train_ds_roberta.train_test_split(test_size=0.15, seed=random_state)

EarlyStopping_model2 = AutoModelForSequenceClassification.from_pretrained(
    model_name2,
    num_labels=df["label"].nunique(),
    problem_type="single_label_classification",
)
EarlyStopping_training_args = TrainingArguments(
    # Model-specific directories keep these checkpoints and logs apart from
    # those of the other training run in this notebook.
    output_dir="./results/roberta",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs/roberta",
    logging_steps=10,
    # Added for early stopping.
    metric_for_best_model="loss",
    load_best_model_at_end=True,
)
EarlyStopping_trainer2 = Trainer(
    model=EarlyStopping_model2,
    args=EarlyStopping_training_args,
    train_dataset=roberta_split["train"],
    # The "test" part of the split above is the validation set.
    eval_dataset=roberta_split["test"],
    processing_class=tokenizer2,
    data_collator=DataCollatorWithPadding(tokenizer2),
    compute_metrics=compute_metrics,
    # Stop once the validation loss has not improved for 3 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
EarlyStopping_trainer2.train()

In [None]:
# Switch the model to evaluation mode. The trainer holds EarlyStopping_model2
# (the fine-tuned model), not the separately loaded model2, so that is the
# model to switch.
EarlyStopping_model2.eval()
# Evaluate the datasets.
train_results_roberta = EarlyStopping_trainer2.evaluate(train_ds_roberta)
test_results_roberta = EarlyStopping_trainer2.evaluate(test_ds_roberta)

In [None]:
def display_evaluation(setname_roberta, results_roberta):
    """Print accuracy, precision, recall and F1 for one dataset.

    Args:
        setname_roberta: Name of the dataset, used as the prefix of each line.
        results_roberta: Mapping with the keys ``eval_accuracy``,
            ``eval_precision``, ``eval_recall`` and ``eval_f1``.
    """
    metric_rows = (
        ("Accuracy", "eval_accuracy"),
        ("Precision", "eval_precision"),
        ("Recall", "eval_recall"),
        ("F1 score", "eval_f1"),
    )
    for caption, key in metric_rows:
        # Values are rounded to three decimals for display only.
        print(f"{setname_roberta} Set {caption}:", round(results_roberta[key], 3))
# Print the RoBERTa metrics for the training set and for the testing set.
display_evaluation("Training", train_results_roberta)
display_evaluation("Testing", test_results_roberta)

In [None]:
# Rebuild the RoBERTa train/test datasets from X and y with the same
# stratify / test_size / random_state settings as the first split, then
# tokenize them. y_test from this split is what the ROC cells below score
# the predictions against.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=random_state,
)
train_df = pd.DataFrame({"text": X_train, "label": y_train})
test_df = pd.DataFrame({"text": X_test, "label": y_test})
train_ds_roberta = Dataset.from_pandas(train_df).map(tokenize_roberta, batched=True)
test_ds_roberta = Dataset.from_pandas(test_df).map(tokenize_roberta, batched=True)
print(f"Training set size: {len(train_df)}")
print(f"Testing set size: {len(test_df)}")

In [None]:
# Returns (attention matrix, predicted class, tokens).
def compute_attention_matrix(tokenizer, model, text):
    """Run ``model`` on one message and summarise its attention weights.

    Args:
        tokenizer: Callable tokenizer that accepts ``return_tensors`` and
            ``truncation`` and provides ``convert_ids_to_tokens``.
        model: Model that accepts ``output_attentions=True``, exposes
            ``device`` and returns an object with ``attentions`` and
            ``logits``.
        text: The message to analyse.

    Returns:
        A tuple ``(attentions, pred_class, token_strs)``: the attention
        weights averaged over layers and heads (assumes each entry of
        ``attentions`` is shaped (1, heads, tokens, tokens) - TODO confirm),
        the index of the highest logit, and the string form of the input
        tokens.
    """
    # return_tensors="pt" gives PyTorch tensors that can be fed to the model;
    # .to(model.device) moves them to wherever the model lives (needed for
    # models on a GPU). truncation=True keeps an over-long message within the
    # tokenizer's length limit instead of failing inside the model.
    tokens = tokenizer(text, return_tensors="pt", truncation=True).to(model.device)
    # Run in evaluation mode so dropout cannot perturb the attention weights,
    # then restore whatever mode the caller's model was in.
    was_training = model.training
    model.eval()
    try:
        # no_grad() skips gradient tracking, which inference does not need.
        with torch.no_grad():
            pred = model(**tokens, output_attentions=True)
    finally:
        model.train(was_training)
    # Stack the per-layer attention tensors and move the result to the CPU.
    attentions = torch.stack(pred.attentions).cpu()
    # Remove the batch dimension; there is only a single message in it.
    attentions = attentions.squeeze(1)
    # Average over the transformer layers and heads. What remains is a matrix
    # of importance from every token to every other token, e.g. 10x10 for a
    # message of 10 tokens.
    attentions = attentions.mean(dim=(0, 1))
    # Select the predicted class.
    pred_class = pred.logits.cpu().argmax(-1).item()
    # Also return a string representation of the tokens in the message;
    # plotting the integer token IDs would not be very meaningful.
    token_strs = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])
    return (attentions, pred_class, token_strs)


In [None]:
def plot_attention(attentions, tokens, title):
    """Draw an attention matrix as a heatmap and show it.

    Args:
        attentions: Square matrix of attention weights to plot.
        tokens: Token strings used as tick labels on both axes.
        title: Text placed above the plot.
    """
    # Enlarged figure so the token labels have room.
    plt.figure(figsize=(10, 8))
    plt.title(title)
    heatmap_options = {
        "xticklabels": tokens,  # token names along the X axis
        "yticklabels": tokens,  # token names along the Y axis
        # Matplotlib's 'binary' colormap runs from white (low) to black (high).
        "cmap": 'binary',
        "cbar": True,  # show the colour bar
    }
    sns.heatmap(attentions, **heatmap_options)

    plt.show()

In [None]:
def display_attention_matrix(tokenizer, model, text, model_name="Model"):
    """Compute and plot the attention matrix of ``model`` for ``text``.

    Args:
        tokenizer: Tokenizer passed on to compute_attention_matrix.
        model: Model passed on to compute_attention_matrix.
        text: The message to analyse; also shown in the plot title.
        model_name: Display name shown on the first line of the title.
    """
    attention, pred_class, tokens = compute_attention_matrix(tokenizer, model, text)
    # Class 1 is reported as "Positive", anything else as "Negative".
    if pred_class == 1:
        pred_label = "Positive"
    else:
        pred_label = "Negative"
    plot_attention(
        attention,
        tokens,
        f"{model_name}\n{text}\nPredicted class: {pred_label}",
    )

In [None]:
# Visualise attention for one message labelled 0 (position 18 among them).
# Use the fine-tuned models (EarlyStopping_model1/2); model1 and model2 were
# loaded from the base checkpoints and never trained, so their predicted
# class would not reflect the fine-tuning.
sample_text = df[df["label"] == 0].iloc[18]["text"]
display_attention_matrix(tokenizer1, EarlyStopping_model1, sample_text, model_name="BERTweet Model")
display_attention_matrix(tokenizer2, EarlyStopping_model2, sample_text, model_name="RoBERTa Model")

In [None]:
# Visualise attention for a second message labelled 0 (position 72 among them).
# Use the fine-tuned models (EarlyStopping_model1/2); model1 and model2 were
# loaded from the base checkpoints and never trained, so their predicted
# class would not reflect the fine-tuning.
sample_text = df[df["label"] == 0].iloc[72]["text"]
display_attention_matrix(tokenizer1, EarlyStopping_model1, sample_text, model_name="BERTweet Model")
display_attention_matrix(tokenizer2, EarlyStopping_model2, sample_text, model_name="RoBERTa Model")

In [None]:
# Run both fine-tuned models on their tokenized test sets. The `.predictions`
# attribute of each result is converted to probabilities in the next cell.
pred_bertweet = EarlyStopping_trainer1.predict(test_ds_bertweet)
pred_roberta = EarlyStopping_trainer2.predict(test_ds_roberta)

In [None]:
# Convert to a PyTorch tensor, apply softmax, and convert back to a numpy array.
# dim=-1 normalises across the class axis of each row; calling softmax
# without an explicit dim is deprecated and relies on an implicit choice.
pred_probs_bertweet = torch.softmax(
    torch.as_tensor(pred_bertweet.predictions, dtype=torch.float32), dim=-1
).numpy()
pred_probs_roberta = torch.softmax(
    torch.as_tensor(pred_roberta.predictions, dtype=torch.float32), dim=-1
).numpy()

In [None]:
# Compute the ROC index (area under the curve). y_test holds the true labels
# of the testing set; column 1 of the probabilities is class 1 ("Relevant").
roc_index_bertweet = roc_auc_score(y_test, pred_probs_bertweet[:, 1])
roc_index_roberta = roc_auc_score(y_test, pred_probs_roberta[:, 1])
# Compute the ROC curves.
fpr_bertweet, tpr_bertweet, thresholds_bertweet = roc_curve(y_test, pred_probs_bertweet[:, 1])
fpr_roberta, tpr_roberta, thresholds_roberta = roc_curve(y_test, pred_probs_roberta[:, 1])
# Plot both curves, with the AUC of each model in the legend.
plt.plot(fpr_bertweet, tpr_bertweet, label=f"BERTweet Model: {roc_index_bertweet:.3f}",
         color='red', lw=0.5)
plt.plot(fpr_roberta, tpr_roberta, label=f"RoBerta Model: {roc_index_roberta:.3f}",
         color='navy', lw=0.5)
# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='black', lw=0.5, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# The positive class in this notebook is "Relevant" (label 1), not a sentiment.
plt.title('Receiver operating characteristic for the "Relevant" class')
plt.legend(loc="lower right")
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Load the precomputed TF-IDF feature table and print a summary of it.
# NOTE(review): presumably one row per message, in the same order as
# hydrogen_small.csv - confirm, since the labels are joined by position below.
tfidf_df = pd.read_csv('tfidf_features_small.csv')
tfidf_df.info()

In [None]:
# Baseline: logistic regression on the TF-IDF features.
X = tfidf_df

# The labels are read again from the CSV, so y holds the original string
# labels here rather than the 0/1 encoding.
# NOTE(review): this rebinds df, X, y and the X_train/X_test/y_train/y_test
# names used by the transformer cells above; re-running those cells after
# this one will pick up these values.
df = pd.read_csv("hydrogen_small.csv")
y = df["label"].values

random_state = 42
test_set_size = 0.3
# Stratified split with the same seed and test fraction as the text split.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y,
    test_size=test_set_size,
    stratify=y,
    random_state=random_state,
)
model = LogisticRegression(random_state=random_state)

model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report
# Accuracy of the logistic regression on the training and the test partition.
print("Train accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))

# Per-class precision/recall/F1 report on the test partition.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Convert the string test labels to 0/1 so that "Relevant" is the positive class.
label_map = {'Irrelevant': 0, 'Relevant': 1}
# Use a loop variable other than `y` so the comprehension does not shadow the
# global label array of the same name.
y_test_bin = np.array([label_map[label] for label in y_test])

# The columns of predict_proba follow model.classes_, so look the positive
# class up by name instead of assuming it sits in column 1.
positive_col = list(model.classes_).index('Relevant')
y_pred_proba = model.predict_proba(X_test)[:, positive_col]

# ROC/AUC
fpr, tpr, thresholds = roc_curve(y_test_bin, y_pred_proba)
roc_auc = roc_auc_score(y_test_bin, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC (AUC={roc_auc:.2f})')
# Diagonal reference line.
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()


In [None]:
# Overlay the ROC curves of all three models on one figure.
plt.figure(figsize=(8, 6))

# BERTweet ROC
plt.plot(fpr_bertweet, tpr_bertweet,
         label=f'BERTweet Model (AUC = {roc_index_bertweet:.3f})',
         color='red', lw=1.5)
# RoBERTa ROC
plt.plot(fpr_roberta, tpr_roberta,
         label=f'Roberta Model (AUC = {roc_index_roberta:.3f})',
         color='blue', lw=1.5)
# Logistic regression ROC
plt.plot(fpr, tpr,
         label=f'Logistic Regression (AUC = {roc_auc:.3f})',
         color='black', lw=1.5)

# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='grey', linestyle='--', lw=1)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
# Name every model that is actually plotted.
plt.title("ROC Curve Comparison: BERTweet vs RoBERTa vs Logistic Regression")

plt.legend(loc="lower right")

plt.show()