# Load and Preprocess Data

In [None]:
from helper import yuetal_data_preprocess

import torch
from transformers import RobertaTokenizer
from datasets import Dataset

# Shared tokenizer for the 'roberta-base' checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def tokenize(batch):
    """Encode (target, context) pairs as fixed-length RoBERTa inputs.

    ``batch`` must provide "target" and "context" entries. Each may be a
    single string or equally long lists of strings (for example what
    ``Dataset.map(..., batched=True)`` passes in).

    The tokenizer's ``__call__`` is used rather than ``encode_plus`` because
    ``encode_plus`` encodes one example at a time and does not treat a list
    of strings as a batch of sentences.

    Returns:
        The tokenizer encoding as PyTorch tensors, with special tokens added,
        padded/truncated to 512 tokens, including the attention mask.
    """
    return tokenizer(
        text=batch["target"],
        text_pair=batch["context"],
        add_special_tokens=True,
        return_tensors='pt',
        truncation=True,
        max_length=512,
        padding='max_length',
        return_attention_mask=True,
    )

# Root directory of the dataset files read below.
YU_DATA_PATH = '../reference/counter_context/data'

# Build the train and validation frames. Each call receives the gold and the
# silver JSONL path of one split; how the helper combines them is defined in
# helper.py (not visible here). The results are used as pandas DataFrames below.
train_df = yuetal_data_preprocess(YU_DATA_PATH + '/gold/train.jsonl', 
                                        YU_DATA_PATH + '/silver/train.jsonl')
val_df = yuetal_data_preprocess(YU_DATA_PATH + '/gold/val.jsonl', 
                                YU_DATA_PATH + '/silver/val.jsonl')

In [None]:
# Spot-check the "target" field of one training row, then preview the first rows.
print(train_df.iloc[2]["target"])
train_df.head()

In [None]:
import collections
import pandas as pd

# Count how many training examples carry each label value.
y_train_counter = collections.Counter(train_df["label"])
print("y_train_counter = ", y_train_counter)

# Same count for the validation split.
y_val_counter = collections.Counter(val_df["label"])
print("y_val_counter = ", y_val_counter)

# Share of label 0 among validation examples labelled 0 or 1; any other label
# value is left out of the denominator.
y_val_counter[0] / (y_val_counter[0] + y_val_counter[1])

## Up-sample minority class

In [None]:
from sklearn.utils import resample

# Separate majority and minority classes
# NOTE(review): label 0 is treated as the majority and label 1 as the minority;
# rows with any other label value end up in neither frame - confirm that is intended.
majority_train_df = train_df[train_df.label==0]
minority_train_df = train_df[train_df.label==1]

# Upsample minority class
upsampled_minority_train_df = resample(minority_train_df, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(majority_train_df),    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
# NOTE(review): in the cells visible here, the later Dataset.from_pandas call
# reads train_df, not balanced_train_df - confirm which one should be trained on.
balanced_train_df = pd.concat([majority_train_df, upsampled_minority_train_df])
 
# Show the per-label row counts after balancing.
balanced_train_df.label.value_counts()

In [None]:
# Preview the first rows of the balanced training frame.
balanced_train_df.head()

In [None]:
from datasets import load_dataset, DatasetDict, Dataset

# Wrap the pandas frames as Hugging Face Datasets.
# NOTE(review): this reads train_df, not the up-sampled balanced_train_df built
# in the preceding cell - confirm which frame is meant to be used for training.
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)

In [None]:
# Notebook output: show the training Dataset.
train_ds

In [None]:
from transformers import RobertaTokenizer

# Tokenizer for the 'roberta-base' checkpoint, shared by the cells below.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def tokenize(batch):
    """Tokenize the "text" entry of ``batch``.

    Special tokens are added, sequences are truncated, and padding is enabled
    (``padding=True``). Returns the tokenizer's encoding unchanged.
    """
    encoded_batch = tokenizer(
        batch["text"],
        add_special_tokens=True,
        truncation=True,
        padding=True,
    )
    return encoded_batch

# Tokenize each split. batch_size=None hands the whole split to `tokenize` as
# a single batch, so padding=True pads every example of a split to one length.
train_encoded = train_ds.map(tokenize, batched=True, batch_size=None)
# Fixed: the validation encoding was previously built from train_ds, which made
# the evaluation run on training data.
val_encoded = val_ds.map(tokenize, batched=True, batch_size=None)

In [None]:
# Notebook output: show the tokenized training Dataset.
train_encoded

In [None]:


def roberta_encode(df, tokenizer, max_seq_length=512):
    """Tokenize the ``text`` column of ``df`` into fixed-length model inputs.

    Args:
        df: DataFrame with a ``text`` column holding one string per row.
        tokenizer: Hugging Face-style tokenizer; it is called once on the
            whole list of texts.
        max_seq_length: Length every sequence is padded or truncated to.

    Returns:
        dict with two tensors of shape ``(len(df), max_seq_length)``:
        ``'input_word_ids'`` (token ids) and ``'input_mask'`` (attention mask,
        which distinguishes padding from non-padding positions).
    """
    # Take the column as plain Python strings. Iterating over
    # ``df[['text']].values`` would yield one-element arrays instead of strings.
    texts = df['text'].tolist()

    if not texts:
        # Nothing to encode: return correctly shaped empty tensors rather than
        # failing on an empty concatenation.
        empty = torch.empty((0, max_seq_length), dtype=torch.long)
        return {'input_word_ids': empty, 'input_mask': empty.clone()}

    encoded = tokenizer(
        texts,
        add_special_tokens=True,        # add the model's special tokens
        max_length=max_seq_length,
        padding='max_length',           # replaces deprecated pad_to_max_length
        truncation=True,                # explicit, so long texts cannot exceed max_length
        return_attention_mask=True,
        return_tensors='pt',
    )

    return {
        'input_word_ids': encoded['input_ids'],
        'input_mask': encoded['attention_mask'],
    }

def binarize_label(label):
    """Fold label 1 into label 0 and return the result as an ``int``.

    Per the original author's note, the "Neutral" label is combined with hate
    to form "not counter-hate". Every other label is returned unchanged after
    conversion to ``int``.
    """
    numeric_label = int(label)
    return 0 if numeric_label == 1 else numeric_label

# Encode the training texts; the result is a dict with 'input_word_ids' and
# 'input_mask' entries. The label/val/test steps below are currently disabled.
train = roberta_encode(train_df, tokenizer)
# train_df
# train_labels = train_df['label'].apply(binarize_label)

# val = roberta_encode(val_df, tokenizer)
# val_labels = val_df['label'].apply(binarize_label)

# test = roberta_encode(test_df, tokenizer)
# test_labels = test_df['label'].apply(binarize_label)

In [None]:



# train_df['text'] = texts
# print(train_df['text'][20].values)
# NOTE(review): `texts` is not assigned in any cell visible here (its only other
# mention is the commented-out line above), so this print presumably raises
# NameError on a fresh kernel - confirm where `texts` should come from.
print(texts[20])
# Show the "text" value of row 20 for comparison.
train_df.iloc[20].text

In [None]:
from datasets import load_dataset, DatasetDict, Dataset

# ds = DatasetDict()
# ds['train'] = load_dataset('json', data_files=data_path + '/gold/train.jsonl')
# ds['train']

In [None]:
# ds['train'] = ds['train'].rename_column('idx', 'input_ids')
# ds['train']

In [None]:
# Start from an empty DatasetDict.
ds = DatasetDict()
# NOTE(review): the loop body is a bare `train` expression, so each iteration
# has no effect and `ds` stays empty - this cell looks unfinished.
for i in range(len(train["input_word_ids"])):
    train

In [None]:

# Tokenize the first two "train" examples as a quick check.
# NOTE(review): no cell visible here stores a "train" split in `ds` - confirm
# this lookup can succeed before relying on the output.
print(tokenize(ds["train"][:2]))

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the argmax over the last axis is taken as the
            predicted class). Presumably the evaluation object that ``Trainer``
            passes to its ``compute_metrics`` callback.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"`` (weighted average).
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }

# The former bare `compute_metrics()` call was removed: the function requires
# `pred`, so calling it without arguments always raised TypeError.

# Train

In [None]:
# hide_output
from transformers import AutoModelForSequenceClassification

# Checkpoint name; also used further down to derive the output directory name.
model_ckpt = "roberta-base"
# Pretrained encoder with a sequence-classification head for three labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=3,
)

In [None]:
# Notebook output: number of tokenized training examples.
len(train_encoded)

In [None]:
from transformers import Trainer, TrainingArguments

batch_size = 64
# Log about once per pass over the training data. Clamp to at least 1 so that
# a training set smaller than one batch does not produce logging_steps=0.
logging_steps = max(1, len(train_encoded) // batch_size)
print(logging_steps)
# Output directory name derived from the checkpoint name.
model_name = f"{model_ckpt}-finetuned-counter-hate"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=False,
                                  log_level="error")

In [None]:
from transformers import Trainer

# Assemble the Trainer from the model, the training arguments, the metric
# callback, the encoded train/eval splits and the tokenizer, then fine-tune.
trainer = Trainer(model=model, args=training_args, 
                  compute_metrics=compute_metrics,
                  train_dataset=train_encoded,
                  eval_dataset=val_encoded,
                  tokenizer=tokenizer)
trainer.train()

# Evaluate

In [None]:
import torch
import numpy as np

# Load previously saved objects from the working directory.
# NOTE(review): torch.load unpickles the file contents - only load files from
# a trusted source.
train_hidden = torch.load("train_hidden.pt")
val_hidden = torch.load("val_hidden.pt")
# NOTE(review): the y_pred load below is disabled, yet a later cell reads
# `y_pred` - confirm where the predictions should come from.
# y_pred = torch.load("y_preds.pt")

# Inspect the loaded training object and its "input_ids" entry.
print(train_hidden)
train_hidden['input_ids']

In [None]:
# Notebook output: the "input_ids" entry of the first loaded training example.
train_hidden['input_ids'][0]

In [None]:
import collections
import pandas as pd

# Count label values in the loaded training object (labels converted via .numpy()).
y_train_counter = collections.Counter(train_hidden["label"].numpy())
print("y_train_counter = ", y_train_counter)

# Same count for the loaded validation object.
y_val_counter = collections.Counter(val_hidden["label"].numpy())
print("y_val_counter = ", y_val_counter)

# y_pred_counter = collections.Counter(y_pred)
# print("y_pred_counter = ", y_pred_counter)
# Share of label 0 among validation examples labelled 0 or 1; any other label
# value is left out of the denominator.
y_val_counter[0] / (y_val_counter[0] + y_val_counter[1])
# y_train_counter[0]

In [None]:
from transformers import RobertaTokenizer

# Tokenizer for the 'roberta-base' checkpoint, used by the inspection cells below.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
# Compare the token strings recovered from example 1's input ids with the
# "text" value stored for the same example.
print(tokenizer.convert_ids_to_tokens(train_hidden['input_ids'][1]))
print(train_hidden['text'][1])

In [None]:
# The same two sentences joined with a literal "[SEP]" versus "</s>".
text1 = "Listen to this wisdom. [SEP] Where the Fuck did you get that up arrow?"
text2 = "Listen to this wisdom. </s> Where the Fuck did you get that up arrow?"

# Print both tokenizations to compare how the tokenizer handles each separator.
print(tokenizer.convert_ids_to_tokens(tokenizer(text1).input_ids))
print(tokenizer.convert_ids_to_tokens(tokenizer(text2).input_ids))

In [None]:
# Pair each special token with its id, sort the pairs by id, and show them as
# a transposed two-row table.
tokens2ids = list(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))
data = sorted(tokens2ids, key=lambda x : x[-1])
df = pd.DataFrame(data, columns=["Special Token", "Special Token ID"])
df.T

In [None]:
# NOTE(review): this passes the loaded `train_hidden` object itself to the
# tokenizer, whereas the other calls in this file pass text - confirm this is
# intended and that the tokenizer accepts this input.
tokenizer(train_hidden)

In [None]:
# Remap predictions: 1 becomes 0, every other value (including 0) becomes 2.
# NOTE(review): `y_pred` is only assigned in a commented-out line in the cells
# visible here. Also, binarize_label earlier in this file maps 1 -> 0 but keeps
# 0 as 0, while this mapping sends 0 to 2 - confirm which scheme is intended.
binary_y_pred = [0 if x == 1 else 2 for x in y_pred]

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score



def plot_confusion_matrix(y_preds, y_true, title):
    """Plot a confusion matrix normalized over the true labels and save it.

    Args:
        y_preds: Predicted labels (first positional argument).
        y_true: Gold labels (second positional argument).
        title: Used both as the plot title and as the file name passed to
            ``savefig``.
    """
    cm = confusion_matrix(y_true, y_preds, normalize="true")
    fig, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=True)
    # Use the explicit axes/figure handles rather than pyplot's implicit
    # "current" ones, so the title and saved image always belong to this plot.
    ax.set_title(title)
    fig.savefig(title)


# NOTE(review): bare expression in the middle of the cell; assuming standard
# notebook display of only the last expression, its value is never shown.
train_hidden.features["label"]

# Gold validation labels as an array, compared against the remapped predictions.
y_valid = np.array(val_hidden["label"])
plot_confusion_matrix(binary_y_pred, y_valid, "confusion_matrix")

# Fraction of predictions that match the gold labels (normalize=True).
print(accuracy_score(y_valid, binary_y_pred, normalize=True))