In [1]:
# Mount Google Drive
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!nvidia-smi

Mon May 22 19:44:36 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Install Libraries
%%capture
!pip install transformers
!pip install transformers[sentencepiece]
!pip install datasets
!pip install tweet-preprocessor

In [66]:
# set data paths
import os
import sys
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, TrainingArguments, Trainer

In [34]:
# set data paths
device = 'cuda' if torch.cuda.is_available() else 'cpu'

root_dir = "gdrive/My Drive/Master_Thesis/"
test_dir = os.path.join(root_dir, 'data/Custom/CONAN_test.csv')
test_small_dir = os.path.join(root_dir, 'data/Custom/T8-S10.csv')
save_dir = os.path.join(root_dir, 'predictions/')

In [105]:
df_dir = os.path.join(root_dir, 'data/Custom/hate_counter_tags.csv')
test_dir = os.path.join(root_dir, 'predictions/bart-large_18,05,2023--19,18_TEST_18,05,2023--21,25.csv')
test_save = os.path.join(root_dir, 'models')

In [73]:
df_raw = pd.read_csv(df_dir)
counter = df_raw['Counter_Speech']
hate = df_raw['Hate_Speech']

In [74]:
counter = pd.DataFrame(counter)
counter = counter.drop_duplicates()
counter = counter.rename(columns={"Counter_Speech": "text"})
counter['label'] = 1

hate = pd.DataFrame(hate)
hate = hate.drop_duplicates()
hate = hate.rename(columns={"Hate_Speech": "text"})
hate['label'] = 0

In [81]:
df_new = pd.concat([counter, hate], sort=False)
ds = Dataset.from_pandas(df_new)
dataset = ds.train_test_split(test_size=0.2)

In [69]:
tokenizer = AutoTokenizer.from_pretrained("ThinkCERCA/counterargument_hugging")

if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})

Some layers from the model checkpoint at ThinkCERCA/counterargument_hugging were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ThinkCERCA/counterargument_hugging.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [82]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [83]:
tokenized_ds = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/9954 [00:00<?, ? examples/s]

Map:   0%|          | 0/2489 [00:00<?, ? examples/s]

In [84]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

id2label = {0: "non-counter-argument", 1: "counter-argument"}
label2id = {"non-counter-argument": 0, "counter-argument": 1}

from transformers import create_optimizer
import tensorflow as tf

batch_size = 16
num_epochs = 5
batches_per_epoch = len(dataset["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [85]:
model = TFAutoModelForSequenceClassification.from_pretrained("ThinkCERCA/counterargument_hugging", num_labels=2, id2label=id2label, label2id=label2id)
model.resize_token_embeddings(len(tokenizer))

Some layers from the model checkpoint at ThinkCERCA/counterargument_hugging were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ThinkCERCA/counterargument_hugging.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


<tf.Variable 'tf_bert_for_sequence_classification_10/bert/embeddings/word_embeddings/weight:0' shape=(30522, 768) dtype=float32, numpy=
array([[-0.01018257, -0.06154883, -0.02649689, ..., -0.01985357,
        -0.03720997, -0.00975152],
       [-0.01170495, -0.06002603, -0.03233192, ..., -0.01681456,
        -0.04009988, -0.0106634 ],
       [-0.01975381, -0.06273633, -0.03262176, ..., -0.01650258,
        -0.04198876, -0.00323178],
       ...,
       [-0.02176224, -0.0556396 , -0.01346345, ..., -0.00432698,
        -0.0151355 , -0.02489496],
       [-0.04617237, -0.05647721, -0.00192082, ...,  0.01568751,
        -0.01387033, -0.00945213],
       [ 0.00145601, -0.08208051, -0.01597912, ..., -0.00811687,
        -0.04746607,  0.07527421]], dtype=float32)>

In [86]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_ds["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_ds["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [87]:
import tensorflow as tf

model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [89]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f4f005b27d0>

In [100]:
from tqdm import tqdm
import statistics

def argument_type(df):
#   model = AutoModelForSequenceClassification.from_pretrained("ThinkCERCA/counterargument_hugging", from_tf=True).to(device)
#   tokenizer = AutoTokenizer.from_pretrained("ThinkCERCA/counterargument_hugging")
  #model = AutoModelForSequenceClassification.from_pretrained("chkla/roberta-argument").to(device)
  #tokenizer = AutoTokenizer.from_pretrained("chkla/roberta-argument")

  #if tokenizer.pad_token is None:
   #   tokenizer.add_special_tokens({'pad_token': '[PAD]'})
   #   model.resize_token_embeddings(len(tokenizer))

  print('-'*80)
  print('Calculating Argument Type of predictions')

  results = []
  bs = 8
  max_gen_le = 256
  threshold = 0.95
  soft = True

  # prepare the input
  inputs = df["text"].tolist()
  # Batch Evaluation
  for i in tqdm(range(0, len(inputs), bs)):
    batch = tokenizer(
        inputs[i:i + bs],
        return_tensors='tf',
        padding=True)
        
    with torch.inference_mode():
      logits = model(**batch).logits
      if soft:
        result = tf.nn.softmax(logits, -1)[:, 1].cpu().numpy()
      else:
        result = (logits[:, 1] > threshold).cpu().numpy()
    
    results.extend([1 - item for item in result])
    average = statistics.fmean(results)

  return results, average

In [101]:
_, counter_average = argument_type(counter)
_, hate_average = argument_type(hate)

--------------------------------------------------------------------------------
Calculating Argument Type of predictions


Instructions for updating:
Use tf.identity with explicit device placement instead.
100%|██████████| 996/996 [03:09<00:00,  5.26it/s]


--------------------------------------------------------------------------------
Calculating Argument Type of predictions


100%|██████████| 560/560 [01:44<00:00,  5.35it/s]


In [102]:
counter_average

0.01054696829499705

In [104]:
hate_average

0.9794917911407902

In [123]:
test_raw = pd.read_csv(test_dir)
prediction = test_raw['Prediction']
prediction = pd.DataFrame(prediction)
prediction = prediction.rename(columns={"Prediction": "text"})

for idx, pred in enumerate(prediction["text"].tolist()):
  if pred != pred:
    prediction.loc[idx, "text"] = "None"

In [124]:
_, pred_average = argument_type(prediction)

--------------------------------------------------------------------------------
Calculating Argument Type of predictions


100%|██████████| 299/299 [00:55<00:00,  5.34it/s]


In [125]:
pred_average

0.03243563149043586

In [126]:
from datetime import datetime

def get_datetime(format):
    # datetime object containing current date and time
    now = datetime.now()
    # dd/mm/YY H:M:S
    dt = now.strftime(format)
    return dt
    
save_directory = os.path.join(root_dir, 'models/')
model_name = "counter-speech classifier"
pt_save_directory = os.path.join(save_directory, model_name)
pt_save_directory = os.path.join(pt_save_directory, get_datetime("%d,%m,%Y--%H,%M"))

tokenizer.save_pretrained(pt_save_directory)
model.save_pretrained(pt_save_directory)