In [1]:
# Mount Google Drive
# The Drive is mounted at the absolute path /content/gdrive; later cells build
# their paths from the relative prefix "gdrive/...", so they resolve to this
# mount only while the working directory is /content.
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# Show the GPU attached to this runtime (device name, driver/CUDA version, memory in use)
!nvidia-smi

Sun Jun 11 19:53:51 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P0    26W /  70W |   3845MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Install Libraries
%%capture
!pip install transformers
!pip install transformers[sentencepiece]
!pip install datasets
!pip install tweet-preprocessor

In [4]:
# import libraries
import os
import sys
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM

In [5]:
# Runtime device and Drive locations

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
  device = 'cuda'

# Project folder on the mounted Drive. This is a relative path, so it is
# resolved against the current working directory.
root_dir = "gdrive/My Drive/Master_Thesis/"
data_dir = os.path.join(root_dir, 'data/Custom/')
save_dir = os.path.join(root_dir, 'predictions/')

# Test sets (the *_dir names hold CSV file paths)
# For base comparison
CONAN_test_dir = os.path.join(data_dir, 'CONAN_test.csv')
# For comparison with ChatGPT
CONAN_test_small_dir = os.path.join(data_dir, 'T8-S10.csv')
# For final comparison, sexism only test set
EDOS_test_dir = os.path.join(data_dir, 'EDOS_sexist.csv')

In [6]:
# import utilities
# Add the project folder (under /content/) to the module search path so that
# the project modules `utilities` and `prediction` imported below can be found
# (presumably utilities.py and prediction.py sit directly in root_dir — confirm).
sys.path.append(os.path.join("/content/", root_dir))

from utilities import get_model_path, cleanup, save_prediction
from prediction import prepare_input, predict, post_processing, prepare_input_category

In [7]:
# choose model
#
# Each entry describes one fine-tuned checkpoint:
#   model_type      - "GPT" or "BART"; selects the model class when loading
#   load_model_name - name handed to get_model_path / save_prediction
#   version         - version tag handed to get_model_path / save_prediction
# NOTE(review): version tags use both "-" and "--" between date and time;
# presumably they mirror the names on disk — confirm before normalising.

gpt2_small = {
  "model_type": "GPT",
  "load_model_name": "gpt2",
  "version": "11,03,2023-16,02"
}

# The two gpt2-medium entries previously carried each other's contents
# (the variable called gpt2_medium pointed at "gpt2-medium-category" and
# vice versa). Each name now matches the checkpoint it describes.
gpt2_medium = {
  "model_type": "GPT",
  "load_model_name": "gpt2-medium",
  "version": "16,05,2023-21,09"
}

gpt2_medium_category = {
  "model_type": "GPT",
  "load_model_name": "gpt2-medium-category",
  "version": "11,06,2023--00,31"
}

bart_small = {
  "model_type": "BART",
  "load_model_name": "bart",
  "version": "17,03,2023-22,54"
}

bart_large = {
  "model_type": "BART",
  "load_model_name": "bart-large",
  "version": "18,05,2023--19,18"
}


# Models to evaluate in this run. This selects the same checkpoint as before
# ("gpt2-medium-category"), now under its correct name.
models = [gpt2_medium_category]

# Test sets to run each model on ('Sexism' is currently switched off).
datasets = [
    'Base',
    'Small',
    # 'Sexism'
    ]

In [8]:
# load local models
def load_model(model_path, model_type):
  """Load a fine-tuned model and its tokenizer from `model_path`.

  Args:
    model_path: location passed to `from_pretrained` for both the model and
      the tokenizer.
    model_type: "GPT" (loaded as a causal LM) or "BART" (loaded as a
      sequence-to-sequence LM).

  Returns:
    A `(model, tokenizer)` tuple; the model has been moved to `device`.

  Raises:
    ValueError: if `model_type` is neither "GPT" nor "BART". Previously this
      case fell through and failed later with an UnboundLocalError.
  """
  # Validate first, so an unknown type fails immediately with a clear message.
  if model_type not in ("GPT", "BART"):
    raise ValueError(
        f"Unknown model_type {model_type!r}; expected 'GPT' or 'BART'")

  if model_type == "GPT":
    model_cls = AutoModelForCausalLM
  else:
    model_cls = AutoModelForSeq2SeqLM

  model = model_cls.from_pretrained(model_path).to(device)
  tokenizer = AutoTokenizer.from_pretrained(model_path)

  if model_type == "GPT":
    # Pad on the left for the causal LM so that, in a padded batch, the
    # generated tokens directly follow the end of each prompt.
    tokenizer.padding_side = "left"

  # Use EOS as the padding token (original note: "to avoid an error").
  # NOTE(review): this is applied to BART as well, overriding whatever pad
  # token its tokenizer ships with — confirm this matches training.
  tokenizer.pad_token = tokenizer.eos_token
  cleanup()

  return model, tokenizer

In [9]:
# Read csv file into dataframe
# Target labels handed to prepare_input_category.
types = ["MIGRANTS", "POC", "LGBT+", "MUSLIMS", "WOMEN", "JEWS", "other", "DISABLED"]

def load_dataset(dataset_name, model_type, model_name):
  """Read one test set from CSV and prepare it as model input.

  Args:
    dataset_name: 'Base' (CONAN test set), 'Small' (small CONAN subset) or
      'Sexism' (EDOS test set).
    model_type: forwarded to `prepare_input`.
    model_name: when equal to 'gpt2-medium-category', inputs are built with
      `prepare_input_category` instead of `prepare_input`.

  Returns:
    A `(df, ds)` tuple: the prepared DataFrame and a `Dataset` built from it.

  Raises:
    ValueError: if `dataset_name` is not one of the supported names.
      Previously this case failed later with an unbound `df_raw`.
  """
  if dataset_name == 'Base':
    df_raw = pd.read_csv(CONAN_test_dir)
  elif dataset_name == 'Small':
    df_raw = pd.read_csv(CONAN_test_small_dir)
  elif dataset_name == 'Sexism':
    df_raw = pd.read_csv(EDOS_test_dir)
    # Bring the EDOS frame to the column layout used by the other test sets:
    # "text" becomes "Hate_Speech", and Target / Target_2 are derived or filled.
    df_raw = df_raw.rename(columns={"text": "Hate_Speech"})
    df_raw["Target"] = df_raw["labels"].map(lambda x: "WOMEN" if x == "sexist" else "other")
    df_raw["Target_2"] = "None"
  else:
    raise ValueError(
        f"Unknown dataset_name {dataset_name!r}; expected 'Base', 'Small' or 'Sexism'")

  # The category-conditioned model needs inputs built with the target types.
  if model_name == 'gpt2-medium-category':
    df = prepare_input_category(df_raw, types)
  else:
    df = prepare_input(df_raw, model_type)
  ds = Dataset.from_pandas(df)

  return df, ds

In [10]:
# Predict
# For every selected model: load it once, then generate, post-process and save
# predictions for every selected test set.
for m in models:
  # Drop the references to the previous model/tokenizer BEFORE loading the
  # next one. Otherwise the old model stays alive (and on the device) while
  # the new one is loaded, so two models are held at the same time.
  # cleanup() is a project helper (presumably it frees cached memory — confirm).
  model = tokenizer = None
  cleanup()

  model_path = get_model_path(root_dir, m["load_model_name"], m["version"])
  model, tokenizer = load_model(model_path, m["model_type"])
  for dataset in datasets:
    df, ds = load_dataset(dataset, m["model_type"], m["load_model_name"])
    # Beam-search generation settings are fixed for all models and test sets.
    df["Prediction"] = predict(
        ds,
        model,
        tokenizer,
        batchsize=16,
        max_gen_len=128,
        model_type=m["model_type"],
        num_beams=3,
        no_repeat_ngram_size=3,
        num_return_sequences=1
        )
    cleanup()
    # Post Processing
    df = post_processing(df, m["model_type"])
    cleanup()

    # Save predictions
    save_prediction(df, save_dir, m["load_model_name"], m["version"], dataset)
    cleanup()


  0%|          | 0/150 [00:00<?, ?it/s]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  0%|          | 0/5 [00:00<?, ?it/s]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
