# Train a Model from Hugging Face

## Install and Import Dependencies

In [9]:
from warnings import filterwarnings

import pandas as pd
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from configs import (
  TOXIC_DB_PATH, BENIGN_DB_PATH, PROMPT_TEMPLATE, EPOCHS, BATCH_SIZE,
  LEARNING_RATE, FACTOR, PATIENCE, THRESHOLD
)
from utils import DatabaseInterface, Trainer, get_device

filterwarnings("ignore")

## Initialize Database Interfaces

In [4]:
toxic_db = DatabaseInterface(n_neighbors=2, data_path=TOXIC_DB_PATH)
benign_db = DatabaseInterface(n_neighbors=2, data_path=BENIGN_DB_PATH)

## Import Model and Tokenizer from Hugging Face

In [5]:
# Hugging Face checkpoint or local path to the model and tokenizer
checkpoint = "s-nlp/roberta_toxicity_classifier"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of the model checkpoint at s-nlp/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Initialize Optimizer and Scheduler

In [6]:
optimizer = AdamW(
  model.parameters(),
  lr=LEARNING_RATE
)
scheduler = ReduceLROnPlateau(
  optimizer,
  mode="min",
  factor=FACTOR,
  patience=PATIENCE,
  threshold=THRESHOLD,
)

## Load Data

In [7]:
# Path to the csv file
data_path = "/Users/naman/Workspace/Data/Toxic-Content/toxigen-data/processed.csv"

df = pd.read_csv(data_path)
df.tail()

Unnamed: 0,text,label
8955,we are racist because we put them in their pla...,1.0
8956,"in order to work with mexican rebels, i think ...",0.466667
8957,I'd like to thank you.. I don't think I'd have...,0.2
8958,have you had your medication?,0.466667
8959,we are the ones who came to this country and m...,0.466667


In [None]:
threhshold = .3
texts, labels = df["text"].tolist(), df["label"].map(lambda x: 1 if x > threhshold else 0).tolist()
texts[-5:], labels[-5:]

(['we are racist because we put them in their place!\n- that is why we need to make them pay and take their jobs',
  'in order to work with mexican rebels, i think you will need to work on your spanish or ask your friend to help you\\n',
  "I'd like to thank you.. I don't think I'd have made it if I didn't find MGTOW! You guys are great!",
  'have you had your medication?',
  'we are the ones who came to this country and made it our home. we didnt come begging for a hand out and we sure didnt come here just'],
 [1, 0, 0, 0, 0])

### Train-Test Split

In [12]:
split_ind = int(len(texts) * .8)
train_texts, train_labels = texts[:split_ind], labels[:split_ind]
test_texts, test_labels = texts[split_ind:], labels[split_ind:]

## Initalize Trainer

In [10]:
trainer = Trainer(
  model=model,
  tokenizer=tokenizer,
  optimizer=optimizer,
  scheduler=scheduler,
  toxic_db=toxic_db,
  benign_db=benign_db,
  prompt_template=PROMPT_TEMPLATE
)

## Get Device and Train

In [None]:
device = get_device()
print(f"Using {device} device")

trainer.train(
  texts=train_texts,
  labels=train_labels,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  device=device
)

## Save Trained Model and Tokenizer

In [10]:
save_path = "/Users/naman/Workspace/models-tokenizers/toxicity-classifiers/roberta-toxigen"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('/Users/naman/Workspace/models-tokenizers/toxic-classifiers/roberta-toxigen/tokenizer_config.json',
 '/Users/naman/Workspace/models-tokenizers/toxic-classifiers/roberta-toxigen/special_tokens_map.json',
 '/Users/naman/Workspace/models-tokenizers/toxic-classifiers/roberta-toxigen/vocab.json',
 '/Users/naman/Workspace/models-tokenizers/toxic-classifiers/roberta-toxigen/merges.txt',
 '/Users/naman/Workspace/models-tokenizers/toxic-classifiers/roberta-toxigen/added_tokens.json',
 '/Users/naman/Workspace/models-tokenizers/toxic-classifiers/roberta-toxigen/tokenizer.json')