# Train a Model from Hugging Face

## Install and Import Dependencies

In [1]:
# Uncomment to install the dependencies.
# Prefer the %pip magic over !pip so the packages land in the running kernel's environment.
# %pip install numpy pandas torch transformers pinecone

In [2]:
import os
from warnings import filterwarnings

import pandas as pd
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from configs import (
  PINECONE_API_KEY, PINECONE_INDEX, TOXIC_NAMESPACE, BENIGN_NAMESPACE,
  EPOCHS, BATCH_SIZE, LEARNING_RATE, FACTOR, PATIENCE, THRESHOLD
)
from utils import PineconeInterface, Trainer, get_device

# Install a blanket "ignore" filter: every warning category is suppressed from
# here on, so deprecation notices from the libraries above are hidden as well.
filterwarnings(action="ignore")

## Connect to Pinecone client (if using few-shot learning)

In [3]:
# How many few-shot examples to use; 0 turns few-shot learning off.
num_examples = 0

# The Pinecone connection is only opened when few-shot examples are requested;
# otherwise `pc` stays None.
if num_examples > 0:
  pc = PineconeInterface(PINECONE_API_KEY, PINECONE_INDEX)
else:
  pc = None

## Import Model and Tokenizer from Hugging Face

In [None]:
# Identifier of the pretrained weights: either a Hugging Face checkpoint name
# or a local path holding both the model and its tokenizer.
checkpoint = "s-nlp/roberta_toxicity_classifier"

# Both objects are loaded from the same checkpoint so they stay in sync.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

## Initialize Optimizer and Scheduler

In [5]:
# AdamW updates every parameter of the model, starting from the configured learning rate.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# mode="min": the scheduler lowers the learning rate (scaled by FACTOR) once the
# monitored value has stopped decreasing for PATIENCE steps, using THRESHOLD to
# decide what counts as an improvement.
scheduler = ReduceLROnPlateau(
  optimizer, mode="min", factor=FACTOR, patience=PATIENCE, threshold=THRESHOLD
)

## Load Data

In [6]:
# Path to the csv file. Set the TOXIGEN_DATA_PATH environment variable to point
# the notebook at a different copy of the data; when it is unset, the original
# location is used, so existing setups keep working unchanged.
data_path = os.environ.get(
  "TOXIGEN_DATA_PATH",
  "/Users/naman/Workspace/Data/Toxic-Content/toxigen-data/processed.csv"
)

df = pd.read_csv(data_path)

# Fail fast with a readable message: the label-preparation cell below reads
# exactly these two columns.
missing_columns = {"text", "label"} - set(df.columns)
assert not missing_columns, f"{data_path} is missing columns: {sorted(missing_columns)}"

# Show the last rows as a quick sanity check of the loaded frame.
df.tail()

Unnamed: 0,text,label
8955,we are racist because we put them in their pla...,1.0
8956,"in order to work with mexican rebels, i think ...",0.466667
8957,I'd like to thank you.. I don't think I'd have...,0.2
8958,have you had your medication?,0.466667
8959,we are the ones who came to this country and m...,0.466667


In [7]:
# Raw strings go to the trainer unchanged.
texts = df["text"].tolist()

# Binarize the continuous label score: strictly above 0.5 becomes class 1,
# everything else (including exactly 0.5) becomes class 0.
labels = [1 if score > .5 else 0 for score in df["label"]]

# Preview the last five pairs to compare against the frame shown above.
texts[-5:], labels[-5:]

(['we are racist because we put them in their place!\n- that is why we need to make them pay and take their jobs',
  'in order to work with mexican rebels, i think you will need to work on your spanish or ask your friend to help you\\n',
  "I'd like to thank you.. I don't think I'd have made it if I didn't find MGTOW! You guys are great!",
  'have you had your medication?',
  'we are the ones who came to this country and made it our home. we didnt come begging for a hand out and we sure didnt come here just'],
 [1, 0, 0, 0, 0])

## Initialize Trainer

In [8]:
# Hand the trainer everything created so far: the model/tokenizer pair, the
# optimizer and its scheduler, plus the Pinecone handle (None when
# num_examples is 0) and the two namespace names from the config.
trainer = Trainer(
  model, tokenizer, optimizer, scheduler,
  pc, TOXIC_NAMESPACE, BENIGN_NAMESPACE
)

## Get Device and Train

In [9]:
# Report which device get_device() selected. Note that `device` is only printed
# in this cell; it is not passed to the trainer here.
device = get_device()
print(f"Using {device} device")

# Start training on the prepared texts/labels with the configured batch size
# and epoch count; num_examples is the few-shot example count set earlier.
trainer.train(texts, labels, BATCH_SIZE, EPOCHS, num_examples)

Using mps device


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch [1] Batch [6/1120] Loss [ 0.6615]

KeyboardInterrupt: 

## Save Trained Model and Tokenizer

In [None]:
# Directory that receives the trained model and tokenizer files. Set the
# TOXIC_CLASSIFIER_SAVE_PATH environment variable to write somewhere else;
# when it is unset, the original location is used.
save_path = os.environ.get(
  "TOXIC_CLASSIFIER_SAVE_PATH",
  "/Users/naman/Workspace/models-tokenizers/toxic-classifiers/roberta-toxigen"
)

# Save both into the same directory so the result can be reloaded as a single
# checkpoint path, as the model-loading cell does.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)