# Train a Model from Hugging Face

## Install and Import Dependencies

In [1]:
# Uncomment to install the dependencies into the running kernel's environment
# %pip install numpy pandas torch transformers pinecone

In [2]:
import os
from warnings import filterwarnings

import pandas as pd
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from configs import (
  PINECONE_API_KEY, PINECONE_INDEX, TOXIC_NAMESPACE, BENIGN_NAMESPACE,
  EPOCHS, BATCH_SIZE, LEARNING_RATE, FACTOR, PATIENCE, THRESHOLD
)
from utils import PineconeInterface, Trainer

filterwarnings("ignore")

## Connect to Pinecone client (if using few-shot learning)

In [3]:
# How many examples to use for few-shot learning (0 disables it).
num_examples = 0

# The Pinecone interface is only constructed when few-shot examples are
# requested; otherwise `pc` stays None.
if num_examples > 0:
  pc = PineconeInterface(PINECONE_API_KEY, PINECONE_INDEX)
else:
  pc = None

## Import Model and Tokenizer from Hugging Face

In [None]:
# Either a Hugging Face Hub checkpoint id or a local directory holding
# both the model and the tokenizer.
checkpoint = "s-nlp/roberta_toxicity_classifier"

# Tokenizer and sequence-classification model come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
  pretrained_model_name_or_path=checkpoint
)

## Initialize Optimizer and Scheduler

In [5]:
# AdamW over every model parameter; the learning rate comes from configs.
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

# Plateau scheduler in "min" mode: FACTOR, PATIENCE and THRESHOLD (all from
# configs) control how and when the learning rate is reduced once the
# monitored value stops decreasing.
scheduler = ReduceLROnPlateau(
  optimizer=optimizer, mode="min", factor=FACTOR, patience=PATIENCE, threshold=THRESHOLD
)

## Load Data

In [None]:
# Path to the csv file. The default is the original local location; set the
# TOXIGEN_DATA_PATH environment variable to point at the file on another
# machine without editing the notebook.
data_path = os.environ.get(
  "TOXIGEN_DATA_PATH",
  "/Users/naman/Workspace/Data/Toxic-Content/toxigen-data/processed.csv"
)

# Later cells read the "text" and "label" columns of this frame.
df = pd.read_csv(data_path)
df.tail()

In [None]:
# Raw text column as a plain Python list.
texts = df["text"].tolist()
# Binarize the label column: values strictly greater than 0.5 become 1,
# everything else becomes 0.
labels = df["label"].gt(0.5).astype(int).tolist()
texts[-5:], labels[-5:]

## Initialize Trainer

In [8]:
# Bundle the model, tokenizer, optimizer and LR scheduler with the optional
# Pinecone interface (`pc` is None when num_examples is 0) and the two
# namespace constants from configs.
# NOTE(review): all arguments are positional, so their order must match the
# Trainer constructor in utils (not visible here) — confirm before reordering.
trainer = Trainer(
  model,
  tokenizer,
  optimizer,
  scheduler,
  pc,
  TOXIC_NAMESPACE,
  BENIGN_NAMESPACE
)

In [None]:
# Run training on the text/label lists, with the batch size and epoch count
# from configs and the few-shot example count chosen earlier.
# NOTE(review): presumably updates `model` in place, since the following
# section saves `model` directly — confirm against utils.Trainer.train.
trainer.train(
  texts,
  labels,
  BATCH_SIZE,
  EPOCHS,
  num_examples
)

## Save Trained Model and Tokenizer

In [None]:
# Output directory for the model and tokenizer. The default is the original
# local location; set the TOXIC_MODEL_SAVE_PATH environment variable to save
# elsewhere without editing the notebook.
save_path = os.environ.get(
  "TOXIC_MODEL_SAVE_PATH",
  "/Users/naman/Workspace/models-tokenizers/toxic-classifiers/roberta-toxigen"
)
# Both artifacts go to the same directory so it can be reloaded as a single checkpoint.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)