# Run Inference with a Model from Hugging Face

## Install and Import Dependencies

In [1]:
# Uncomment to install the dependencies
# !pip install numpy pandas torch transformers pinecone

In [2]:
from warnings import filterwarnings

from transformers import AutoTokenizer, AutoModelForSequenceClassification

from configs import (
  PINECONE_API_KEY, PINECONE_INDEX, TOXIC_NAMESPACE, BENIGN_NAMESPACE,
  PROMPT_TEMPLATE
)
from utils import PineconeInterface, Pipeline, get_device

# Suppress every Python warning for the rest of the session so that library
# warnings do not clutter the notebook output.
# NOTE(review): this also hides deprecation warnings — consider narrowing the
# filter with `category=` if those should stay visible.
filterwarnings("ignore")

## Connect to the Pinecone Client (If Using Few-Shot Learning)

In [3]:
# How many few-shot examples to use. A Pinecone client is only created here
# when at least one example is requested; otherwise `pc` is left as None.
num_examples = 0

if num_examples > 0:
  pc = PineconeInterface(PINECONE_API_KEY, PINECONE_INDEX)
else:
  pc = None

## Import Model and Tokenizer

In [4]:
# Hugging Face checkpoint or local path to the model and tokenizer.
# NOTE(review): this is an absolute path on one developer's machine — replace
# it with a Hub model id or your own local directory before running.
checkpoint = "/Users/naman/Workspace/models-tokenizers/toxicity-classifiers/roberta-toxigen"

# Load the tokenizer and the sequence-classification model from the same
# checkpoint so the two stay in sync.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

## Send Model to Device

In [5]:
# Ask the project helper which compute device to use and report the choice.
device = get_device()
print(f"Using {device} device")

# Move the model to that device. Because this is the last expression in the
# cell, its return value is echoed, which is what produces the architecture
# dump shown in the cell output.
model.to(device)

Using mps device


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

## Connect to Pinecone

In [6]:
# Make sure a Pinecone client exists before the pipeline is built. The earlier
# few-shot cell may have left `pc` as None (num_examples == 0), may already
# have connected, or may not have been run at all — so connect only when no
# client is available instead of always opening a second connection.
if globals().get("pc") is None:
  pc = PineconeInterface(PINECONE_API_KEY, PINECONE_INDEX)

## Initialize Pipeline

In [7]:
# Assemble the inference pipeline from the objects prepared in the cells above.
# NOTE(review): the argument roles below are inferred from the names; confirm
# against utils.Pipeline.
pipeline = Pipeline(
  model,  # sequence-classification model, already moved to `device`
  tokenizer,  # tokenizer loaded from the same checkpoint as the model
  pc,  # Pinecone client created above
  TOXIC_NAMESPACE,  # presumably the Pinecone namespace of toxic examples
  BENIGN_NAMESPACE,  # presumably the Pinecone namespace of benign examples
  PROMPT_TEMPLATE,  # prompt template imported from configs
  device  # device returned by get_device()
)

## Inference

In [8]:
# Two sample inputs: one hostile sentence and one affectionate sentence.
texts = ["I hate you shubham", "I love you shubham"]

# Score both texts with the pipeline's default settings. The cell echoes the
# returned list, which holds one float per input text.
pipeline(texts)

[0.9997259974479675, 0.9972090125083923]

In [9]:
# Score the same texts again, this time with num_samples=0.
# NOTE(review): presumably num_samples=0 turns off few-shot example retrieval —
# confirm in utils.Pipeline. The score for the second text differs sharply from
# the default call above, so the setting has a large effect on the result.
pipeline(texts, num_samples=0)

[0.9966585636138916, 0.0015695166075602174]