# BERT Model
BERT (Bidirectional Encoder Representations from Transformers) is a bidirectional Transformer encoder for NLP. Here we also explore the Hugging Face libraries, which let us load a pretrained BERT model, fine-tune it, and perform NLP tasks such as text classification or text generation.

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from datasets import *

# Dataset
We build a dataset for classifying whether a text message is spam or ham (not spam).

In [None]:
# Load the raw spam/ham CSV. The file is decoded as latin1 instead of
# pandas' default UTF-8.
df = pd.read_csv("./data/spam.csv", encoding="latin1")

In [None]:
# Discard the three unnamed columns and give the two remaining columns
# descriptive names: v1 -> label, v2 -> text.
df = (
    df
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
    .rename(columns={"v1": "label", "v2": "text"})
)

In [None]:
# Peek at the message texts as a plain array.
df['text'].values

In [None]:
# Count how many rows carry each label value to see the class balance.
df['label'].value_counts()

In [None]:
# Encode the label column as integers: "ham" becomes 0 and every other
# value becomes 1.
df['label'] = df['label'].ne("ham").astype("int64")

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): no `stratify` argument is passed, so the label ratio of the
# two parts is left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Put each text column back next to its label column so that every split
# is a single two-column frame.
train, test = (
    pd.concat(list(pair), axis=1)
    for pair in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# Wrap the training frame in a Hugging Face Dataset, then drop the
# "__index_level_0__" column (the carried-over pandas index).
train = (
    Dataset
    .from_pandas(train, split="train")
    .remove_columns("__index_level_0__")
)
train

In [None]:
# Wrap the held-out frame in a Hugging Face Dataset, then drop the
# "__index_level_0__" column (the carried-over pandas index).
test = (
    Dataset
    .from_pandas(test, split="test")
    .remove_columns("__index_level_0__")
)

# Show one example row.
test[0]

# BERT from Hugging Face
We use a pretrained BERT model and fine-tune it on our dataset. The BERT models and tools are provided by Hugging Face.

In [None]:
# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Display the tokenizer object to inspect its configuration.
tokenizer

In [None]:
# Tokenize the first test message, passed as a one-element batch, to see
# what the tokenizer returns.
tokenizer([test['text'][0]])

In [None]:
def preprocessor(sentences):
    """Tokenize the ``text`` entry of ``sentences`` with truncation enabled.

    Args:
        sentences: A mapping with a ``"text"`` key whose value is handed to
            the module-level ``tokenizer``.

    Returns:
        Whatever ``tokenizer`` returns for that text.
    """
    texts = sentences['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Run the preprocessor over the training split, one batch of rows at a time.
preprocessed_train = train.map(preprocessor, batched=True)

In [None]:
# Display the tokenized training split.
preprocessed_train

In [None]:
# Run the preprocessor over the test split, one batch of rows at a time.
preprocessed_test = test.map(preprocessor, batched=True)

In [None]:
# Display the tokenized test split.
preprocessed_test

In [None]:
# Class names in id order; the reverse lookup is derived from the forward
# one so the two mappings cannot drift apart.
id2label = dict(enumerate(("ham", "spam")))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
from transformers import DataCollatorWithPadding

# Collator that uses the tokenizer to pad the examples of each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

In [None]:
# Pretrained BERT checkpoint with a two-class sequence-classification head;
# the label mappings are passed along with it.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
import evaluate

# Load the accuracy metric from the evaluate library.
accuracy = evaluate.load('accuracy')

In [None]:
from transformers import Trainer, TrainingArguments


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: The ``(logits, labels)`` pair the Trainer hands over
            after predicting on the evaluation dataset.

    Returns:
        The dict produced by ``accuracy.compute``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit of each example.
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)


train_args = TrainingArguments(
    output_dir="./model/bert",
    # NOTE(review): newer transformers releases spell this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",
    num_train_epochs=2,
)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=preprocessed_train,
    eval_dataset=preprocessed_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # Without this hook the evaluation reports only the loss and the
    # `accuracy` metric loaded earlier is never used.
    compute_metrics=compute_metrics,
)

## Fine Tuning
We fine-tune our model here.

In [None]:
# Run fine-tuning with the settings in train_args.
trainer.train()

In [None]:
# Evaluate the model on the eval dataset given to the Trainer.
trainer.evaluate()

In [None]:
# Model saving and loading
# Write the fine-tuned model and its tokenizer to the "model" directory.
# The prediction cell reloads both from that directory, so these calls must
# run before it.
model.save_pretrained("model")
tokenizer.save_pretrained("model")

## Prediction
Hugging Face's `pipeline` helper makes it easy to build an inference pipeline for NLP tasks. We pass it the fine-tuned model and its tokenizer.

In [None]:
from transformers import pipeline

# Read the model and the tokenizer back from the "model" directory.
tokenizer_load = AutoTokenizer.from_pretrained("model")
model_load = AutoModelForSequenceClassification.from_pretrained("model")

# Bundle both into a text-classification pipeline that accepts raw strings.
predictor = pipeline(
    task="text-classification",
    model=model_load,
    tokenizer=tokenizer_load,
)

In [None]:
# Classify a sample message with the pipeline.
predictor("Order Deliveries")