# Text Classification 

In [None]:
# Datasets is a library that allows easy access to different datasets.
# These could be NLP tasks or computer vision or audio
from datasets import list_datasets
import pandas as pd
import matplotlib.pyplot as plt
import transformers
import re
import torch

#list_datasets()
pd.set_option('max_colwidth', 250)

In [None]:
# I will be using the IMDB dataset
from datasets import load_dataset

imdb = load_dataset('imdb')
imdb

# I can see that this is stored as a dataset dict, which is similar to a python dictionary.

# Each key corresponds to a different split. These are train, test, and unsupervised.

In [None]:
imdb['train'][0]
# From the first review (index 0), I can tell that the review was negative

In [None]:
# Load the first three entries from the test split
# I can also see that all of these are negative
imdb['test'][:3]

In [None]:
# Normally I would like to work with the entire training dataset. But training this-
# would take over two hours. 

# To help reduce this time I will reduce the dataset to only 2000 entries.
imdb['train'] = imdb['train'].shuffle(seed=1).select(range(2000))
imdb['train']

In [None]:
# I'm going to take 1,600 of them for the training dataset and put 400 towards a
# validation dataset.

# The reason I want a validation dataset is that it will help me get an idea
# of how well the model is training.

# A fixed seed keeps the 80/20 split reproducible from run to run, in line with
# the seeded shuffles used for the other subsets in this notebook.
imdb_train_validation = imdb['train'].train_test_split(train_size=0.8, seed=1)
imdb_train_validation

In [None]:
imdb_train_validation['test']

In [None]:
# I can now create my validation dataset
imdb_train_validation['validation'] = imdb_train_validation.pop('test')
imdb_train_validation

In [None]:
# Now, because the dataset dict is like a python dictionary, I can use my newly formed-
# IMDB train validation dataset dict and update the IMDB dataset dict with it. 

# This means that I will overwrite any current splits or keys with the same name.
imdb.update(imdb_train_validation)
imdb

In [None]:
# I will now reduce the test set so that it has around 400 entries
imdb['test'] = imdb['test'].shuffle(seed=1).select(range(400))

In [None]:
# Since the unsupervised values are not useful, I will delete them 
imdb.pop('unsupervised')

In [None]:
# One of the nice things about the huggingFace lib is that I can convert it to pandas-
# so that I can visualize the dataset 
imdb.set_format('pandas')

df = imdb['train'][:]
df.sample(frac=1, random_state=1).head(10)

In [None]:
# Looking at the 1st review
df.loc[0, 'text']

In [None]:
# Sometimes the text has HTML line-break tags and I want to remove them.
# Each tag is replaced with a space (not an empty string) so the words on either
# side of the break are not glued together, and it is matched literally rather
# than as a regular expression.
# NOTE: this only cleans the pandas copy `df` used for exploration; `df` is never
# written back to `imdb`, so the text that is tokenized later still has the tags.
df['text'] = df['text'].str.replace('<br />', ' ', regex=False)
df.loc[0, 'text']

In [None]:
# I want to make sure that I have a balanced dataset. 
# This means that I want a similar ratio of positive to negative reviews
df.label.value_counts()

In [None]:
# Here I will create a boxplot to see if I can see any patterns for whether reviews
# are labeled as a 0 or a 1.

# I can see that I have a similar distribution for both

# Count the whitespace-separated words in each review.
df["Words per review"] = df['text'].str.split().str.len()

# `color` below is just the boxplot keyword argument; no import is needed for it.
df.boxplot("Words per review", by="label", grid=False, showfliers=False, color='black')

# Clear the automatic "grouped by" super-title and the x-axis label.
plt.suptitle("")
plt.xlabel("")
plt.show()

In [None]:
# Looking at reviews that are less than 200 characters long
# 0 = negative | 1 = positive
df[df.text.str.len() < 200]

In [None]:
# Resetting the dataset back to the original form
imdb.reset_format()

### Tokenizer

* The next thing I want to do is tokenize the text so that I can convert the reviews from words to IDs. 

In [None]:
from transformers import AutoTokenizer

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(batch):
    """Tokenize the ``'text'`` entry of a batch with the notebook's tokenizer.

    Intended to be handed to the datasets ``map`` method. Padding and
    truncation are both switched on in the tokenizer call.

    Args:
        batch: Mapping with a ``'text'`` entry holding the review text(s).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    review_texts = batch['text']
    encoded = tokenizer(review_texts, padding=True, truncation=True)
    return encoded

# The map method then applies the function to each element in the dataset.
# imdb_encoded is now the dataset
imdb_encoded = imdb.map(tokenize_function, batched=True, batch_size=None)
imdb_encoded

In [None]:
print(imdb_encoded['train'][0])

# Now that I have the tokenized dataset, I can start to train my model

### Tiny IMDB

* Now that I have a tokenized dataset, I will pass it through a BERT model.

In [None]:
# This will show me the options I have for the AutoModel
[x for x in dir(transformers) if re.search(r'^AutoModel', x)]

Since this is a text classification problem, this falls under the 'AutoModelForSequenceClassification' model. What I'm doing here is adding a classification head on top of the pre-trained model with two classes. 

I will then be training this classification head as it will initially have random values. 

What's particularly helpful is that the AutoModel classes have a 'from_pretrained' method to load the weights of a pre-trained model. 

In [None]:
from transformers import AutoModelForSequenceClassification

# Prefer a CUDA device when torch reports one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The dataset has two labels (0 and 1), so the classification head gets two outputs.
num_labels = 2

# Load the checkpoint with a sequence-classification head, then move it to `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=num_labels
)
model = model.to(device)

When I'm training a model, I like to take a really small sample of the data, train with that, and see if I'm getting the output that I expect.

If I'm happy with that then I go ahead and start the training process.

In [None]:
# Build a much smaller copy of the IMDB splits, called tiny_imdb, for a quick
# end-to-end check of the training code.

from datasets import DatasetDict

# 50 training examples, plus 10 each for validation and test; every split is
# shuffled with the same seed before the first N rows are selected.
tiny_imdb = DatasetDict({
    split: imdb[split].shuffle(seed=1).select(range(size))
    for split, size in (('train', 50), ('validation', 10), ('test', 10))
})

# Encode the tiny dataset with the same tokenize function as the full one.
tiny_imdb_encoded = tiny_imdb.map(tokenize_function, batched=True, batch_size=None)
tiny_imdb_encoded

In [None]:
# The Hugging Face library makes it easy to train a model using the Trainer and TrainingArguments classes.
from transformers import Trainer, TrainingArguments

# Per-device batch size, used for both training and evaluation below.
batch_size = 8
# Number of full batches in the tiny training split, used as the logging interval
# in steps — so the loss is logged roughly once per pass over the data
# (assuming a single device).
logging_steps = len(tiny_imdb_encoded['train']) // batch_size
# Output directory name for this run, derived from the checkpoint name.
model_name = f"{checkpoint}-finetuned-tiny-imdb"

# I can specify the training parameters
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  log_level="error",
                                  optim="adamw_torch",)

# Display the resolved arguments as the cell output.
training_args 

In [None]:
# Start training the model
torch.cuda.empty_cache()

trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=tiny_imdb_encoded['train'],
                  eval_dataset=tiny_imdb_encoded['validation'],
                  tokenizer=tokenizer)

trainer.train();
# I want to see a column showing the accuracy.

In [None]:
preds = trainer.predict(tiny_imdb_encoded['test'])
preds

In [None]:
preds.predictions.shape

In [None]:
# I will now extract only the predictions component from the preds
# The output of this is a tensor. 

preds.predictions.argmax(axis=-1)
# These are all of the predictions from my model. 

In [None]:
# I also have the actual labels, I can also grab those here. 
preds.label_ids

In [None]:
# Since I have both the predictions and the actual labels, I can use-
# the accuracy_score function from scikit-learn 
from sklearn.metrics import accuracy_score

accuracy_score(preds.label_ids, preds.predictions.argmax(axis=-1))

In [None]:
# Create a function named get_accuracy
# This will return a dictionary which includes the accuracy.
def get_accuracy(preds):
    """Compute the accuracy of a set of model predictions.

    Args:
        preds: Object exposing ``predictions`` (per-class scores, with the
            classes on the last axis) and ``label_ids`` (the true labels),
            such as the result of ``trainer.predict`` or the object the
            Trainer passes to ``compute_metrics``.

    Returns:
        A dict with a single key, ``'accuracy'``.
    """
    # The predicted class is the index of the highest score on the last axis.
    predictions = preds.predictions.argmax(axis=-1)
    labels = preds.label_ids
    return {'accuracy': accuracy_score(labels, predictions)}


In [None]:
torch.cuda.empty_cache()

trainer = Trainer(model=model,
                  compute_metrics=get_accuracy,
                  args=training_args,
                  train_dataset=tiny_imdb_encoded['train'],
                  eval_dataset=tiny_imdb_encoded['validation'],
                  tokenizer=tokenizer)

trainer.train();

### Test Run

In [None]:
# Same settings as the tiny run, but sized for the full 1,600-example training split.
batch_size = 8
# Number of full batches in the training split, used as the logging interval in steps.
logging_steps = len(imdb_encoded['train']) // batch_size
# Output directory for the full run; the pipeline cell later loads the model from this name.
model_name = f"{checkpoint}-finetuned-imdb"

# I can specify the training parameters
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  log_level="error",
                                  optim="adamw_torch",)

In [None]:
# This will take 2 hours on this machine
torch.cuda.empty_cache()

# NOTE(review): if the cells are run top to bottom, `model` is the same object
# that the two tiny-IMDB training runs above were given, so this run presumably
# continues from those weights rather than from the freshly loaded checkpoint —
# confirm that is intended, or reload the model before this cell.
trainer = Trainer(model=model,
                  args=training_args,
                  compute_metrics=get_accuracy,
                  train_dataset=imdb_encoded['train'],
                  eval_dataset=imdb_encoded['validation'],
                  tokenizer=tokenizer)

# The trailing semicolon suppresses the cell's return-value output.
trainer.train();

In [None]:
# Save model
trainer.save_model()

In [None]:
# Testing out the model
from transformers import pipeline

# `model_name` is the same string that was passed as `output_dir` for the full run.
# NOTE(review): presumably `trainer.save_model()` wrote the model and tokenizer to
# that directory, so this loads the fine-tuned model from disk — confirm.
classifier = pipeline('text-classification', model=model_name)
classifier('This is not my idea of fun')

In [None]:
classifier('This was an amazing experience')