In [None]:
# Transformers installation
! pip install transformers datasets

In [None]:
from datasets import load_dataset

# Load the dataset registered under the name "imdb" and bind it to `imdb`.
imdb = load_dataset("imdb")
# Bare expression: the notebook renders the loaded object as this cell's output.
imdb

In [None]:
# Display the first record of the "test" split as a quick look at the data.
imdb["test"][0]

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published for the "bert-base-uncased" checkpoint.
# Later cells read this module-level `tokenizer` (preprocessing and the data collator).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Tokenizer demo: encode one sample sentence, then map the resulting ids back
# to their token strings and print them.
sample_text = "Welcome to the 🤗 Tokenizers library. This is a tutorial for students"
output = tokenizer.encode(sample_text)
print(tokenizer.convert_ids_to_tokens(output))

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with truncation enabled.

    Used as the callback for ``imdb.map(..., batched=True)`` below, so
    ``examples`` is presumably a batch of records keyed by column name
    (TODO confirm against the datasets documentation).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that input.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Apply the tokenization to the whole dataset object, in batched mode.
tokenized_imdb = imdb.map(preprocess_function, batched=True)

In [None]:
from transformers import DataCollatorWithPadding

# Collator built around `tokenizer`; return_tensors="tf" asks for TensorFlow tensors.
# NOTE(review): presumably pads each batch to a common length, as the class name
# suggests — confirm in the transformers documentation.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [None]:
# Number of examples per batch; also used when sizing the optimizer schedule.
BATCH_SIZE = 4


def _as_tf_dataset(split, shuffle):
    """Convert one split of ``tokenized_imdb`` into a batched TF dataset.

    Args:
        split: Key of the split inside ``tokenized_imdb`` (e.g. ``"train"``).
        shuffle: Forwarded unchanged to ``to_tf_dataset``.

    Returns:
        The object returned by ``to_tf_dataset`` for that split, batched with
        ``BATCH_SIZE`` and collated by ``data_collator``.
    """
    return tokenized_imdb[split].to_tf_dataset(
        columns=["attention_mask", "input_ids", "label"],
        shuffle=shuffle,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
    )


# Training data is shuffled; the validation data is not.
tf_train_set = _as_tf_dataset("train", shuffle=True)

# Note: validation is run on the dataset's "test" split.
tf_validation_set = _as_tf_dataset("test", shuffle=False)

In [None]:
from transformers import create_optimizer
import tensorflow as tf

# Size the learning-rate schedule: full batches per pass over the training
# split (floor division drops a final partial batch) times the epoch count.
batches_per_epoch = len(tokenized_imdb["train"]) // BATCH_SIZE
num_epochs = 5
total_train_steps = int(num_epochs * batches_per_epoch)

# `create_optimizer` returns the optimizer together with its schedule;
# no warm-up steps are requested.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [None]:
from transformers import TFAutoModelForSequenceClassification

# Load the "bert-base-uncased" checkpoint into a TF sequence-classification
# model configured for two labels.
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

In [None]:
# Only the optimizer is supplied; no `loss` argument is passed.
# NOTE(review): this presumably relies on the transformers TF model providing
# its own loss when none is given — confirm in the transformers documentation.
model.compile(optimizer=optimizer)

In [None]:
# Train for `num_epochs`, the same value that was used to compute
# `total_train_steps` for the optimizer's schedule. A separately hard-coded
# epoch count here would make the run end at a different step than the
# schedule was sized for.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs)

In [None]:
# Load an already published IMDb checkpoint for inference.
# This rebinds `model` and `tokenizer`: the objects created under those names
# in earlier cells are no longer reachable through them after this cell runs.
PRETRAINED_IMDB_CHECKPOINT = "textattack/bert-base-uncased-imdb"

model = TFAutoModelForSequenceClassification.from_pretrained(
    PRETRAINED_IMDB_CHECKPOINT,
    from_pt=True,  # presumably: convert from PyTorch weights — TODO confirm
)
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_IMDB_CHECKPOINT)

In [None]:
from transformers import pipeline

# Bundle the current `model` and `tokenizer` into a text-classification pipeline.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Run the pipeline on a single sentence; the result is shown as the cell output.
classifier('I really hate this game')