In [1]:
import pandas as pd
import numpy as np
import os

from transformers import AutoTokenizer
from datasets import load_dataset, Dataset

# os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
def load_data(split_name='train', columns=['text', 'stars'], folder='data'):
    '''
        Load one split of the dataset from "{folder}/{split_name}.csv".

        "split_name" may be set as 'train', 'valid' or 'test' to load the corresponding dataset.

        You may also specify the column names to load any columns in the .csv data file.
        Among many, "text" can be used as model input, and "stars" column is the labels (sentiment).
        If you like, you are free to use columns other than "text" for prediction.

        Returns a DataFrame restricted to "columns" when every requested column exists
        in the file. When at least one is missing (e.g. the test split has no "stars"),
        a notice is printed and the whole file is returned with all of its columns.

        Errors while reading the file itself (missing file, parse errors, ...) are
        not swallowed; they propagate to the caller.
    '''
    # The default list is only read, never mutated, so sharing it across calls is safe.
    if isinstance(columns, str):
        wanted, selector = [columns], columns
    else:
        wanted = list(columns)
        selector = wanted

    print(f"select [{', '.join(map(str, wanted))}] columns from the {split_name} split")

    # Read the file exactly once; the fallback path reuses this frame.
    df = pd.read_csv(f'{folder}/{split_name}.csv')

    # Explicit membership check instead of a bare `except:` around the selection.
    missing = [name for name in wanted if name not in df.columns]
    if missing:
        print(f"Failed loading specified columns... Returning all columns from the {split_name} split")
        return df

    print("Success")
    return df.loc[:, selector]

In [3]:
# Load all three splits with the same column request, in train/valid/test order.
# The test set labels (the 'stars' column) are not available, so for that split
# load_data falls back to returning every column of the file.
train_df, valid_df, test_df = (
    load_data(split, columns=['text', 'stars'])
    for split in ('train', 'valid', 'test')
)

select [text, stars] columns from the train split
Success
select [text, stars] columns from the valid split
Success
select [text, stars] columns from the test split
Failed loading specified columns... Returning all columns from the test split


In [4]:
# Prepare the data.
# As an example, we only use the text data.
x_train = train_df['text']
y_train = train_df['stars']

x_valid = valid_df['text']
y_valid = valid_df['stars']

x_test = test_df['text']

# 'stars' takes the values 1..5, but the 5-way classification heads created
# later in this notebook (num_labels=5) expect class ids 0..4. Shift the labels
# down by one here; add 1 back to a predicted class id to recover the rating.
x_train_processed = pd.DataFrame({'text': x_train, 'label': y_train - 1})
x_valid_processed = pd.DataFrame({'text': x_valid, 'label': y_valid - 1})
# x_train_processed.to_csv('data_processed/train.csv', index=None)
# x_valid_processed.to_csv('data_processed/valid.csv', index=None)
train_dataset = Dataset.from_pandas(x_train_processed)
valid_dataset = Dataset.from_pandas(x_valid_processed)

In [5]:
import numpy as np

# Distinct star ratings that occur in the validation labels.
np.unique(y_valid.to_numpy())

array([1, 2, 3, 4, 5])

In [6]:
# data_files = {
#     'train': 'data_processed/train.csv',
#     'valid': 'data_processed/valid.csv'
# }
# dataset = load_dataset('csv', data_files=data_files)


In [7]:
# Peek at the first five training examples; slicing the dataset yields a dict
# mapping each column name to a list of values.
train_dataset[:5]

{'text': ["I've been here a handful of times now and I've never been disappointed.  The food is always good and the servers are quick.   So far my two favorite items are the Peppersauce Burger with pastrami and the Peppersauce Patty.  Even as I type this my mouth is watering and I just had the Peppersauce Burger.  \n\nThe burgers are well done and still juicy!  I always leave stuffed and happy.  The burgers can be a little on the greasy side, need two or three napkins.  I've also had them when you only needed on napkin to clean up.  Either way it was still tasty!\n\nI've seen a couple of people get salads and they are huge and look good.\n\nThe servers have always been friendly even when it was really busy.",
  'The service was terrible. The food was just ok. Dessert was the best part of the whole experience.',
  'Alil pricey for the location but completly get the bang for your buck sweet fries on point 100%',
  "Don't get your car washed here. Paid 11 and my car came out covered in so

## Start to Load the model

In this pipeline, I use BERT models from Hugging Face to do the text classification task.

In [8]:
def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch with the notebook-level ``tokenizer``.

    The tokenizer is called with ``padding="max_length"`` and ``truncation=True``
    and its result is returned unchanged. ``tokenizer`` is looked up at call
    time, so it must be defined before this function is first used.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


In [11]:
# Load the pretrained "bert-base-cased" tokenizer used by tokenize_function.
# The recorded run of this cell failed with an HTTP 500 while fetching vocab.txt
# from huggingface.co, so re-run it if the download fails.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

HTTPError: 500 Server Error: Internal Server Error for url: https://huggingface.co/bert-base-cased/resolve/main/vocab.txt

## Fine-Tune with pytorch

In [None]:
# Training split: tokenize, drop the raw text, and expose the target column
# under the name "labels".
train_dataset_tokenized = (
    train_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(['text'])
    .rename_column("label", "labels")
)
# set_format is called for its side effect on the dataset object.
train_dataset_tokenized.set_format('torch')

# Validation split: same steps as above.
valid_dataset_tokenized = (
    valid_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(['text'])
    .rename_column("label", "labels")
)
valid_dataset_tokenized.set_format('torch')


In [None]:
# Fixed-seed shuffles followed by a prefix selection: 500 training and
# 200 validation examples for the PyTorch loop.
small_train_dataset = (
    train_dataset_tokenized
    .shuffle(seed=42)
    .select(range(500))
)
small_valid_dataset = (
    valid_dataset_tokenized
    .shuffle(seed=42)
    .select(range(200))
)

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
from transformers import get_scheduler


In [None]:
# del model
# del pytorch_model
# del trainer
# torch.cuda.empty_cache()

In [None]:
# Batches of 4; only the training loader reshuffles its data.
train_dataloader = DataLoader(
    dataset=small_train_dataset,
    batch_size=4,
    shuffle=True,
)
valid_dataloader = DataLoader(
    dataset=small_valid_dataset,
    batch_size=4,
)

In [None]:
# BERT with a 5-way sequence-classification head, optimised with AdamW.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=5,
)
optimizer = AdamW(params=model.parameters(), lr=5e-5)


In [None]:
num_epochs = 3
# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch.
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    name='linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


In [None]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
from tqdm.auto import tqdm

In [None]:
# Manual fine-tuning loop; the progress bar advances once per epoch.
model.train()
for epoch in tqdm(range(num_epochs)):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the loss is read from the model output.
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass, then parameter update, schedule update and gradient
        # reset, in that order.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

In [None]:
# Number of examples in the tokenized training set.
len(train_dataset_tokenized)

In [None]:
# Print only the first batch produced by the training loader, then stop.
for batch in train_dataloader:
    print(batch)
    break

## Fine Tune with Trainer

In [None]:
from datasets import load_metric
import numpy as np
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import Trainer

# Fresh 5-class BERT classifier for the Trainer-based run; this rebinds the
# notebook-level name `model`.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=5)

# `training_args` is created once, at the end of this cell. The bare
# TrainingArguments(output_dir="test_trainer") that used to sit here was
# overwritten before anything read it, so it has been dropped.

# Accuracy metric consumed by compute_metrics.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package - confirm that the
# installed version still provides it.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Score an evaluation pass with the notebook-level ``metric``.

    ``eval_pred`` is unpacked into (logits, labels). The predicted class is the
    argmax over the last axis of the logits, and the result of
    ``metric.compute`` on predictions vs. labels is returned as-is.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)


# Trainer configuration: outputs go to "test_trainer" and evaluation runs at
# the end of each epoch.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
)


In [None]:
# Re-tokenize from the raw datasets. Unlike the PyTorch section, the 'text'
# and 'label' columns are left in place here.
train_dataset_tokenized = train_dataset.map(tokenize_function, batched=True)
valid_dataset_tokenized = valid_dataset.map(tokenize_function, batched=True)

# Fixed-seed subsets: 1000 training and 200 validation examples.
small_train_dataset = (
    train_dataset_tokenized
    .shuffle(seed=42)
    .select(range(1000))
)
small_valid_dataset = (
    valid_dataset_tokenized
    .shuffle(seed=42)
    .select(range(200))
)


In [None]:
# Wire the model, configuration, data subsets and metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_valid_dataset,
)


In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()


## Fine Tune with TF

In [None]:
# Re-tokenize from the raw datasets for the TensorFlow run; the 'text' and
# 'label' columns are left in place.
train_dataset_tokenized = train_dataset.map(tokenize_function, batched=True)
valid_dataset_tokenized = valid_dataset.map(tokenize_function, batched=True)

# Fixed-seed subsets: 1000 training and 200 validation examples.
small_train_dataset = (
    train_dataset_tokenized
    .shuffle(seed=42)
    .select(range(1000))
)
small_valid_dataset = (
    valid_dataset_tokenized
    .shuffle(seed=42)
    .select(range(200))
)

In [None]:
from transformers import DefaultDataCollator

# Collator configured to return TensorFlow tensors; passed as collate_fn when
# the datasets are converted with to_tf_dataset.
data_collator = DefaultDataCollator(return_tensors="tf")


In [None]:
# Display the summary of the training subset.
small_train_dataset

In [None]:
# Settings shared by both conversions: model input columns, label column,
# collator and batch size. Only the shuffle flag differs between the splits.
_tf_dataset_options = dict(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    collate_fn=data_collator,
    batch_size=2,
)

tf_train_dataset = small_train_dataset.to_tf_dataset(
    shuffle=True, **_tf_dataset_options)

tf_validation_dataset = small_valid_dataset.to_tf_dataset(
    shuffle=False, **_tf_dataset_options)

In [None]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

# TensorFlow flavour of the 5-class BERT classifier; this rebinds the
# notebook-level name `model`.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=5,
)


In [None]:
# Compile with Adam, a sparse categorical cross-entropy computed on raw logits
# (from_logits=True), and sparse categorical accuracy as the reported metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

# Train for three epochs, validating on the validation subset.
model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3)


In [None]:
# Sample code from the Hugging Face website: paraphrase detection with a BERT
# checkpoint fine-tuned on MRPC. It rebinds `tokenizer` and `model`.

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")

classes = ["not paraphrase", "is paraphrase"]

sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

# The tokenizer automatically adds the model-specific special tokens
# (i.e. [CLS] and [SEP]) to the sequence pair and computes the attention masks.
paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt")
not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt")

paraphrase_classification_logits = model(**paraphrase).logits
not_paraphrase_classification_logits = model(**not_paraphrase).logits

# Softmax over the class dimension, then take the single row of the batch.
paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]

# Should be paraphrase
for class_name, probability in zip(classes, paraphrase_results):
    print(f"{class_name}: {int(round(probability * 100))}%")

# Should not be paraphrase
for class_name, probability in zip(classes, not_paraphrase_results):
    print(f"{class_name}: {int(round(probability * 100))}%")

In [None]:
# Display the summary of the training subset.
small_train_dataset

In [None]:
# Label of the first tokenized training example.
# By this point `train_dataset_tokenized` has been rebuilt by the Trainer / TF
# sections without the "label" -> "labels" rename, so the column is "label";
# indexing with "labels" raises a KeyError when the notebook is run top to bottom.
train_dataset_tokenized[0]['label']