In [1]:
import pandas as pd
import numpy as np
import os

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset, load_metric

# os.environ['CUDA_VISIBLE_DEVICES'] = '0'

## Build Dataset

Convert the original (text, stars) pairs into `Dataset` objects.

*Note that star ratings 1-5 are mapped to labels 0-4.*

In [2]:
def load_data(split_name='train', columns=['text', 'stars'], folder='data'):
    '''
        Load one split of the dataset from "{folder}/{split_name}.csv".

        "split_name" may be set as 'train', 'valid' or 'test' to load the corresponding dataset.

        You may also specify the column names to load any columns in the .csv data file.
        Among many, "text" can be used as model input, and "stars" column is the labels (sentiment).
        If you like, you are free to use columns other than "text" for prediction.

        Returns a DataFrame restricted to "columns". If any requested column is
        missing from the file, a notice is printed and the full DataFrame (all
        columns) is returned instead.

        Errors from reading the file itself (e.g. FileNotFoundError) propagate to
        the caller rather than being reported as a column-selection failure.
    '''
    print(f"select [{', '.join(map(str, columns))}] columns from the {split_name} split")
    # Read the file once; the fallback path reuses this frame instead of re-reading it.
    df = pd.read_csv(f'{folder}/{split_name}.csv')
    try:
        selected = df.loc[:, columns]
    except KeyError:
        # Only a missing column triggers the "return everything" fallback.
        print(f"Failed loading specified columns... Returning all columns from the {split_name} split")
        return df
    print("Success")
    return selected

In [3]:
# The train and validation splits both provide the review text and its star rating.
train_df, valid_df = (
    load_data(split, columns=['text', 'stars']) for split in ('train', 'valid')
)
# The test set labels (the 'stars' column) are not available, so this request
# falls back to returning every column of the test file.
test_df = load_data('test', columns=['text', 'stars'])

select [text, stars] columns from the train split
Success
select [text, stars] columns from the valid split
Success
select [text, stars] columns from the test split
Failed loading specified columns... Returning all columns from the test split


In [4]:
# Prepare the data.
# In this example only the review text is used as model input.
x_train, y_train = train_df['text'], train_df['stars']
x_valid, y_valid = valid_df['text'], valid_df['stars']
x_test = test_df['text']

# Shift the 1-5 star ratings down by one so that labels run from 0 to 4.
x_train_processed = pd.DataFrame({
    'text': x_train,
    'label': np.array(y_train.to_list()) - 1,
})
x_valid_processed = pd.DataFrame({
    'text': x_valid,
    'label': np.array(y_valid.to_list()) - 1,
})
# Optional export of the processed frames (disabled):
# x_train_processed.to_csv('data_processed/train.csv', index=None)
# x_valid_processed.to_csv('data_processed/valid.csv', index=None)

# Wrap the pandas frames as Hugging Face Dataset objects.
train_dataset = Dataset.from_pandas(x_train_processed)
valid_dataset = Dataset.from_pandas(x_valid_processed)

In [5]:
# Peek at the first five training examples; slicing returns a dict keyed by column name.
train_dataset[:5]

{'text': ["I've been here a handful of times now and I've never been disappointed.  The food is always good and the servers are quick.   So far my two favorite items are the Peppersauce Burger with pastrami and the Peppersauce Patty.  Even as I type this my mouth is watering and I just had the Peppersauce Burger.  \n\nThe burgers are well done and still juicy!  I always leave stuffed and happy.  The burgers can be a little on the greasy side, need two or three napkins.  I've also had them when you only needed on napkin to clean up.  Either way it was still tasty!\n\nI've seen a couple of people get salads and they are huge and look good.\n\nThe servers have always been friendly even when it was really busy.",
  'The service was terrible. The food was just ok. Dessert was the best part of the whole experience.',
  'Alil pricey for the location but completly get the bang for your buck sweet fries on point 100%',
  "Don't get your car washed here. Paid 11 and my car came out covered in so

## Load the model

In this pipeline, I use BERT models from Hugging Face for the text classification task.

In [6]:
# Load the cased BERT tokenizer and attach a 5-way classification head
# (one output per star rating) to the pretrained encoder.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=5,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [7]:
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the notebook-level tokenizer.

    Padding uses the "max_length" strategy and truncation is enabled; the
    tokenizer's output is returned unchanged.
    """
    texts = examples['text']
    return tokenizer(texts, padding="max_length", truncation=True)


In [8]:
def _tokenize_for_torch(dataset):
    """Tokenize a text/label Dataset and prepare it for a PyTorch DataLoader.

    Applies tokenize_function in batches, drops the raw 'text' column, renames
    'label' to 'labels' (the keyword the model's forward pass is called with
    later in this notebook), and switches the output format to torch tensors.
    """
    tokenized = dataset.map(tokenize_function, batched=True)
    tokenized = tokenized.remove_columns(['text'])
    tokenized = tokenized.rename_column("label", "labels")
    # set_format works in place, so it is called on the final object.
    tokenized.set_format('torch')
    return tokenized


# Same preprocessing for both splits, defined once instead of copy-pasted.
train_dataset_tokenized = _tokenize_for_torch(train_dataset)
valid_dataset_tokenized = _tokenize_for_torch(valid_dataset)


HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))

2022-03-23 21:12:00.022996: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0





HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




## Fine-Tune with PyTorch

In [9]:
## Sub-sample a smaller dataset
# NOTE: the DataLoaders later in this notebook are built from the full
# tokenized datasets, not from these subsets.

small_train_dataset = (
    train_dataset_tokenized
    .shuffle(seed=42)
    .select(range(500))
)
small_valid_dataset = (
    valid_dataset_tokenized
    .shuffle(seed=42)
    .select(range(200))
)

In [10]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

In [11]:
# del model
# del pytorch_model
# del trainer
# torch.cuda.empty_cache()

In [12]:
# Batches of 8 examples; only the training loader shuffles its data.
train_dataloader = DataLoader(
    dataset=train_dataset_tokenized,
    batch_size=8,
    shuffle=True,
)
valid_dataloader = DataLoader(dataset=valid_dataset_tokenized, batch_size=8)

In [13]:
# AdamW over every model parameter with a 5e-5 learning rate.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [14]:
# Use the first GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [15]:
num_epochs = 10
# The schedule spans every training batch of every epoch.
num_training_steps = len(train_dataloader) * num_epochs
# Linear schedule with no warmup steps.
lr_scheduler = get_scheduler(
    name='linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [16]:
# Fine-tuning loop: each epoch runs one training pass over train_dataloader,
# then one evaluation pass over valid_dataloader that reports accuracy.
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    # Running sums over the current epoch: correct predictions, summed
    # per-example loss, and number of examples seen.
    total_acc, total_loss, total_count = 0, 0, 0
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        batch_size = batch['labels'].size(0)
        total_acc += (predictions == batch['labels']).sum().item()
        # outputs.loss is already averaged over the batch, so weight it by the
        # batch size; otherwise total_loss / total_count under-reports the
        # per-example loss by roughly a factor of the batch size.
        total_loss += loss.item() * batch_size
        total_count += batch_size

        progress_bar.update(1)
        progress_bar.set_postfix({'epoch':epoch,
            'loss': total_loss/total_count,
            'acc': total_acc/total_count})

    # ---- validation pass ----
    # A fresh metric object per epoch keeps each epoch's accuracy independent.
    metric = load_metric("accuracy")
    model.eval()
    validation_progress_bar = tqdm(range(len(valid_dataloader)))
    for batch in valid_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
        validation_progress_bar.update(1)
    validation_progress_bar.set_postfix({'valid_accurarcy':metric.compute()['accuracy']})



HBox(children=(FloatProgress(value=0.0, max=22500.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1362.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

In [17]:
# Final accuracy of the fine-tuned model on the validation set.
metric = load_metric("accuracy")
model.eval()
# Gradient tracking is switched off for the whole evaluation pass.
with torch.no_grad():
    for batch in valid_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.6875}

In [19]:
# torch.save writes to the given path as-is and fails if the parent folder is
# missing, so make sure the checkpoint directory exists first.
os.makedirs("model-checkpoint", exist_ok=True)
# NOTE: this pickles the whole model object. Loading it back with torch.load
# unpickles arbitrary objects, so only load this file from a trusted source.
torch.save(model, "model-checkpoint/BERT_TORCH.pkl")
# Also store the model in the Hugging Face format under its own directory.
model.save_pretrained("model-checkpoint/huggingface-bert-base-cased")