In [1]:
import pandas as pd
import numpy as np
import os

from transformers import BartTokenizer, BartForSequenceClassification, BartModel, AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
from datasets import load_dataset, Dataset, load_metric

# os.environ['CUDA_VISIBLE_DEVICES'] = '0'

## Build Dataset

Convert the original (text, stars) pairs into `Dataset` objects.

*Note that stars 1-5 are mapped to labels 0-4.*

In [2]:
def load_data(split_name='train', columns=['text', 'stars'], folder='data'):
    '''
        Load one split of the dataset from "{folder}/{split_name}.csv".

        "split_name" may be set as 'train', 'valid' or 'test' to load the corresponding dataset.

        You may also specify the column names to load any columns in the .csv data file.
        Among many, "text" can be used as model input, and "stars" column is the labels (sentiment).
        If you like, you are free to use columns other than "text" for prediction.

        Parameters:
            split_name: name of the split; also the stem of the .csv file that is read.
            columns: sequence of column names to keep (never mutated here).
            folder: directory that contains the .csv files.

        Returns:
            A DataFrame holding only "columns" when every requested column exists in
            the file; otherwise the full DataFrame with all of its columns.

        Raises:
            Whatever pd.read_csv raises for an unreadable file (e.g. FileNotFoundError).
            Only the "requested column is missing" case triggers the fallback.
    '''
    print(f"select [{', '.join(columns)}] columns from the {split_name} split")
    # Read the file exactly once; both the success and the fallback path reuse it.
    df = pd.read_csv(f'{folder}/{split_name}.csv')

    # Check the columns explicitly instead of catching every exception, so that
    # unrelated failures (missing file, parse errors, typos) are not hidden.
    missing = [col for col in columns if col not in df.columns]
    if missing:
        print(f"Failed loading specified columns... Returning all columns from the {split_name} split")
        return df

    # list(...) so that a tuple of names is treated as a list of labels by .loc.
    df = df.loc[:, list(columns)]
    print("Success")
    return df

In [3]:
# Request the same two columns from every split. The test split ships without
# labels (no 'stars' column), so for it load_data falls back to returning all
# of the file's columns.
train_df, valid_df, test_df = (
    load_data(split, columns=['text', 'stars'])
    for split in ('train', 'valid', 'test')
)

select [text, stars] columns from the train split
Success
select [text, stars] columns from the valid split
Success
select [text, stars] columns from the test split
Failed loading specified columns... Returning all columns from the test split


In [4]:
# Prepare the data.
# As an example, only the review text is used as model input.
x_train, y_train = train_df['text'], train_df['stars']
x_valid, y_valid = valid_df['text'], valid_df['stars']
x_test = test_df['text']

# Shift the star ratings down by one so that stars 1-5 become labels 0-4.
train_labels = np.asarray(y_train.tolist()) - 1
valid_labels = np.asarray(y_valid.tolist()) - 1

x_train_processed = pd.DataFrame({'text': x_train, 'label': train_labels})
x_valid_processed = pd.DataFrame({'text': x_valid, 'label': valid_labels})

# Optional: persist the processed frames.
# x_train_processed.to_csv('data_processed/train.csv', index=None)
# x_valid_processed.to_csv('data_processed/valid.csv', index=None)

# Wrap the frames as HuggingFace Dataset objects.
train_dataset = Dataset.from_pandas(x_train_processed)
valid_dataset = Dataset.from_pandas(x_valid_processed)

In [5]:
# Peek at the first five training rows; slicing returns a dict keyed by column name.
train_dataset[:5]

{'text': ["I've been here a handful of times now and I've never been disappointed.  The food is always good and the servers are quick.   So far my two favorite items are the Peppersauce Burger with pastrami and the Peppersauce Patty.  Even as I type this my mouth is watering and I just had the Peppersauce Burger.  \n\nThe burgers are well done and still juicy!  I always leave stuffed and happy.  The burgers can be a little on the greasy side, need two or three napkins.  I've also had them when you only needed on napkin to clean up.  Either way it was still tasty!\n\nI've seen a couple of people get salads and they are huge and look good.\n\nThe servers have always been friendly even when it was really busy.",
  'The service was terrible. The food was just ok. Dessert was the best part of the whole experience.',
  'Alil pricey for the location but completly get the bang for your buck sweet fries on point 100%',
  "Don't get your car washed here. Paid 11 and my car came out covered in so

## Load the model

In this pipeline, I use a BART model from HuggingFace for the text classification task.

The code below loads the tokenizer and tokenizes the dataset.

In [6]:
# Load the pretrained tokenizer that belongs to the "facebook/bart-base" checkpoint.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
# model = BartForSequenceClassification.from_pretrained("facebook/bart-large")


In [7]:
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Sequences are padded with padding="max_length" and truncated when too
    long, so every encoded example comes out with the same length.
    Relies on the module-level `tokenizer`.
    """
    texts = examples['text']
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded


In [8]:
def _tokenize_split(dataset):
    """Tokenize one split and expose it as torch tensors.

    Steps: batched tokenization, drop the raw 'text' column, rename
    'label' to 'labels' (the keyword the model's forward() accepts), and
    switch the output format to 'torch'.
    """
    tokenized = dataset.map(tokenize_function, batched=True)
    tokenized = tokenized.remove_columns(['text'])
    tokenized = tokenized.rename_column("label", "labels")
    tokenized.set_format('torch')
    return tokenized


train_dataset_tokenized = _tokenize_split(train_dataset)
valid_dataset_tokenized = _tokenize_split(valid_dataset)


HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))

2022-03-25 23:57:26.073939: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0





HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




## Define the model

In [9]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

In [10]:
class CustomModel(torch.nn.Module):
    """Pretrained BART body with a dropout + linear classification head.

    Each sequence is classified from the hidden state at position 0 of the
    body's first output (its last hidden state).

    Parameters:
        num_labels: number of target classes (size of the logits' last axis).
        checkpoint: HuggingFace checkpoint name or path for the body;
            defaults to "facebook/bart-base" when None.
    """

    def __init__(self, num_labels=5, checkpoint=None):
        super().__init__()
        self.num_labels = num_labels

        # Load the model body from the given checkpoint.
        if checkpoint is None:
            checkpoint = "facebook/bart-base"
        self.model = BartModel.from_pretrained(
            checkpoint,
            config=AutoConfig.from_pretrained(
                checkpoint, output_attentions=True, output_hidden_states=True),
        )
        self.dropout = torch.nn.Dropout(0.1)
        # Size the head from the loaded config instead of hard-coding 768, so
        # checkpoints with another width (e.g. bart-large) work as well.
        # The head is freshly initialised; only the body is pretrained.
        self.classifier = torch.nn.Linear(self.model.config.hidden_size, num_labels)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the body and the head.

        Returns a TokenClassifierOutput with:
            loss: cross-entropy against `labels`, or None when no labels are given.
            logits: one row of `num_labels` scores per sequence.
            hidden_states: the body's last hidden state (outputs[0]), i.e. a
                single tensor rather than a tuple of per-layer states.
        """
        # Extract outputs from the body.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # outputs[0] = last hidden state; regularise it before the head.
        sequence_output = self.dropout(outputs[0])

        # Classify from position 0 only; the slice is already (batch, hidden).
        logits = self.classifier(sequence_output[:, 0, :])

        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs[0])

In [11]:
# Build the classifier with its defaults: 5 labels on top of "facebook/bart-base".
model = CustomModel()

## Fine-Tune with pytorch

In [12]:
## Sub-sample a smaller dataset
# Shuffle each split with a fixed seed, then keep the first 500 training rows
# and the first 200 validation rows.
small_train_dataset, small_valid_dataset = (
    split.shuffle(seed=42).select(range(n_rows))
    for split, n_rows in (
        (train_dataset_tokenized, 500),
        (valid_dataset_tokenized, 200),
    )
)

In [13]:
# del model
# del pytorch_model
# del trainer
# torch.cuda.empty_cache()

In [14]:
# Batches of two examples; only the training loader reshuffles its data.
train_dataloader = DataLoader(
    small_train_dataset,
    batch_size=2,
    shuffle=True,
)
valid_dataloader = DataLoader(
    small_valid_dataset,
    batch_size=2,
)

In [15]:
# AdamW over every parameter of the model (body and classification head), lr 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [16]:
# Choose the compute device. The second GPU ('cuda:1') stays the preferred
# choice, but it only exists on multi-GPU machines; fall back to 'cuda:0' on
# single-GPU machines and to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f'cuda:{gpu_index}')
else:
    device = torch.device('cpu')
# device = torch.device('cpu')
model.to(device)

CustomModel(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): BartEncoderL

In [17]:
# Linear learning-rate decay without warm-up, stepped once per training batch
# for the whole run.
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [18]:
# Fine-tune for `num_epochs` epochs; after each epoch, score the model on the
# validation loader and show the accuracy on a per-epoch progress bar.
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_acc, total_loss, total_count = 0, 0.0, 0
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        batch_size = batch['labels'].size(0)

        total_acc += (predictions == batch['labels']).sum().item()
        # The model's loss comes from CrossEntropyLoss with its default (mean)
        # reduction, i.e. it is already averaged over the batch. Weight it by
        # the batch size so total_loss / total_count is a true per-sample mean
        # rather than being understated by a factor of the batch size.
        total_loss += loss.item() * batch_size
        total_count += batch_size

        progress_bar.update(1)
        progress_bar.set_postfix({'epoch': epoch,
            'loss': total_loss/total_count,
            'acc': total_acc/total_count})

    # ---- validation pass ----
    metric = load_metric("accuracy")
    model.eval()
    validation_progress_bar = tqdm(range(len(valid_dataloader)))
    for batch in valid_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
        validation_progress_bar.update(1)
    validation_progress_bar.set_postfix({'valid_accurarcy':metric.compute()['accuracy']})
    # One bar is created per epoch; close it so finished bars do not linger.
    validation_progress_bar.close()

progress_bar.close()



HBox(children=(FloatProgress(value=0.0, max=750.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

In [19]:
# Score the fine-tuned model once more on the validation loader.
metric = load_metric("accuracy")
model.eval()
with torch.no_grad():  # inference only, no gradients needed
    for batch in valid_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(references=batch["labels"], predictions=predictions)

metric.compute()

{'accuracy': 0.64}

In [None]:
# Persist the fine-tuned model.
# torch.save does not create missing directories, so make sure the target exists.
os.makedirs("model-checkpoint", exist_ok=True)

# Whole-module pickle (body + classification head). Loading it back with
# torch.load requires the CustomModel class to be defined in the loading session.
torch.save(model, "model-checkpoint/BERT_DISTILLED_TORCH.pkl")

# CustomModel is a plain torch.nn.Module and has no save_pretrained(); only the
# wrapped BartModel body (model.model) does. This export therefore holds the
# body alone -- the classification head lives only in the pickle above.
model.model.save_pretrained("model-checkpoint/huggingface-bert-base-distilled-uncased")

In [20]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [21]:
# Collect, batch by batch, the true labels and the predicted labels of the
# validation loader as NumPy arrays (one array per batch in each list).
model.eval()
y_valid_labels, y_pred_labels = [], []
with torch.no_grad():  # inference only, no gradients needed
    for batch in valid_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        logits = outputs.logits
        predictions = logits.argmax(dim=-1).cpu().numpy()
        y_valid_labels.append(batch['labels'].cpu().numpy())
        y_pred_labels.append(predictions)


In [22]:
def _flatten_batches(batches):
    """Join per-batch label arrays into one 1-D array.

    Concatenation (instead of stacking with np.array) also works when the
    final batch is shorter than the others. np.atleast_1d keeps the cell
    re-runnable on an already flattened array, and an empty input yields an
    empty array.
    """
    arrays = [np.atleast_1d(chunk) for chunk in batches]
    return np.concatenate(arrays) if arrays else np.array([])


y_valid_labels = _flatten_batches(y_valid_labels)
y_pred_labels = _flatten_batches(y_pred_labels)

In [23]:
# Per-class precision / recall / F1 on the validation subset.
print(classification_report(y_valid_labels, y_pred_labels))
print('\n\n')
# Rows are true labels, columns are predicted labels.
print(confusion_matrix(y_valid_labels, y_pred_labels))
# Fraction of predictions that match the true label.
print('accuracy', (y_valid_labels == y_pred_labels).mean())


              precision    recall  f1-score   support

           0       0.86      0.80      0.83        30
           1       0.27      0.24      0.25        17
           2       0.44      0.65      0.53        23
           3       0.50      0.53      0.52        43
           4       0.81      0.71      0.76        87

    accuracy                           0.64       200
   macro avg       0.57      0.59      0.58       200
weighted avg       0.66      0.64      0.65       200




[[24  4  1  1  0]
 [ 3  4  8  2  0]
 [ 0  6 15  1  1]
 [ 1  1  4 23 14]
 [ 0  0  6 19 62]]
accuracy 0.64
