<a href="https://colab.research.google.com/github/LemurPwned/filmweb-nlp/blob/master/Transformers_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers



In [3]:
import pandas as pd 
from transformers import BertForSequenceClassification, BertTokenizer
model_name = 'bert-base-multilingual-cased'
merge = True
if merge:
  df = pd.DataFrame()
  for filename in ['reviews_for_bert.csv', 'reviews_for_bert_150_150.csv',
                   'reviews_for_bert_50_150.csv',
                   'reviews_for_bert_300_150.csv']:
    tmp = pd.read_csv(filename)
    df = pd.concat((df, tmp))
else:
  df = pd.read_csv('/content/reviews_for_bert.csv')
labels = df['rating'].unique().tolist()
if 0 not in labels:
  # labels must start from 0
  labels = [label - 1 for label in labels]

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name,
                                                      num_labels=len(labels))

100%|██████████| 995526/995526 [00:00<00:00, 5805180.59B/s]
100%|██████████| 521/521 [00:00<00:00, 263281.01B/s]
100%|██████████| 714314041/714314041 [00:13<00:00, 51588527.91B/s]


In [0]:
from transformers.data.processors.utils import InputExample
from transformers.data.processors.glue import glue_convert_examples_to_features
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
import torch.utils.data as data_utils
import numpy as np 
import torch

BATCH_SIZE = 32
MAX_SEQ_LEN = 128


def select_field(features, field):
  return [
        [
            feature[field]
        ]
        for feature in features
    ]
  
def prepare_dataset(df):
  df_sampl = df
  sents = df_sampl['content'].values
  labels = df_sampl['rating'].astype(int).values
  labels_unique = df['rating'].astype(int).unique()
  # print(labels_unique)
  # print(labels)
  # label_map = {i: i for i in range(11)}
  guid =0 
  input_examples = []
  for sent, label in zip(sents, labels):
    input_examples.append(InputExample(
        guid, text_a=sent, text_b=None, label=label
    ))
  tokenizer = BertTokenizer.from_pretrained(model_name)
  features = glue_convert_examples_to_features(
      input_examples,
      tokenizer=tokenizer,
      max_length=MAX_SEQ_LEN,
      output_mode='classification',
      label_list = labels_unique
  )


  all_input_ids = torch.tensor([feature.input_ids for feature in features], dtype=torch.long)
  all_input_mask = torch.tensor([feature.attention_mask for feature in features], dtype=torch.long)
  all_segment_ids = torch.tensor([feature.token_type_ids for feature in features], dtype=torch.long)
  all_label_ids = torch.tensor([feature.label for feature in features], dtype=torch.long)

  dataset = TensorDataset(all_input_ids, 
                          all_input_mask, 
                          all_segment_ids, 
                          all_label_ids)
  return dataset 

msk = np.random.rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]
train_datatset = prepare_dataset(train)
test_dataset = prepare_dataset(test)
train_dataloader = data_utils.DataLoader(train_datatset, 
                                         batch_size=BATCH_SIZE, 
                                         shuffle=True)
test_dataloader = data_utils.DataLoader(test_dataset, 
                                        batch_size=BATCH_SIZE, 
                                        shuffle=True)

In [0]:
from transformers import AdamW, get_linear_schedule_with_warmup

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

lr = 2e-5
warmup=0.1

max_grad_norm = 1.0
num_total_steps = 1000
num_warmup_steps = 100
warmup_proportion = float(num_warmup_steps) / float(num_total_steps)  # 0.1

# optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False,
                  # )  # To reproduce BertAdam specific behavior set correct_bias=False
# scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler
### In Transformers, optimizer and schedules are splitted and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=num_warmup_steps, 
                                            num_training_steps=num_total_steps)  # PyTorch scheduler
### and used like this:



In [9]:
from tqdm import trange 
### and used like this:
# for batch in train_data:
#     loss = model(batch)
#     loss.backward()
#     torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
#     optimizer.step()
#     scheduler.step()
#     optimizer.zero_grad()
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

EPOCHS_NUM = 4
loss_history = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)
model = BertForSequenceClassification.from_pretrained(model_name,
                                                      num_labels=len(labels))
model.cuda()

for epoch in trange(EPOCHS_NUM, desc='EPOCH'):
  # set to training mode
  model.train()
  total_loss =0
  for step, batch in enumerate(train_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_segment_ids, b_labels = batch
    outputs = model(b_input_ids, 
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels)
    loss = outputs[0]
    loss_history.append(loss.item())
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
    total_loss += loss.item()
  print(f"Total loss is {total_loss}")

  model.eval()
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps = 0
  for batch in test_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_segment_ids, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, 
                     token_type_ids=None, 
                     attention_mask=b_input_mask)[0]
    
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

EPOCH:   0%|          | 0/4 [00:00<?, ?it/s]

Total loss is 304.87768745422363


EPOCH:  25%|██▌       | 1/4 [03:07<09:22, 187.54s/it]

Validation Accuracy: 0.008787878787878787
Total loss is 304.9172909259796


EPOCH:  50%|█████     | 2/4 [06:15<06:15, 187.74s/it]

Validation Accuracy: 0.008522727272727272
Total loss is 305.08406233787537


EPOCH:  75%|███████▌  | 3/4 [09:23<03:07, 187.87s/it]

Validation Accuracy: 0.008522727272727272
Total loss is 304.8327236175537


EPOCH: 100%|██████████| 4/4 [12:32<00:00, 187.97s/it]

Validation Accuracy: 0.008522727272727272



