<a href="https://colab.research.google.com/github/LnG-a/Machine_Learning_Group_TLA/blob/main/food_review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive
import os

# Mount Google Drive first: the project folder used below lives inside it.
drive.mount('/content/drive/')

# Base folder for the notebook. Later cells build file paths from `path`,
# and relative shell commands run from this directory after the chdir.
path = '/content/drive/My Drive/BERT/'
os.chdir(path)

Mounted at /content/drive/


In [4]:
# Python packages imported by later cells.
# NOTE(review): none of these are version-pinned, so a fresh runtime may
# resolve different releases from one run to the next.
!pip install transformers
!pip install fastBPE
!pip install fairseq
!pip install vncorenlp

# VnCoreNLP jar plus its word-segmenter model files, moved into the
# vncorenlp/models/wordsegmenter layout created by the mkdir below.
!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/ 
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/

# PhoBERT-base archive, downloaded and unpacked in the current directory.
!wget https://public.vinai.io/PhoBERT_base_transformers.tar.gz
!tar -xzvf PhoBERT_base_transformers.tar.gz


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 34.4 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 75.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 50.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fastBPE
  Downloading fastBPE-0.1.0.tar.gz (35 kB)
Building wheels for collected packages: fastBPE
  Building wheel for f

In [5]:
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
import argparse

# fastBPE is constructed from an argparse namespace, so a one-option parser
# is used to produce it. parse_known_args() returns any command-line
# arguments the parser does not declare in `unknown` instead of failing.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--bpe-codes',
    type=str,
    required=False,
    default=path+"/PhoBERT_base_transformers/bpe.codes",
    help='path to fastBPE BPE',
)
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)

# Vocabulary: start from an empty Dictionary and fill it from dict.txt.
vocab = Dictionary()
vocab.add_from_file(path+"/PhoBERT_base_transformers/dict.txt")

In [6]:
from vncorenlp import VnCoreNLP

# Only the word-segmentation annotator ("wseg") is requested; the heap
# option is passed through as '-Xmx500m'.
rdrsegmenter = VnCoreNLP(
    path+"/vncorenlp/VnCoreNLP-1.1.1.jar",
    annotators="wseg",
    max_heap_size='-Xmx500m',
)

# Lower-cased chat abbreviations / loan words and the replacement text used
# for each.  Built once instead of on every call to checkWords().
_ABBREVIATIONS = {
    "k": "không",
    "ko": "không",
    "j": "gì",
    "đc": "được",
    "dc": "được",
    "ntn": "như thế nào",  # was misspelled "như thê nào"
    "ok": "ổn",
    "ncl": "nói chung là",
    "mn": "mọi người",
    "mng": "mọi người",
    "vs": "với",
    "cx": "cũng",
    "bt": "bình thường",
    "bth": "bình thường",
    "nv": "nhân viên",
    "recommend": "gợi ý tốt",
    "mk": "mình",
}


def checkWords(argument):
    """Expand a known abbreviation, or return the token unchanged.

    Args:
        argument: A single token. Lookup is exact and case-sensitive, so
            callers are expected to lower-case the token first.

    Returns:
        The replacement text if ``argument`` is a known abbreviation,
        otherwise ``argument`` itself.
    """
    return _ABBREVIATIONS.get(argument, argument)

def standardizeData(comment):
    """Normalise one raw review into a single cleaned, space-separated string.

    The text is segmented with ``rdrsegmenter.tokenize`` (a list of sentences,
    each a list of tokens), every token is lower-cased and passed through
    ``checkWords``, all sentences are joined with single spaces, and finally
    a fixed list of punctuation marks is removed (the ``:))`` emoticon is
    replaced by the word "cười").

    Args:
        comment: Raw review text.

    Returns:
        The normalised text as one string.
    """
    segmented = rdrsegmenter.tokenize(comment)

    cleaned_sentences = [
        ' '.join(checkWords(token.lower()) for token in sentence)
        for sentence in segmented
    ]
    text = ' '.join(cleaned_sentences)

    # Applied strictly in this order: ":))" has to be rewritten before the
    # bare ":" is stripped at the very end.
    substitutions = (
        (",", ""), (".", ""),
        (";", ""), ("“", ""),
        (":))", "cười"), ("”", ""),
        ('"', ""), ("'", ""),
        ("!", ""), ("?", ""),
        ("-", ""), (":", ""),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    # Expansion of "+", "^ ^" and a few emoji into sentiment words was
    # previously sketched here and is currently disabled.
    return text

In [7]:
import csv
# import re

train_path = path+'/full_train.csv'
test_path = path+'/test.csv'


def _read_reviews(csv_path, with_labels):
    """Read one review CSV and return ``(ids, texts, labels)``.

    Column 1 is taken as the review id, column 3 as the comment text and,
    when ``with_labels`` is true, column 5 as the integer label.

    Args:
        csv_path: Path of the CSV file to read.
        with_labels: Whether column 5 should be parsed as an ``int`` label.

    Returns:
        Three parallel lists; ``labels`` stays empty when ``with_labels``
        is false.

    Raises:
        IndexError: If a non-empty row has fewer columns than required.
        ValueError: If a label cannot be parsed as an integer.
    """
    ids, texts, labels = [], [], []
    # newline='' is what the csv module requires so that line breaks inside
    # quoted fields are parsed correctly; the encoding is explicit so the
    # Vietnamese text does not depend on the platform default.
    with open(csv_path, newline='', encoding='utf-8') as f:
        for row in csv.reader(f):
            if not row:
                # Blank line in the file: nothing to read.
                continue
            if row[3] == 'Comment':
                # Header row.
                continue
            ids.append(row[1])
            texts.append(standardizeData(row[3]))
            if with_labels:
                labels.append(int(row[5]))
    return ids, texts, labels


train_id, train_text, train_labels = _read_reviews(train_path, with_labels=True)
test_id, test_text = _read_reviews(test_path, with_labels=False)[:2]
        

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled reviews for validation.  A fixed random_state
# makes the split (and therefore the reported validation scores) repeatable.
# NOTE(review): this cell overwrites `train_labels` with the training subset,
# so it cannot be re-run on its own without reloading the CSV data first.
train_sents, val_sents, train_labels, val_labels = train_test_split(
    train_text, train_labels, test_size=0.1, random_state=42
)

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_LEN = 256

# Each sentence is BPE-encoded, wrapped as "<s> ... </s>" and mapped to
# vocabulary ids (tokens missing from the vocabulary are not added to it).
# NOTE(review): the string already ends with " </s>" and append_eos=True is
# passed as well - confirm whether a second end-of-sentence id is intended.
train_ids = [
    vocab.encode_line(
        '<s> ' + bpe.encode(sent) + ' </s>',
        append_eos=True,
        add_if_not_exist=False,
    ).long().tolist()
    for sent in train_sents
]

val_ids = [
    vocab.encode_line(
        '<s> ' + bpe.encode(sent) + ' </s>',
        append_eos=True,
        add_if_not_exist=False,
    ).long().tolist()
    for sent in val_sents
]

# Right-pad with 0 / right-truncate every sequence to exactly MAX_LEN ids.
# NOTE(review): padding id is 0 - confirm it matches the vocabulary's pad index.
train_ids = pad_sequences(train_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
val_ids = pad_sequences(val_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")


In [10]:
# Attention masks: 1 where the id is greater than 0, 0 elsewhere (the
# sequences were padded with 0).
# NOTE(review): a genuine token whose vocabulary id is 0 would be masked out
# too - confirm that id 0 is reserved for padding in this vocabulary.
train_masks = [[int(token_id > 0) for token_id in sent] for sent in train_ids]

val_masks = [[int(token_id > 0) for token_id in sent] for sent in val_ids]

In [11]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch

# Convert ids, masks and labels to tensors.
train_inputs = torch.tensor(train_ids)
val_inputs = torch.tensor(val_ids)
train_labels = torch.tensor(train_labels)

val_labels = torch.tensor(val_labels)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)

# Training batches are drawn in a fresh random order every epoch; with a
# sequential sampler the model would see identical batches in an identical
# order on each pass over the data.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

# Validation order does not affect the metrics, so it stays sequential.
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=32)

In [12]:
# Encode the test sentences exactly like the training sentences:
# "<s> " + BPE pieces + " </s>", mapped to ids, then padded/truncated.
test_ids = [
    vocab.encode_line(
        '<s> ' + bpe.encode(sent) + ' </s>',
        append_eos=True,
        add_if_not_exist=False,
    ).long().tolist()
    for sent in test_text
]
test_ids = pad_sequences(test_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

# 1 where the id is greater than 0, 0 on the zero padding.
test_masks = [[int(token_id > 0) for token_id in sent] for sent in test_ids]

# The test set has no labels; an all-zero placeholder keeps the dataset a
# three-tensor (ids, mask, label) dataset like the train/val ones.
test_labels = [0] * len(test_masks)

test_inputs = torch.tensor(test_ids)
test_masks = torch.tensor(test_masks)
test_labels = torch.tensor(test_labels)

# Sequential order so predictions line up with `test_id`.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=32)

In [13]:
from torch import nn
from transformers import RobertaModel,RobertaForSequenceClassification, RobertaConfig, AdamW

# Two-label classification config read from the PhoBERT config file.
config = RobertaConfig.from_pretrained(
    path+"/PhoBERT_base_transformers/config.json",
    from_tf=False,
    num_labels = 2,
    output_hidden_states=False,
)

# A custom head that concatenated the first-token vectors of the last four
# hidden layers was previously sketched here in comments; the stock
# sequence-classification head is what is actually used.

# `path` already ends with '/', which is why no separator is added here.
model = RobertaForSequenceClassification.from_pretrained(
    path+"PhoBERT_base_transformers/model.bin",
    config=config,
)

# Move the model to the GPU. As the cell's last expression, the returned
# module is also echoed as the cell output.
model.cuda()


You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at /content/drive/My Drive/BERT/PhoBERT_base_transformers/model.bin were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification 

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=0)
      (position_embeddings): Embedding(258, 768, padding_idx=0)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [14]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def sigmoid(x):
    """Logistic function ``1 / (1 + e**-x)``, evaluated with NumPy.

    Works on scalars and, element-wise, on NumPy arrays.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def flat_accuracy(preds, labels):
    """Return ``(accuracy, macro_f1)`` for a set of predictions.

    The function now scores the arguments it is given; it previously ignored
    them and read the module-level names ``pred_flat`` / ``labels_flat``.

    Args:
        preds: Either predicted class ids (1-D) or per-class scores with one
            row per sample (2-D); scores are reduced with argmax over axis 1.
        labels: Ground-truth class ids; flattened before scoring.

    Returns:
        Tuple ``(accuracy, macro-averaged F1)``.
    """
    preds = np.asarray(preds)
    # Callers pass either class ids or raw logits, so accept both.
    if preds.ndim > 1:
        preds = np.argmax(preds, axis=1)
    pred_flat = preds.flatten()
    labels_flat = np.asarray(labels).flatten()

    # scikit-learn's documented argument order is (y_true, y_pred).
    F1_score = f1_score(labels_flat, pred_flat, average='macro')

    return accuracy_score(labels_flat, pred_flat), F1_score
def pred(logits, threshold=0.6):
    """Turn per-sample score rows into hard 0/1 predictions.

    Each row of ``logits`` is summed, the sum is squashed with the logistic
    function, and the result is compared with ``threshold``.

    The previous implementation assigned 0/1 to the loop variable only, which
    left the array untouched, so the raw sums were returned.

    Args:
        logits: 2-D array-like with one row of scores per sample.
        threshold: Probability at or above which a sample is labelled 1.

    Returns:
        1-D integer NumPy array of 0s and 1s, one entry per row.
    """
    scores = np.sum(logits, axis=1)
    # Logistic squashing of the summed score.
    probabilities = 1.0 / (1.0 + np.exp(-scores))
    return (probabilities >= threshold).astype(int)

In [None]:
import random
from tqdm import tqdm_notebook
from sklearn import metrics

device = 'cuda'
epochs = 3

# Seed every RNG involved so dropout and data ordering are repeatable.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Weight decay is applied to every parameter whose name does not contain
# one of the `no_decay` fragments (biases and LayerNorm parameters).
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# NOTE(review): the AdamW class shipped with transformers is marked as
# deprecated upstream; migrating to torch.optim.AdamW changes the
# bias-correction behaviour, so verify results before switching.
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=False)


for epoch_i in range(epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    model.train()
    total_loss = 0
    epoch_preds, epoch_labels = [], []

    # Wrap the dataloader itself (not enumerate(...)) so the progress bar
    # knows the number of batches instead of showing "0it".
    for batch in tqdm_notebook(train_dataloader):
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        model.zero_grad()
        outputs = model(b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels)
        # With labels supplied, element 0 is the loss and element 1 the logits.
        loss = outputs[0]
        total_loss += loss.item()

        logits = outputs[1].detach().cpu().numpy()
        epoch_preds.append(np.argmax(logits, axis=1).flatten())
        epoch_labels.append(b_labels.to('cpu').numpy().flatten())

        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    # Metrics are computed once over the whole epoch: the mean of per-batch
    # macro-F1 values is not the macro-F1 of the epoch.
    # The arrays are bound to the module-level names `pred_flat` /
    # `labels_flat` as well as passed explicitly, so the call works whether
    # the helper reads its arguments or those globals.
    pred_flat = np.concatenate(epoch_preds)
    labels_flat = np.concatenate(epoch_labels)
    train_accuracy, train_f1 = flat_accuracy(pred_flat, labels_flat)

    avg_train_loss = total_loss / len(train_dataloader)
    print(" Accuracy: {0:.4f}".format(train_accuracy))
    print(" F1 score: {0:.4f}".format(train_f1))
    print(" Average training loss: {0:.4f}".format(avg_train_loss))

    print("Running Validation...")
    model.eval()
    epoch_preds, epoch_labels = [], []

    for batch in tqdm_notebook(val_dataloader):
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask)

        # Without labels, element 0 of the output is the logits.
        logits = outputs[0].detach().cpu().numpy()
        epoch_preds.append(np.argmax(logits, axis=1).flatten())
        epoch_labels.append(b_labels.to('cpu').numpy().flatten())

    pred_flat = np.concatenate(epoch_preds)
    labels_flat = np.concatenate(epoch_labels)
    eval_accuracy, eval_f1 = flat_accuracy(pred_flat, labels_flat)

    print("Accuracy: {0:.4f}".format(eval_accuracy))
    print("F1 score: {0:.4f}".format(eval_f1))

print("Training complete!")

Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in tqdm_notebook(enumerate(train_dataloader)):


0it [00:00, ?it/s]

In [None]:
output_path = path+'/test_output_3_epochs_1e-5.csv'

print("Running Test...")
model.eval()

# Predicted class id for every test review, in dataloader order.
predictions = []
for batch in tqdm_notebook(test_dataloader):
    # The third tensor only holds placeholder labels and is not used.
    b_input_ids, b_input_mask, _b_labels = (t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    # Without labels, element 0 of the output is the logits.
    logits = outputs[0].detach().cpu().numpy()
    predictions.extend(np.argmax(logits, axis=1).flatten().tolist())

# Fail loudly instead of writing a file whose ids and ratings are misaligned.
if len(predictions) != len(test_id):
    raise ValueError(
        "got {} predictions for {} test ids".format(len(predictions), len(test_id))
    )

output_file = [["RevID","Rating"]]
output_file.extend([rev_id, rating] for rev_id, rating in zip(test_id, predictions))

# newline='' lets the csv writer control line endings itself (otherwise extra
# blank lines appear on platforms that translate '\n'); the `with` block
# closes the file, so no explicit close() is needed.
with open(output_path, 'w', newline='', encoding='utf-8') as f:
    file_writer = csv.writer(f)
    file_writer.writerows(output_file)

print("Testing complete!")

In [None]:
# Serialise the whole model object (not only its state_dict) to Drive.
torch.save(obj=model, f="/content/drive/MyDrive/BERT/model.pth")
# To restore it later:
# model = torch.load("/content/drive/MyDrive/BERT/model.pth")