# ***Install Library***



In [None]:
!pip install -qq transformers
!pip install -qq vncorenlp
!pip install -qq torchsummaryX

# ***Import Library***

In [None]:
import os
import pandas as pd
import torch
import seaborn as sns
import pickle
import numpy as np

from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from vncorenlp import VnCoreNLP

from torch.utils.data import Dataset, DataLoader
from torchsummaryX import summary
from torch import nn
from torch.nn import Linear, Dropout, ReLU, Softmax
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from tqdm import tqdm_notebook
from pylab import rcParams
import matplotlib.pyplot as plt
from collections import defaultdict
import warnings
import logging

# Only ERROR-level (and above) log records are emitted by the root logger.
logging.basicConfig(level=logging.ERROR)

# Hide FutureWarning / UserWarning messages in the notebook output.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

# IPython magic (only valid inside a notebook), followed by the global plot theme and figure size.
%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = 14 ,8

# Fix the NumPy and PyTorch RNG seeds; RANDOM_SEED is also passed to train_test_split below.
RANDOM_SEED = 50
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Registers `progress_apply` on pandas objects (used by preprocess_data).
tqdm_notebook().pandas()

# Setup Device: use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Device: ', device)

# ***Create Datasets***

In [None]:
class Datasets(Dataset):
  """
  Map-style dataset that tokenizes one comment per item.

  Arguments:
      comments: indexable collection of comment texts (each item is cast to str)
      labels: indexable collection of integer class labels, aligned with comments
      tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer)
      max_len (int): every sample is padded / truncated to this many tokens
  """
  def __init__(self, comments, labels, tokenizer, max_len):
    self.comments = comments
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __getitem__(self, index):
    """Return the raw text, token ids, attention mask and label of sample ``index``."""
    comment = str(self.comments[index])
    label = self.labels[index]

    encoding = self.tokenizer.encode_plus(
        comment,
        max_length=self.max_len,
        add_special_tokens=True,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt'
    )
    return {
        'comment': comment,
        # The tokenizer returns [1, max_len] tensors; flatten so that batching
        # yields [batch_size, max_len].
        'input_id': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'label': torch.tensor(label, dtype=torch.long)
    }

  def __len__(self):
    """Number of samples."""
    return len(self.comments)


def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """
  Wrap a DataFrame in a ``Datasets`` object and return a DataLoader over it.

  Arguments:
      df: DataFrame with a ``comments`` and a ``labels`` column
      tokenizer: tokenizer forwarded to ``Datasets``
      max_len (int): padded / truncated sequence length
      batch_size (int): number of samples per batch
      shuffle (bool): reshuffle the samples every epoch (default False keeps
          the original, ordered behaviour)
      num_workers (int): number of loader worker processes (default 4, as before)
  """
  datasets = Datasets(
      comments=df.comments.to_numpy(),
      labels=df.labels.to_numpy(),
      tokenizer=tokenizer,
      max_len=max_len
  )
  return DataLoader(
      datasets,
      batch_size=batch_size,
      shuffle=shuffle,
      num_workers=num_workers
  )


def preprocess_data(df,
                    jar_path="/content/drive/MyDrive/DATN/vncorenlp/VnCoreNLP-1.1.1.jar",
                    max_heap_size='-Xmx500m'):
  """
    Apply word segmenter to produce word-segmented texts before feeding to PhoBERT.
    Uses the VnCoreNLP word segmenter (annotator "wseg").

    Arguments:
        df: DataFrame with a ``comments`` column; the column is overwritten
            in place with the segmented text
        jar_path (str): location of the VnCoreNLP jar (defaults to the
            original Google Drive path)
        max_heap_size (str): JVM heap flag passed to VnCoreNLP

    Returns:
        The same DataFrame object that was passed in.

    Note: relies on ``progress_apply`` having been registered on pandas by tqdm.
  """
  with VnCoreNLP(jar_path, annotators="wseg", max_heap_size=max_heap_size) as rdrsegmenter:
    def segment(text):
      # Join the tokens of each sentence, then join the sentences, with single spaces.
      return ' '.join([' '.join(sent) for sent in rdrsegmenter.tokenize(text)])

    df["comments"] = df["comments"].apply(str).progress_apply(segment)

  return df

# ***Build Model***

In [None]:
class ClassifierModel(nn.Module):
  """
  BERT encoder followed by a two-layer classification head.

  Arguments:
      bert (model): model BERT to extract features; called as
          ``bert(ids, mask, return_dict=False)`` and expected to return a
          tuple whose second element is the pooled output
      n_class (int): number of class
      drop_prob (float): dropout probability used before each linear layer
  """
  def __init__(self, bert, n_class, drop_prob=0.3):
    super().__init__()

    self.bert = bert
    self.drop_prob = drop_prob
    self.n_class = n_class

    # Take the feature width from the encoder's config so that encoders with a
    # different hidden size (e.g. a "large" variant) also work; fall back to
    # the previously hard-coded 768 when no config is exposed.
    hidden_size = getattr(getattr(bert, 'config', None), 'hidden_size', 768)

    # Fully Connected Layers
    self.fc1 = Linear(in_features=hidden_size, out_features=512)
    self.fc2 = Linear(in_features=512, out_features=self.n_class)

    # Activate Function Layers
    self.relu = ReLU()
    # Kept as an attribute for compatibility; forward() does not apply it and
    # returns raw logits.
    self.soft_max = Softmax(dim=1)

    # Dropout Layer
    self.drop_out = Dropout(self.drop_prob)

  def forward(self, sentences_id, attention_mask):
    """Return unnormalised class scores of size [batch_size, n_class]."""
    _, pooled_out = self.bert(sentences_id, attention_mask, return_dict=False)
    # pooled_out size: [batch_size, hidden_size]
    x = self.drop_out(pooled_out)
    x = self.fc1(x)
    x = self.relu(x)
    x = self.drop_out(x)
    x = self.fc2(x)
    return x

# ***Load and Preprocess Data***

In [None]:
# Read the labelled comments and run them through the word segmenter.
data = preprocess_data(
    pd.read_csv("/content/drive/MyDrive/DATN/data/data.csv", header=0)
)
print(data)

# Hold out 10% of the rows for validation; the split is seeded with RANDOM_SEED.
df_train, df_val = train_test_split(
    data,
    test_size=0.1,
    random_state=RANDOM_SEED,
)

# ***Plot Data***


In [None]:
# Bar chart of how many samples each label has.
# The vector is passed as `x=`: in current seaborn the first positional
# parameter of countplot is `data`, not the variable to count.
ax = sns.countplot(x=data.labels)
class_name = ['Khong tot', 'Trung binh', 'Tot', 'Spam']
plt.ylabel("Amount")
ax.set_xticklabels(class_name)

In [None]:
rcParams['figure.figsize'] = 32 ,12
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

# Number of whitespace-separated tokens in every (already segmented) comment.
line_len = [len(cmt.split()) for cmt in data.comments]

# Pass the values as `x=`: in current seaborn the first positional parameter
# of countplot is `data`, not the variable to count.
ax1 = sns.countplot(x=line_len)
plt.xlabel("Sentences Length")
plt.ylabel("Amount")

# ***Load Bert model***

In [None]:
def load_bert(model_name):
  """
    Load a pre-trained encoder together with its tokenizer.

    PhoBERT checkpoints that can be passed as ``model_name``:
        PhoBERT Base:  "vinai/phobert-base"
        PhoBERT Large: "vinai/phobert-large"

    Returns:
        (tokenizer, model) -- note the order: the tokenizer comes first.
  """
  encoder = AutoModel.from_pretrained(model_name)
  return AutoTokenizer.from_pretrained(model_name), encoder

# ***Training***

In [None]:
# Hyper-parameters
PRE_TRAINED_MODEL_NAME = "vinai/phobert-base"
BATCH_SIZE, NUM_EPOCHS = 64, 10
NUM_CLASS, MAX_LEN = 4, 50

# Pre-trained PhoBERT encoder and its tokenizer
tokenizer, phobert = load_bert(PRE_TRAINED_MODEL_NAME)

# Data loaders (validation uses a fixed batch size of 8) and the sample
# counts [n_train, n_val] that the Trainer uses as accuracy denominators
dl_train = create_data_loader(df_train, tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE)
dl_val = create_data_loader(df_val, tokenizer, max_len=MAX_LEN, batch_size=8)
n_samples = [len(df_train), len(df_val)]

# Classifier on the selected device
model = ClassifierModel(phobert, NUM_CLASS).to(device)

# Optimizer and loss
# NOTE(review): the AdamW shipped with transformers is reported as deprecated
# in favour of torch.optim.AdamW -- confirm before upgrading transformers.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
loss_fn = nn.CrossEntropyLoss().to(device)

# Linear learning-rate decay without warm-up, one scheduler step per batch
total_steps = NUM_EPOCHS * len(dl_train)
scheduler = get_linear_schedule_with_warmup(
  optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

In [None]:
class Trainer():
  """
  Training loop with per-epoch validation and best-checkpoint saving.

  Arguments:
      model (nn.Module): called as ``model(input_ids, attention_masks)``;
          its output is treated as per-class scores of size [batch, n_class]
      train_loader, val_loader: sized iterables of batches; each batch is a
          mapping with 'input_id', 'attention_mask' and 'label' tensors
      epochs (int): number of passes over train_loader
      loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar tensor
      optimizer: optimizer stepped once per training batch
      scheduler: learning-rate scheduler stepped once per training batch
      n_samples (sequence): ``[n_train, n_val]``, the accuracy denominators
      device: device the model and every batch are moved to
      save_path (str): directory that receives the best checkpoints
  """
  def __init__(self, model, 
               train_loader, val_loader, 
               epochs, loss_fn, optimizer,
               scheduler, n_samples,
               device, save_path):
    self.model = model.to(device)
    self.train_loader = train_loader
    self.val_loader = val_loader
    self.epochs = epochs
    self.loss_fn = loss_fn
    self.optimizer = optimizer
    self.n_samples = n_samples
    self.scheduler = scheduler
    self.device = device
    self.history = defaultdict(list)
    self.save_path = save_path

  def _run_batch(self, batch):
    """Forward one batch; return (loss tensor, number of correct predictions)."""
    input_ids = batch['input_id'].to(self.device)
    attention_masks = batch['attention_mask'].to(self.device)
    targets = batch['label'].to(self.device)

    outputs = self.model(input_ids, attention_masks)
    preds = outputs.argmax(dim=1)

    loss = self.loss_fn(outputs, targets)
    return loss, (preds == targets).sum().item()

  def train(self):
    """Run one training epoch; return (accuracy, mean loss)."""
    self.model = self.model.train()
    losses = []
    correct_preds = 0
    n_steps = len(self.train_loader)

    for step, data in enumerate(self.train_loader):
      print(f'\rTraning Step {step+1}/{n_steps}', end='')

      # Clear gradients up front so nothing left over from an earlier
      # backward pass leaks into this step.
      self.optimizer.zero_grad()

      loss, n_correct = self._run_batch(data)
      correct_preds += n_correct
      losses.append(loss.item())

      loss.backward()
      nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)

      self.optimizer.step()
      self.scheduler.step()

    return correct_preds/self.n_samples[0], np.mean(losses)

  def evaluate(self):
    """Run the validation set without gradients; return (accuracy, mean loss)."""
    self.model = self.model.eval()
    losses = []
    correct_preds = 0
    with torch.no_grad():
      for data in self.val_loader:
        loss, n_correct = self._run_batch(data)
        correct_preds += n_correct
        losses.append(loss.item())

    return correct_preds / self.n_samples[1], np.mean(losses)

  def training(self):
    """
    Train for ``self.epochs`` epochs and return the history.

    After every epoch the metrics are appended to ``self.history`` under
    'train_acc', 'train_loss', 'val_acc' and 'val_loss'. Whenever validation
    accuracy improves, the model's state_dict is written to
    ``<save_path>/<val_acc>best_model_state.bin``.
    """
    best_accuracy = 0.0

    for epoch in range(self.epochs):
      print(f'\nEpoch {epoch+1}/{self.epochs}')
      print("--"*10)

      # Train
      train_acc, train_loss = self.train()
      print(f'\rTrain loss: {train_loss:.4f}, Train accuracy: {train_acc*100:.2f}')

      # Evaluate
      val_acc, val_loss = self.evaluate()
      print(f'Val   loss: {val_loss:.4f}, Val   accuracy: {val_acc*100:.2f}')

      self.history['train_acc'].append(train_acc)
      self.history['train_loss'].append(train_loss)
      self.history['val_acc'].append(val_acc)
      self.history['val_loss'].append(val_loss)

      if val_acc > best_accuracy:
        # Join directory and file name as separate components so the
        # checkpoint lands inside save_path, and save the trainer's own
        # model rather than whatever a global `model` happens to be.
        os.makedirs(self.save_path, exist_ok=True)
        model_path = os.path.join(self.save_path, str(val_acc) + "best_model_state.bin")
        torch.save(self.model.state_dict(), model_path)
        best_accuracy = val_acc

    print('\nCompleted Training')
    
    return self.history


In [None]:
# Directory that receives the checkpoints written during training.
save_path = '/content/drive/MyDrive/DATN/model'
train = Trainer(
    model=model,
    train_loader=dl_train,
    val_loader=dl_val,
    epochs=NUM_EPOCHS,
    loss_fn=loss_fn,
    optimizer=optimizer,
    scheduler=scheduler,
    n_samples=n_samples,
    device=device,
    save_path=save_path,
)

In [None]:
# Run the full training loop; `result` is the history mapping with the
# 'train_acc', 'train_loss', 'val_acc' and 'val_loss' lists (one entry per epoch).
result = train.training()

In [None]:
# Persist the whole model object as it is after the last epoch (the Trainer
# separately writes state_dict checkpoints for the best validation accuracy).
torch.save(model, '/content/drive/MyDrive/DATN/model/model.pt')

# ***Plot Loss, Accuracy***

In [None]:
# Figure size and seaborn theme for the training-history plots.
rcParams['figure.figsize'] = (16, 8)
sns.set(
    style='whitegrid',
    palette='muted',
    font_scale=2,
)

In [None]:
# Accuracy per epoch: training curve first, then validation.
for key, legend in (('train_acc', 'train accuracy'),
                    ('val_acc', 'validation accuracy')):
  plt.plot(result[key], label=legend)

plt.title('Training history')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.ylim([0, 1])

In [None]:
# Loss per epoch for the training and validation sets.
plt.plot(result['train_loss'], label='train loss')
plt.plot(result['val_loss'], label='validation loss')

plt.title('Training history')
# This figure shows loss, not accuracy.
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
# Loss is not bounded by 1, so only pin the lower limit and let the upper
# limit follow the data instead of clipping it.
plt.ylim(bottom=0);

# ***Evaluation***

In [None]:
def get_predictions(model, data_loader, device):
    """Run ``model`` over ``data_loader`` and collect the results on the CPU.

    Arguments:
        model: called as ``model(input_id, attention_mask)``; its output is
            treated as per-class scores of size [batch, n_class]
        data_loader: iterable of batches, each a mapping with 'comment',
            'input_id', 'attention_mask' and 'label'
        device: device the inputs are moved to before the forward pass

    Returns:
        (cmt_texts, predictions, prediction_probs, real_labels) where
        cmt_texts is a list of the raw comments, predictions is a 1-D tensor
        of arg-max class indices, prediction_probs is a 2-D tensor of softmax
        probabilities (one row per sample) and real_labels is a 1-D tensor of
        the true labels. All tensors are on the CPU; for an empty loader they
        are empty.
    """
    model = model.eval()

    cmt_texts = []
    batch_preds = []
    batch_probs = []
    batch_labels = []

    with torch.no_grad():
        for sample in data_loader:
            input_id = sample['input_id'].to(device)
            attention_mask = sample['attention_mask'].to(device)

            outputs = model(input_id, attention_mask)
            probs = F.softmax(outputs, dim=1)

            cmt_texts.extend(sample['comment'])
            # Move each batch to the CPU right away and concatenate once at
            # the end instead of stacking one tensor per sample.
            batch_preds.append(outputs.argmax(dim=1).cpu())
            batch_probs.append(probs.cpu())
            batch_labels.append(sample['label'].cpu())

    if not batch_preds:
        # Nothing to concatenate: return empty tensors rather than raising.
        return (cmt_texts,
                torch.empty(0, dtype=torch.long),
                torch.empty(0, 0),
                torch.empty(0, dtype=torch.long))

    predictions = torch.cat(batch_preds)
    prediction_probs = torch.cat(batch_probs)
    real_labels = torch.cat(batch_labels)

    return cmt_texts, predictions, prediction_probs, real_labels

def show_confusion_matrix(confusion_matrix):
  """Draw ``confusion_matrix`` as an annotated blue heat map with labelled axes."""
  heat = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")

  # Row labels stay horizontal, column labels are tilted by 30 degrees.
  row_ticks = heat.yaxis.get_ticklabels()
  heat.yaxis.set_ticklabels(row_ticks, rotation=0, ha='right')
  col_ticks = heat.xaxis.get_ticklabels()
  heat.xaxis.set_ticklabels(col_ticks, rotation=30, ha='right')

  plt.ylabel('True label')
  plt.xlabel('Predicted label')

In [None]:
# Display names, in label-index order.
class_names = ['Bad', 'Medium', 'Good', 'Spam']

# Predictions for the whole validation loader.
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(model, dl_val, device)

# Per-class precision / recall / F1.
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

# Confusion matrix with named rows and columns.
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

In [None]:
from textwrap import wrap

# Index of the validation sample to inspect.
idx = 7

review_text = y_review_texts[idx]
true_sentiment = y_test[idx]
# One row per class holding the model's probability for that class.
# (Previously a single arg-max index was broadcast to every row.)
pred_df = pd.DataFrame({
  'class_names': class_names,
  'values': y_pred_probs[idx].detach().cpu().numpy()
})

print("\n".join(wrap(review_text)))
print()
print(f'True label: {class_names[true_sentiment]}')