In [30]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at the first `.to(device)` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [31]:
import pandas as pd

In [32]:
!pip install openpyxl

In [33]:
!pip install wandb

In [34]:
import os

import wandb

# SECURITY: never commit an API key to a notebook. The key that used to be
# hard-coded in this cell has been exposed and should be revoked in the
# Weights & Biases account settings.
# Provide the key through the WANDB_API_KEY environment variable (or a secrets
# manager that exports it) before running this cell. When the variable is
# unset, `key` is None and wandb uses its own credential lookup / prompt.
wandb.login(key=os.environ.get("WANDB_API_KEY"))


In [35]:
# Start a Weights & Biases run in the "Sentiment Analysis" project.
wandb.init(project="Sentiment Analysis")

In [36]:


# Excel workbook on the Kaggle input mount; the reviews and their labels are
# read from the sheet named "Dataset".
path_dataset = "/kaggle/input/dataset-12-02/Dataset_02_12.xlsx"
dataframe = pd.read_excel(io=path_dataset, sheet_name="Dataset")


In [37]:
dataframe.head()

# **Stratified train/validation split**

In [38]:


# Hold out 10% of the reviews for validation. Stratifying on the label keeps
# the class proportions equal in both splits, and the fixed seed makes the
# split reproducible.
train_text, val_text, train_labels, val_labels = train_test_split(
    dataframe['Review'],
    dataframe['Label'],
    test_size=0.1,
    stratify=dataframe['Label'],
    random_state=2021,
)




In [39]:
!pip install transformers

In [40]:
import torch
from transformers import AutoModel, AutoTokenizer




In [41]:

# Load the tokenizer that matches the PhoBERT checkpoint; use_fast=False asks
# for the pure-Python tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    "vinai/phobert-base",
    use_fast=False,
)

In [42]:
# print(features.pooler_output)

In [43]:
class PhoBert_Classification(torch.nn.Module):
    """PhoBERT encoder followed by a small feed-forward classification head.

    Pipeline: backbone pooler output -> dropout -> Linear(hidden, 128) -> ReLU
    -> dropout -> Linear(128, num_class) -> LogSoftmax.

    Because the network ends in ``LogSoftmax`` it produces log-probabilities
    and is meant to be trained with ``nn.NLLLoss``.

    Args:
        num_class: number of output classes.
    """

    def __init__(self, num_class):
        super().__init__()
        self.backbone = AutoModel.from_pretrained("vinai/phobert-base")

        # Take the encoder width from the checkpoint's config rather than
        # hard-coding 768, so the head always matches the loaded backbone.
        hidden_size = self.backbone.config.hidden_size

        self.dense_1 = torch.nn.Linear(in_features=hidden_size, out_features=128, bias=True)
        self.dense_2 = torch.nn.Linear(in_features=128, out_features=num_class, bias=True)
        self.dropout1 = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.dropout2 = nn.Dropout(0.1)
        # Log-softmax over the class dimension (dim=1).
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities for a batch.

        Args:
            sent_id: token-id tensor passed to the backbone as its input ids.
            mask: attention mask passed to the backbone as ``attention_mask``.

        Returns:
            Tensor of log-probabilities, normalised over dim 1 (one row per
            input sequence, one column per class).
        """
        # Pooled sentence representation produced by the backbone.
        cls_hs = self.backbone(sent_id, attention_mask=mask).pooler_output

        x = self.dropout1(cls_hs)
        # Bug fix: the first dense layer previously consumed `cls_hs`, which
        # silently discarded the dropout applied on the line above.
        x = self.dense_1(x)
        x = self.relu(x)
        x = self.dropout2(x)

        # Output layer followed by log-softmax.
        x = self.dense_2(x)
        return self.softmax(x)


In [44]:
# Instantiate the classifier with two output classes.
model = PhoBert_Classification(num_class=2)


In [45]:
# Number of whitespace-separated words in every training review, shown as a
# 30-bin histogram (presumably to help pick a maximum sequence length).
seq_len = [len(review.split()) for review in train_text]
pd.Series(seq_len).hist(bins=30)

In [46]:
# Every review is padded or truncated to exactly this many tokens.
MAX_LENGTH = 200

# Tokenize and encode the training reviews.
# `padding="max_length"` is the supported replacement for the deprecated
# `pad_to_max_length=True`, and calling the tokenizer directly is the
# recommended API for encoding a batch of texts.
tokens_train = tokenizer(
    train_text.tolist(),
    max_length=MAX_LENGTH,
    padding="max_length",
    truncation=True,
)

# Tokenize and encode the validation reviews with the same settings.
tokens_val = tokenizer(
    val_text.tolist(),
    max_length=MAX_LENGTH,
    padding="max_length",
    truncation=True,
)

# # tokenize and encode sequences in the test set
# tokens_test = tokenizer.batch_encode_plus(
#     test_text.tolist(),
#     max_length = MAX_LENGTH,
#     pad_to_max_length=True,
#     truncation=True
# )

In [47]:
# Training split as tensors: token ids, attention masks and labels.
train_seq, train_mask = (
    torch.tensor(tokens_train[field]) for field in ('input_ids', 'attention_mask')
)
train_y = torch.tensor(train_labels.tolist())

# Validation split as tensors, built the same way.
val_seq, val_mask = (
    torch.tensor(tokens_val[field]) for field in ('input_ids', 'attention_mask')
)
val_y = torch.tensor(val_labels.tolist())

# test_seq = torch.tensor(tokens_test['input_ids'])
# test_mask = torch.tensor(tokens_test['attention_mask'])
# test_y = torch.tensor(test_labels.tolist())

In [48]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Number of reviews per batch, shared by both loaders.
batch_size = 128

# Training loader: samples are drawn in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation loader: samples are visited in their stored order.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    dataset=val_data,
    batch_size=batch_size,
    sampler=val_sampler,
)

In [49]:
# test_tensordata = TensorDataset(test_seq, test_mask, test_y)
# test_sampler =  SequentialSampler(test_tensordata)
# test_dataloader = DataLoader(test_tensordata, sampler = test_sampler, batch_size=1)

# **Build the model with a pretrained PhoBERT (`vinai/phobert-base`) backbone**

In [50]:
# Prefer the GPU, but fall back to the CPU so this cell does not fail on
# machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [51]:
# `transformers.AdamW` is deprecated (and no longer shipped by recent
# transformers releases); PyTorch's implementation is the supported replacement.
from torch.optim import AdamW

# eps and weight_decay are pinned to the defaults of the old transformers
# optimizer (1e-6 and 0.0) so the switch stays close to the previous
# optimisation behaviour; torch's own defaults are 1e-8 and 0.01.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

In [52]:
from sklearn.utils.class_weight import compute_class_weight

# Compute "balanced" class weights (inversely proportional to class frequency).
# `classes` and `y` must be passed by keyword: current scikit-learn releases
# reject the positional form with a TypeError.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)

print("Class Weights:",class_weights)

In [53]:
# Class weights as a float tensor placed on the training device.
weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Weighted negative log-likelihood loss.
cross_entropy = nn.NLLLoss(weight=weights)

# Number of training epochs.
epochs = 120

In [54]:
def train(model):
    """Run one training epoch over the notebook-level ``train_dataloader``.

    Relies on the globals ``train_dataloader``, ``device``, ``cross_entropy``
    and ``optimizer`` defined in other cells.

    Args:
        model: network to optimise; called as ``model(sent_id, mask)``.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the mean of
        the per-batch losses and ``total_preds`` is a NumPy array holding the
        model outputs of all batches concatenated along axis 0.
    """
    model.train()

    num_batches = len(train_dataloader)
    running_loss = 0
    batch_outputs = []

    for step, batch in enumerate(train_dataloader):
        # Progress report every 50 batches, skipping the very first one.
        if step != 0 and step % 50 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))

        # Move the batch to the training device and unpack it.
        sent_id, mask, labels = [tensor.to(device) for tensor in batch]

        # Clear gradients left over from the previous step.
        model.zero_grad()
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        running_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        batch_outputs.append(preds.detach().cpu().numpy())

    avg_loss = running_loss / num_batches
    total_preds = np.concatenate(batch_outputs, axis=0)
    return avg_loss, total_preds

In [55]:
from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score



In [56]:
def evaluate(model, t_dataset_loader):
    """Evaluate ``model`` on ``t_dataset_loader`` and print classification metrics.

    Relies on the notebook globals ``device`` and ``cross_entropy`` and on the
    scikit-learn metric functions imported in another cell.

    Args:
        model: network to evaluate; called as ``model(sent_id, mask)``.
        t_dataset_loader: iterable of ``(sent_id, mask, labels)`` batches
            that also supports ``len()``.

    Returns:
        tuple: ``(avg_loss, total_preds, focus_f1)`` — the mean per-batch
        loss, the predicted class index per sample (int16 NumPy array) and
        the F1 score computed by ``f1_score`` with its default settings.

    Raises:
        ValueError: if ``t_dataset_loader`` yields no batches.
    """
    print("\nEvaluating...")

    # Bug fix: size everything from the loader that was passed in. The
    # original used the global `val_dataloader`, which gave a wrong batch
    # count and a wrong average loss for any other loader.
    num_batches = len(t_dataset_loader)
    if num_batches == 0:
        raise ValueError("t_dataset_loader is empty; nothing to evaluate")

    # Deactivate dropout layers.
    model.eval()

    total_loss = 0

    # Per-batch model outputs and ground-truth labels.
    total_preds = []
    total_groundtruth = []

    for step, batch in enumerate(t_dataset_loader):

        # Progress update every 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))

        # Push the batch to the evaluation device.
        batch = [t.to(device) for t in batch]
        sent_id, mask, labels = batch

        # No gradients are needed for evaluation.
        with torch.no_grad():
            preds = model(sent_id, mask)

            # Loss between predicted log-probabilities and the true labels.
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()

            total_preds.append(preds.detach().cpu().numpy())
            total_groundtruth.append(labels.detach().cpu().numpy())

    # Mean loss over the batches of this loader.
    avg_loss = total_loss / num_batches

    # Stack the batches, then reduce the per-class scores to class indices.
    total_preds = np.concatenate(total_preds, axis=0)
    total_preds = np.argmax(total_preds, axis=1)
    total_preds = np.array(total_preds, dtype=np.int16)
    total_groundtruth = np.concatenate(total_groundtruth, axis=0)
    total_groundtruth = np.array(total_groundtruth, dtype=np.int16)

    # F1 score is also returned so the caller can use it for model selection.
    focus_f1 = f1_score(total_groundtruth, total_preds)
    print("Accuracy: ", accuracy_score(total_groundtruth, total_preds))
    print("F1 score: ", focus_f1)
    print('Recall:', recall_score(total_groundtruth, total_preds))
    print('Precision:', precision_score(total_groundtruth, total_preds))
    print('\n clasification report:\n', classification_report(total_groundtruth,total_preds))
    print('\n confussion matrix:\n',confusion_matrix(total_groundtruth, total_preds))

    return avg_loss, total_preds, focus_f1

# **Training and Validation**

In [None]:
# Best validation loss / F1 seen so far.
best_valid_loss = float('inf')
best_valid_f1 = 0

# Training and validation loss of every epoch.
train_losses = []
valid_losses = []

for epoch in range(epochs):
    print("Start")
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # Keep the pretrained backbone frozen for the first 11 epochs (epoch
    # indices 0..10) so only the classification head is updated, then
    # unfreeze it for full fine-tuning.
    # Bug fix: the unfreeze branch used `model.bert`, an attribute that does
    # not exist on PhoBert_Classification (the encoder is `model.backbone`),
    # so training crashed with AttributeError as soon as epoch index 11 began.
    train_backbone = epoch > 10
    for param in model.backbone.parameters():
        param.requires_grad = train_backbone

    # Train for one epoch, then evaluate on the validation split.
    train_loss, _ = train(model)
    valid_loss, _, f1_value = evaluate(model, val_dataloader)

    # Log the epoch's metrics in one call so they share a single W&B step;
    # separate calls would spread one epoch over three consecutive steps.
    wandb.log({
        "Loss train": train_loss,
        "Loss val": valid_loss,
        "F1 score": f1_value,
    })

    # Keep the weights with the best validation F1.
    if f1_value > best_valid_f1:
        best_valid_f1 = f1_value
        torch.save(model.state_dict(), '/kaggle/working/Best_weights_f1.pt')
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
    # Saved unconditionally every epoch, i.e. this file always holds the most
    # recent weights. NOTE(review): the file name looks like a typo for
    # "Last_weights"; it is kept unchanged in case other code loads this path.
    torch.save(model.state_dict(), '/kaggle/working/Lass_weights_f1.pt')

    # Record the losses of this epoch.
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

# **Save and reload the trained weights**

In [None]:
# best_model = BERT_sentiment_analysis(bert)
# best_model.load_state_dict(torch.load("../input/weight-model/Best_weights_f1.pt",map_location=device))
# best_model = best_model.to(device)

# **Evaluate on the test dataset**

In [None]:
# _ = evaluate(best_model, test_dataloader)

In [None]:
# def inference(model, string_input):
#     model.eval()
    
#     tokens_inference = tokenizer.batch_encode_plus(
#         [string_input],
#         max_length = MAX_LENGTH,
#         pad_to_max_length=True,
#         truncation=True
#         )
#     inference_seq = torch.tensor(tokens_inference['input_ids'])
#     inference_mask = torch.tensor(tokens_inference['attention_mask'])
#     sent_id = inference_seq.to(device)
#     mask = inference_mask.to(device)
#     preds = model(sent_id, mask)
#     preds = preds.detach().cpu().numpy()
#     class_predict = np.argmax(preds, axis = 1)
#     print("Class predict: ",class_predict[0])

    

# **Run inference on an input sentence**

In [None]:
# string_input = input("Enter your string: ")
# inference(best_model, string_input)