# Vietnamese Emotion Classification using PhoBERT
- Input:
    - Sentence
- Output:


## 1. Import library

In [2]:
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import re
import matplotlib.pyplot as plt

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv

from wordcloud import WordCloud
import plotly.express as px
from collections import Counter
from nltk.corpus import stopwords
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, f1_score, make_scorer
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_validate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

stop_words = set(stopwords.words('vietnamese')) 

In [4]:
# Load pre-trained PhoBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
model = AutoModel.from_pretrained("vinai/phobert-base")
encode_lable = LabelEncoder()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Craw Data

In [5]:
def save(name, a):
    df = pd.DataFrame.from_dict(a, orient='index')
    df = df.transpose()
    dataset = pd.DataFrame(data=df)
    dataset.to_excel(f'./DataSet/{name}.xlsx', encoding='utf-8-sig')

In [6]:
def get_data(name, dict_url):

    driver = webdriver.Edge("./msedgedriver.exe")

    Contents = []
    Topic = []
    dictionary = {'Content': Contents, 'Topic': Topic}

    for _topic, quantity in dict_url.items():
        driver.get(f'https://vtv.vn/{_topic}.htm')
        content = []
        count = 0
        status = True
        while status:
            items = driver.find_elements(By.XPATH, '/html/body/form/div[2]/div[3]/div[3]/div/div[1]/div[1]/div[3]/div[1]/ul/li')

            for item in items:
                if count >= quantity:
                    status = False
                    break
                # Tìm tiêu đề tin tức
                content.append(item.find_element(By.XPATH,  './/h4/a').text)
                topic = [_topic for _ in range(len(content))]
                count += 1
            # Tìm nút "Xem thêm"
            load_more_button = driver.find_element(By.XPATH, '/html/body/form/div[2]/div[3]/div[3]/div/div[1]/div[1]/div[3]/div[2]/a')

            if not load_more_button.is_displayed():
                break
            load_more_button.click()

            time.sleep(5)
        Topic.extend(topic)
        Contents.extend(content)
    save(name, dictionary)

In [7]:
dict_url_big = {    'chinh-tri':        1856,
                    'xa-hoi':           1005,
                    'kinh-te':          2326,
                    'truyen-hinh':      2164,
                    'cong-nghe':        548,
                    'doi-song':         1456,
                    'van-hoa-giai-tri': 1564
                }

dict_url_small = {  'chinh-tri':        213,
                    'xa-hoi':           113,
                    'kinh-te':          566,
                    'truyen-hinh':      36,
                    'cong-nghe':        156,
                    'doi-song':         102,
                    'van-hoa-giai-tri': 189
                }

# get_data("big_data", dict_url_big)
# get_data("small_data", dict_url_small)

## Load Data

In [8]:
def read_data(path):
    df = pd.read_excel(path, sheet_name=None)['Sheet1']
    df.columns = ['index', 'Content', 'Topic']
    # unused column
    df.drop(columns=['index'], inplace=True)
    return df

In [9]:
df_small = read_data('./DataSet/small_data.xlsx')
df_big = read_data('./DataSet/big_data.xlsx')

In [10]:
df_small.head()

Unnamed: 0,Content,Topic
0,Quán triệt các quy định của Ban Bí thư về báo ...,chinh-tri
1,Phó Thủ tướng Trần Hồng Hà: Không để thiếu vac...,chinh-tri
2,ASEAN cần thích ứng năng động và tăng cường sứ...,chinh-tri
3,Tiếp tục hoàn thiện cơ chế xác định giá đất,chinh-tri
4,Bộ Công an trao tặng Giải thưởng Trần Quốc Hoà...,chinh-tri


In [11]:
df_big.head()

Unnamed: 0,Content,Topic
0,Thủ tướng Phạm Minh Chính tiếp xúc cử tri trướ...,chinh-tri
1,Thủ tướng bổ nhiệm lại Thứ trưởng Bộ Công Thương,chinh-tri
2,Thủ tướng Phạm Minh Chính dự khai mạc Lễ hội H...,chinh-tri
3,“Không có vướng mắc nào của địa phương không đ...,chinh-tri
4,"Dù ở vị trí nào mà có trách nhiệm, khát khao c...",chinh-tri


## Exploratory data analysis

In [12]:
def plot_topic(df):
    plt.figure(figsize=(20, 10))
    sns.countplot(x='Topic', data=df)
    plt.title("Number of sample")
    plt.show()

In [13]:
#plot_topic(df_small)

In [14]:
#plot_topic(df_big)

In [15]:
#df_big['Topic'].value_counts()

In [16]:
#df_small['Topic'].value_counts()

In [17]:
def group_topic(topic):
    if topic in ['chinh-tri']:
        return 'chinh-tri'
    elif (topic in ['kinh-te']):
        return 'kinh-te'
    elif (topic in ['cong-nghe']):
        return 'Công-nghệ'
    elif (topic in ['xa-hoi']):
        return 'xa-hoi'
    else: 
        return 'Khác'

In [18]:
df_small['Topic'] = df_small['Topic'].apply(lambda topic: group_topic(topic))
df_big['Topic'] = df_big['Topic'].apply(lambda topic: group_topic(topic))

In [19]:
#df_small['Topic'].value_counts()

In [20]:
#plot_topic(df_small)

In [21]:
#df_big['Topic'].value_counts()

In [22]:
#plot_topic(df_big)

In [23]:
def plot_count_token():
    all_data = df_big['Content'].tolist() + df_small['Content'].tolist()
    token_text = [tokenizer.encode(text, add_special_tokens=True) for text in all_data]
    token_lens = [len(text) for text in token_text]

    sns.set(rc={'figure.figsize':(20, 10)})
    sns.displot(token_lens, height=5, aspect=3)
    plt.xlim([0,max(token_lens)])
    plt.xlabel('Token Count')
    plt.show()

In [24]:
#plot_count_token()

In [25]:
def top_word():
    top = Counter([item for sublist in df_big['Content'].apply(lambda x:str(x).split()) for item in sublist])
    print(len(top))
    temp = pd.DataFrame(top.most_common(50))
    temp.columns = ['Common_words','count']
    return temp

In [26]:
top_word = top_word()
#top_word.style.background_gradient(cmap='Blues')

3334


In [27]:

# fig = px.bar(top_word, x="count", y="Common_words", title='Commmon Words in Contents', orientation='h', 
#              width=700, height=700,color='Common_words')
# fig.show()

In [28]:

# plt.subplots(figsize=(20, 10))

# wordcloud = WordCloud (
#                     background_color = 'white',
#                     width = 512,
#                     height = 384
#                         ).generate(' '.join(top_word['Common_words']))
# plt.imshow(wordcloud) # image show
# plt.axis('off') # to off the axis of x and y
# plt.savefig('Plotly-World_Cloud.png')
# plt.show()

## Cleaning and Processing data

In [29]:
def remove_stop_words(content):
    text = ' '.join(word for word in content.split() if word not in stop_words)
    return text

In [30]:
def removing_numbers(content):
    content = ''.join([i for i in content if not i.isdigit()])
    return content

In [31]:
def lower_case(content):
    content = content.split()
    content = [y.lower() for y in content]
    return " " .join(content)

In [32]:
def Removing_punctuations(content):
    ## Remove punctuations
    content = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', content)
    content = content.replace('؛',"", )
    
    ## remove extra whitespace
    content = re.sub('\s+', ' ', content)
    content =  " ".join(content.split())
    return content.strip()

In [33]:
def normalized_sentence(content):
    content = lower_case(content)
    content = Removing_punctuations(content)
    content = remove_stop_words(content)
    content = removing_numbers(content)
    return content

In [34]:
def normalize_text(df):
    df['Content'] = df['Content'].apply(lambda content : lower_case(content))
    df['Content'] = df['Content'].apply(lambda content : remove_stop_words(content))
    df['Content'] = df['Content'].apply(lambda content : Removing_punctuations(content))
    df['Content'] = df['Content'].apply(lambda content : removing_numbers(content))
    return df

In [35]:
#df_small.head()

In [36]:
#df_big.head()

In [37]:
df_small = normalize_text(df_small)
df_big = normalize_text(df_big)

In [38]:
#df_small.head()

In [39]:
# top = Counter([item for sublist in df_big['Content'].apply(lambda x:str(x).split()) for item in sublist])
# print(len(top))
# temp = pd.DataFrame(top.most_common(20))
# temp.columns = ['Common_words','count']
# temp.style.background_gradient(cmap='Blues')

In [40]:
#plot_count_token()

### Conclusion


## Extract feature -> endcode

In [41]:
model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(64001, 768, padding_idx=1)
    (position_embeddings): Embedding(258, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [42]:
# Define a function to generate sentence embeddings
def generate_sentence_embedding(text):
    # Tokenize the text
    input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)])
    # Generate the embedding
    with torch.no_grad():
        outputs = model(input_ids)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    return embedding

In [43]:
def encode_lables(df):
    encode_lable.fit(df['Topic'])
    df['Topic'] = encode_lable.transform(df['Topic'])  

In [44]:
def embedding_sentences(df):
    df_t = df.copy()
    df_t['Content_embedding'] = df_t['Content'].apply(lambda content : generate_sentence_embedding(content))
    return df_t

In [45]:
def embedding_sentences_Norm(df):
    df_t = df.copy()
    norm = Normalizer()
    df_t['Content_embedding'] = df_t['Content'].apply(lambda content : generate_sentence_embedding(content))
    df_t['Content_embedding'] = df_t['Content_embedding'].apply(lambda x: norm.fit_transform([x])[0])
    return df_t

In [46]:
# encode_lables(df_small)
# embedding_sentences(df_small)

In [47]:
# embedding_sentences_Norm(df_small)
# df_small.head()

In [48]:
# encode_lables(df_big)
# embedding_sentences(df_big)
# Norm(df_big)
# df_big.head()

In [49]:
#df_small['Content_embedding'][1].shape

In [50]:
def plot_content_3d(df):
    X = np.array(df['Content_embedding'].tolist())
    y = np.array(df['Topic'].tolist())

    # Sử dụng t-SNE để giảm chiều dữ liệu từ không gian đa chiều xuống không gian 3D
    tsne = TSNE(n_components=3)
    embeddings = tsne.fit_transform(X)

    # Tạo biểu đồ 3D và phân biệt các lớp
    fig = plt.figure(figsize=(20, 10))
    ax = fig.add_subplot(111, projection='3d')

    # Với mỗi lớp, tô màu các điểm tương ứng trên biểu đồ
    for label in np.unique(y):
        indices = np.where(y == label)
        ax.scatter(embeddings[indices, 0], embeddings[indices, 1], embeddings[indices, 2], label=label)

    # Thêm chú thích cho các lớp
    ax.legend()

    # Hiển thị biểu đồ
    plt.show()

In [51]:
def plot_content_2d(df):
    X = np.array(df['Content_embedding'].tolist())
    y = np.array(df['Topic'].tolist())

    # Sử dụng t-SNE để giảm chiều dữ liệu từ không gian đa chiều xuống không gian 2D
    tsne = TSNE(n_components=2)
    embeddings = tsne.fit_transform(X)

    # Tạo biểu đồ 2D và phân biệt các lớp
    plt.figure(figsize=(20, 10))

    # Với mỗi lớp, tô màu các điểm tương ứng trên biểu đồ
    for label in np.unique(y):
        indices = np.where(y == label)
        plt.scatter(embeddings[indices, 0], embeddings[indices, 1], label=label)

    # Thêm chú thích cho các lớp
    plt.legend()

    # Hiển thị biểu đồ
    plt.show()

In [52]:
##plot_content_2d(df_small)

In [53]:
##plot_content_2d(df_big)

In [54]:
#plot_content_3d(df_small)

In [55]:
#plot_content_3d(df_big)

## Modeling

In [56]:
def get_F1(trained_model,X,y):

    predicted = trained_model.predict(X)
    # Calculate the F1 score for the predictions
    f1 = f1_score(y, predicted, average=None)
    # Return the F1 score
    return f1

In [57]:
def eveluation_model(model, X_test, y_test):
    
    #test the model with the test data
    y_pred = model.predict(X_test)
    #calculate the accuracy
    log_reg_accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: ', log_reg_accuracy,'\n')

    #calculate the F1 score
    f1_Score = get_F1(model,X_test,y_test)
    return pd.DataFrame(f1_Score, index= df_small['Topic'].unique(), columns=['F1 score'])

In [58]:
def training_model(model, df):
    X = df["Content_embedding"].values
    X = np.vstack(X)
    y = df["Topic"].values
    X_train,X_test,y_train,y_test = train_test_split(X , y, test_size = 0.3, random_state = 6, stratify = y)
    # cross_val_score(classifier, X_train, y_train, scoring = 'accuracy', cv =10).mean()
    model.fit(X_train, y_train)
    eveluation = eveluation_model(model, X_test ,y_test)
    return eveluation

In [59]:
def training_model_repeat_holdout(model, df):
    X = df["Content_embedding"].values
    X = np.vstack(X)
    y = df["Topic"].values
    acc = []
    f1 = []
    recall = []
    for i in range(10):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i,stratify=y)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc.append(accuracy_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred, average='macro'))
        recall.append(recall_score(y_test, y_pred, average='macro'))
    return np.mean(acc), np.mean(recall), np.mean(f1)


In [60]:
def training_model_cross_val(model, df):
    scoringopt = {'accuracy' : make_scorer(accuracy_score),
              'recall': make_scorer(recall_score, average='macro'),
              'f1_score': make_scorer(f1_score, average='macro')}
    list = []
    X = df["Content_embedding"].values
    X = np.vstack(X)
    y = df["Topic"].values
    cv_results = cross_validate(model, X, y, cv=5, scoring=scoringopt)
    return np.mean(cv_results['test_accuracy']), np.mean(cv_results['test_recall']), np.mean(cv_results['test_f1_score'])

In [61]:
model_SVC = SVC(kernel='linear', probability=True)
model_RandomForest = RandomForestClassifier(n_estimators=100, random_state=42)
data = [df_small]
norm = [embedding_sentences, embedding_sentences_Norm]
md = [model_SVC, model_RandomForest]
evaluation = [training_model_repeat_holdout, training_model_cross_val]
encode_lables(df_small)
encode_lables(df_big)
result_list = []
for i in data:
    for j in norm:
        df_t = j(i)
        for k in md:
            for l in evaluation:
                acc, recall, f1 = l(k,df_t)
                result_dict = {
                'Dataset':  type(i).__name__,
                'Normalize': j.__name__,
                'Model': type(k).__name__,
                'evaluation' : l.__name__,
                'accuracy' : acc,
                'recall' : recall,
                'f1' : f1
                }
                result_list.append(result_dict)
rs = pd.DataFrame(result_list)
rs

Unnamed: 0,Dataset,Normalize,Model,evaluation,accuracy,recall,f1
0,DataFrame,embedding_sentences,SVC,training_model_repeat_holdout,0.938257,0.918937,0.925011
1,DataFrame,embedding_sentences,SVC,training_model_cross_val,0.944727,0.924442,0.928273
2,DataFrame,embedding_sentences,RandomForestClassifier,training_model_repeat_holdout,0.935109,0.904388,0.922613
3,DataFrame,embedding_sentences,RandomForestClassifier,training_model_cross_val,0.952,0.92652,0.94328
4,DataFrame,embedding_sentences_Norm,SVC,training_model_repeat_holdout,0.844552,0.767694,0.79875
5,DataFrame,embedding_sentences_Norm,SVC,training_model_cross_val,0.864727,0.799657,0.829067
6,DataFrame,embedding_sentences_Norm,RandomForestClassifier,training_model_repeat_holdout,0.929782,0.89961,0.917705
7,DataFrame,embedding_sentences_Norm,RandomForestClassifier,training_model_cross_val,0.947636,0.923687,0.939737


In [62]:
# training_model_cross_val(SVC(kernel='linear', probability=True), df_small)

In [63]:
# training_model(SVC(kernel='linear', probability=True), df_big)

In [64]:
# training_model_cross_val(RandomForestClassifier(n_estimators=100, random_state=42), df_small)

In [65]:
# training_model(RandomForestClassifier(n_estimators=100, random_state=42), df_big)