# header

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

import numpy as np
import tensorflow as tf
import torch
import os
from tqdm import tqdm

2023-10-23 16:08:36.504737: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-23 16:08:36.901269: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# data conversion

In [2]:
file_paths = [
    "./VLSP2018-SA-train-dev-test/1-VLSP2018-SA-Hotel-train (7-3-2018).txt",
    "./VLSP2018-SA-train-dev-test/2-VLSP2018-SA-Hotel-dev (7-3-2018).txt",
    "./VLSP2018-SA-train-dev-test/3-VLSP2018-SA-Hotel-test (8-3-2018).txt"
]

file_types = ['train', 'dev', 'test']

In [3]:
def create_sentiment_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        line_number = 1
        data_list = []

        text = None
        aspects = []
        polarities = []

        for line in tqdm(file):
            if line == '\n':
                if text and aspects:
                    for aspect, polarity in zip(aspects, polarities):
                        data_list.append({'text': text, 'aspect_cat': aspect, 'polarity': polarity})
                text = None
                aspects = []
                polarities = []
                line_number = 1
                continue

            if line_number % 2 == 0:
                text = line.strip()

            if line_number % 3 == 0:
                cat_sen = re.findall(r'{(.*?), (.*?)}', line)
                for cat, sen in cat_sen:
                    aspects.append(cat.strip())
                    polarity = sen.strip()
                    polarities.append(polarity)

            line_number += 1
        
        # squeeze out the last data
        if text and aspects:
            for aspect, polarity in zip(aspects, polarities):
                data_list.append({'text': text, 'aspect_cat': aspect, 'polarity': polarity})


        df = pd.DataFrame(data_list)
        df.fillna(0, inplace=True)
        return df

In [1]:
column_names = [
'FACILITIES#CLEANLINESS',
 'FACILITIES#COMFORT',
 'FACILITIES#DESIGN&FEATURES',
 'FACILITIES#GENERAL',
 'FACILITIES#MISCELLANEOUS',
 'FACILITIES#PRICES',
 'FACILITIES#QUALITY',
 'FOOD&DRINKS#MISCELLANEOUS',
 'FOOD&DRINKS#PRICES',
 'FOOD&DRINKS#QUALITY',
 'FOOD&DRINKS#STYLE&OPTIONS',
 'HOTEL#CLEANLINESS',
 'HOTEL#COMFORT',
 'HOTEL#DESIGN&FEATURES',
 'HOTEL#GENERAL',
 'HOTEL#MISCELLANEOUS',
 'HOTEL#PRICES',
 'HOTEL#QUALITY',
 'LOCATION#GENERAL',
 'ROOMS#CLEANLINESS',
 'ROOMS#COMFORT',
 'ROOMS#DESIGN&FEATURES',
 'ROOMS#GENERAL',
 'ROOMS#MISCELLANEOUS',
 'ROOMS#PRICES',
 'ROOMS#QUALITY',
 'ROOM_AMENITIES#CLEANLINESS',
 'ROOM_AMENITIES#COMFORT',
 'ROOM_AMENITIES#DESIGN&FEATURES',
 'ROOM_AMENITIES#GENERAL',
 'ROOM_AMENITIES#MISCELLANEOUS',
 'ROOM_AMENITIES#PRICES',
 'ROOM_AMENITIES#QUALITY',
 'SERVICE#GENERAL']

In [2]:
len(column_names)

34

In [4]:
for value in column_names:
    print("{{{value}, positive}}".format(value=value), end=' ')


{FACILITIES#CLEANLINESS, positive} {FACILITIES#COMFORT, positive} {FACILITIES#DESIGN&FEATURES, positive} {FACILITIES#GENERAL, positive} {FACILITIES#MISCELLANEOUS, positive} {FACILITIES#PRICES, positive} {FACILITIES#QUALITY, positive} {FOOD&DRINKS#MISCELLANEOUS, positive} {FOOD&DRINKS#PRICES, positive} {FOOD&DRINKS#QUALITY, positive} {FOOD&DRINKS#STYLE&OPTIONS, positive} {HOTEL#CLEANLINESS, positive} {HOTEL#COMFORT, positive} {HOTEL#DESIGN&FEATURES, positive} {HOTEL#GENERAL, positive} {HOTEL#MISCELLANEOUS, positive} {HOTEL#PRICES, positive} {HOTEL#QUALITY, positive} {LOCATION#GENERAL, positive} {ROOMS#CLEANLINESS, positive} {ROOMS#COMFORT, positive} {ROOMS#DESIGN&FEATURES, positive} {ROOMS#GENERAL, positive} {ROOMS#MISCELLANEOUS, positive} {ROOMS#PRICES, positive} {ROOMS#QUALITY, positive} {ROOM_AMENITIES#CLEANLINESS, positive} {ROOM_AMENITIES#COMFORT, positive} {ROOM_AMENITIES#DESIGN&FEATURES, positive} {ROOM_AMENITIES#GENERAL, positive} {ROOM_AMENITIES#MISCELLANEOUS, positive} {ROOM_A

In [6]:
column_dict = {column: 0.0 for column in column_names}
column_dict

{'FACILITIES#CLEANLINESS': 0.0,
 'FACILITIES#COMFORT': 0.0,
 'FACILITIES#DESIGN&FEATURES': 0.0,
 'FACILITIES#GENERAL': 0.0,
 'FACILITIES#MISCELLANEOUS': 0.0,
 'FACILITIES#PRICES': 0.0,
 'FACILITIES#QUALITY': 0.0,
 'FOOD&DRINKS#MISCELLANEOUS': 0.0,
 'FOOD&DRINKS#PRICES': 0.0,
 'FOOD&DRINKS#QUALITY': 0.0,
 'FOOD&DRINKS#STYLE&OPTIONS': 0.0,
 'HOTEL#CLEANLINESS': 0.0,
 'HOTEL#COMFORT': 0.0,
 'HOTEL#DESIGN&FEATURES': 0.0,
 'HOTEL#GENERAL': 0.0,
 'HOTEL#MISCELLANEOUS': 0.0,
 'HOTEL#PRICES': 0.0,
 'HOTEL#QUALITY': 0.0,
 'LOCATION#GENERAL': 0.0,
 'ROOMS#CLEANLINESS': 0.0,
 'ROOMS#COMFORT': 0.0,
 'ROOMS#DESIGN&FEATURES': 0.0,
 'ROOMS#GENERAL': 0.0,
 'ROOMS#MISCELLANEOUS': 0.0,
 'ROOMS#PRICES': 0.0,
 'ROOMS#QUALITY': 0.0,
 'ROOM_AMENITIES#CLEANLINESS': 0.0,
 'ROOM_AMENITIES#COMFORT': 0.0,
 'ROOM_AMENITIES#DESIGN&FEATURES': 0.0,
 'ROOM_AMENITIES#GENERAL': 0.0,
 'ROOM_AMENITIES#MISCELLANEOUS': 0.0,
 'ROOM_AMENITIES#PRICES': 0.0,
 'ROOM_AMENITIES#QUALITY': 0.0,
 'SERVICE#GENERAL': 0.0}

In [7]:
def create_aspect_data(file_path):
    from collections import OrderedDict
    
    with open(file_path, 'r', encoding='utf-8') as file:
        text = None
        # copy dict
        dup = dict(column_dict)
        flag = True
        data_list = []
        for line in tqdm(file):

            line = line.strip()
            
            if line.startswith('#'):
                continue
            elif line.startswith('{'):
                cat_sen = re.findall(r'{(.*?), (.*?)}', line)
                for cat, sen in cat_sen:
                    if sen.strip() == "negative":
                        dup[cat] = 1.0
                    elif sen.strip() == "neutral":
                        dup[cat] = 1.0
                    elif sen.strip() == "positive":
                        dup[cat] = 1.0
                    else:
                        dup[cat] = 0
                flag = False
            else:
                text = line
            
            if text is not None and not flag:
                tmp_dict = OrderedDict([('text', text)] + list(dup.items()))
                data_list.append(tmp_dict)
                dup = dict(column_dict)
                tmp_dict = None
                text = None 
                flag = True
                
    df = pd.DataFrame(data_list)
    return df

In [8]:
if not os.path.exists("sentiment_data"):
    os.mkdir("sentiment_data")
for idx, file_path in enumerate(file_paths):
    df = create_sentiment_data(file_path)
    df.to_csv('./sentiment_data/{}_hotel_reviews.csv'.format(file_types[idx]), index=False)

12003it [00:00, 763138.26it/s]
7999it [00:00, 1123697.55it/s]
2399it [00:00, 956294.93it/s]


In [9]:
if not os.path.exists("aspect_data"):
    os.mkdir("aspect_data")
for idx, file_path in enumerate(file_paths):
    df = create_aspect_data(file_path)
    df.to_csv('./aspect_data/{}_hotel_reviews.csv'.format(file_types[idx]), index=False)

12003it [00:00, 530184.83it/s]
7999it [00:00, 609505.64it/s]
2399it [00:00, 567520.32it/s]


In [10]:
tf.config.list_physical_devices('GPU')

2023-10-23 09:54:00.021814: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-23 09:54:00.022535: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-23 09:54:00.022610: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

# load datasets

In [5]:
df_train = pd.read_csv('./sentiment_data/train_hotel_reviews.csv')
df_val = pd.read_csv('./sentiment_data/dev_hotel_reviews.csv')
df_test = pd.read_csv('./sentiment_data/test_hotel_reviews.csv')

df_aspect_train = pd.read_csv('./aspect_data/train_hotel_reviews.csv')
df_aspect_val = pd.read_csv('./aspect_data/dev_hotel_reviews.csv')
df_aspect_test = pd.read_csv('./aspect_data/test_hotel_reviews.csv')

df_train.tail()

Unnamed: 0,text,aspect_cat,polarity
13949,"Bữa sáng không thay đổi, không có TV, không dọ...",FOOD&DRINKS#STYLE&OPTIONS,negative
13950,"Bữa sáng không thay đổi, không có TV, không dọ...",ROOM_AMENITIES#DESIGN&FEATURES,negative
13951,"Bữa sáng không thay đổi, không có TV, không dọ...",SERVICE#GENERAL,negative
13952,"Bữa sáng không thay đổi, không có TV, không dọ...",ROOMS#CLEANLINESS,negative
13953,"Bữa sáng không thay đổi, không có TV, không dọ...",ROOM_AMENITIES#PRICES,negative


In [6]:
df_aspect_train.head()

Unnamed: 0,text,FACILITIES#CLEANLINESS,FACILITIES#COMFORT,FACILITIES#DESIGN&FEATURES,FACILITIES#GENERAL,FACILITIES#MISCELLANEOUS,FACILITIES#PRICES,FACILITIES#QUALITY,FOOD&DRINKS#MISCELLANEOUS,FOOD&DRINKS#PRICES,...,ROOMS#PRICES,ROOMS#QUALITY,ROOM_AMENITIES#CLEANLINESS,ROOM_AMENITIES#COMFORT,ROOM_AMENITIES#DESIGN&FEATURES,ROOM_AMENITIES#GENERAL,ROOM_AMENITIES#MISCELLANEOUS,ROOM_AMENITIES#PRICES,ROOM_AMENITIES#QUALITY,SERVICE#GENERAL
0,Rộng rãi KS mới nhưng rất vắng. Các dịch vụ ch...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Địa điểm thuận tiện, trong vòng bán kính 1,5km...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Phục vụ, view đẹp, vị trí",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,"thuận tiện , sạch sẽ , vui vẻ hài lòng",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,Vị trí đẹp; Có quán bar view đẹp; Nhân viên th...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
df_train.describe()

Unnamed: 0,text,aspect_cat,polarity
count,13954,13954,13954
unique,2949,34,3
top,Gia đình tôi rất hài lòng khi ở tại khách sạn....,SERVICE#GENERAL,positive
freq,44,1913,10229


In [14]:
df_train.text.nunique(), df_train.aspect_cat.nunique(), df_train.polarity.nunique()

(2949, 34, 3)

In [9]:
df_train.aspect_cat.value_counts()

aspect_cat
SERVICE#GENERAL                   1913
HOTEL#GENERAL                     1282
LOCATION#GENERAL                  1196
HOTEL#COMFORT                     1139
ROOMS#DESIGN&FEATURES              916
HOTEL#DESIGN&FEATURES              877
FOOD&DRINKS#QUALITY                672
ROOMS#CLEANLINESS                  659
FOOD&DRINKS#STYLE&OPTIONS          570
FACILITIES#DESIGN&FEATURES         525
HOTEL#PRICES                       496
ROOMS#COMFORT                      434
ROOM_AMENITIES#DESIGN&FEATURES     355
HOTEL#CLEANLINESS                  348
ROOMS#GENERAL                      256
HOTEL#QUALITY                      246
ROOM_AMENITIES#QUALITY             239
FACILITIES#GENERAL                 219
ROOM_AMENITIES#GENERAL             216
FACILITIES#QUALITY                 208
ROOMS#PRICES                       196
FACILITIES#CLEANLINESS             172
ROOMS#QUALITY                      164
FACILITIES#COMFORT                 132
FOOD&DRINKS#PRICES                 118
HOTEL#MISCELLA

In [15]:
df_train.polarity.value_counts()

polarity
positive    10229
negative     3155
neutral       570
Name: count, dtype: int64

In [16]:
df_test.text.iloc[1584]

'Nội thất phòng đầy đủ và được chuẩn bị tốt. Có dọn phòng hàng ngày. Nhân viên dọn dẹp và lễ tân nhiệt tình. Vị trí gần biển. Phòng hơi nhỏ. Khách sạn có tầm nhìn ra không được đẹp lắm. Tiền thuê xe do khách san liên hệ hơi mắc và không có nhiều xe để chọn. Nước chảy yếu ở vòi sen. Buổi sáng hơi ít món.'

# preprocessing

In [17]:
"hiếu" == "hiếu", "hiếu" == "hiêú"

(True, False)

In [18]:
# source code: https://colab.research.google.com/github/nguyenvanhieuvn/text-classification-tutorial/blob/master/text_classification_tutorial.ipynb#scrollTo=Koy7eu1dMwxn
import regex as re

uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"
unsignChars = "aaaaaaaaaaaaaaaaaeeeeeeeeeeediiiiiooooooooooooooooouuuuuuuuuuuyyyyyAAAAAAAAAAAAAAAAAEEEEEEEEEEEDIIIOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUYYYYYAADOOU"

def loaddicchar():
    dic = {}
    char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split(
        '|')
    charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split(
        '|')
    for i in range(len(char1252)):
        dic[char1252[i]] = charutf8[i]
    return dic
dicchar = loaddicchar()

# Hàm chuyển Unicode dựng sẵn về Unicde tổ hợp (phổ biến hơn)
def convert_unicode(txt):
    return re.sub(
        r'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ',
        lambda x: dicchar[x.group()], txt)

bang_nguyen_am = [['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
                  ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
                  ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
                  ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
                  ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
                  ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
                  ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
                  ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
                  ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
                  ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
                  ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
                  ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y']]
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']

nguyen_am_to_ids = {}

for i in range(len(bang_nguyen_am)):
    for j in range(len(bang_nguyen_am[i]) - 1):
        nguyen_am_to_ids[bang_nguyen_am[i][j]] = (i, j)

def chuan_hoa_dau_tu_tieng_viet(word):
    if not is_valid_vietnam_word(word):
        return word

    chars = list(word)
    dau_cau = 0
    nguyen_am_index = []
    qu_or_gi = False
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids.get(char, (-1, -1))
        if x == -1:
            continue
        elif x == 9:  # check qu
            if index != 0 and chars[index - 1] == 'q':
                chars[index] = 'u'
                qu_or_gi = True
        elif x == 5:  # check gi
            if index != 0 and chars[index - 1] == 'g':
                chars[index] = 'i'
                qu_or_gi = True
        if y != 0:
            dau_cau = y
            chars[index] = bang_nguyen_am[x][0]
        if not qu_or_gi or index != 1:
            nguyen_am_index.append(index)
    if len(nguyen_am_index) < 2:
        if qu_or_gi:
            if len(chars) == 2:
                x, y = nguyen_am_to_ids.get(chars[1])
                chars[1] = bang_nguyen_am[x][dau_cau]
            else:
                x, y = nguyen_am_to_ids.get(chars[2], (-1, -1))
                if x != -1:
                    chars[2] = bang_nguyen_am[x][dau_cau]
                else:
                    chars[1] = bang_nguyen_am[5][dau_cau] if chars[1] == 'i' else bang_nguyen_am[9][dau_cau]
            return ''.join(chars)
        return word

    for index in nguyen_am_index:
        x, y = nguyen_am_to_ids[chars[index]]
        if x == 4 or x == 8:  # ê, ơ
            chars[index] = bang_nguyen_am[x][dau_cau]
            # for index2 in nguyen_am_index:
            #     if index2 != index:
            #         x, y = nguyen_am_to_ids[chars[index]]
            #         chars[index2] = bang_nguyen_am[x][0]
            return ''.join(chars)

    if len(nguyen_am_index) == 2:
        if nguyen_am_index[-1] == len(chars) - 1:
            x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
            chars[nguyen_am_index[0]] = bang_nguyen_am[x][dau_cau]
            # x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
            # chars[nguyen_am_index[1]] = bang_nguyen_am[x][0]
        else:
            # x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
            # chars[nguyen_am_index[0]] = bang_nguyen_am[x][0]
            x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
            chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
    else:
        # x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
        # chars[nguyen_am_index[0]] = bang_nguyen_am[x][0]
        x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
        chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
        # x, y = nguyen_am_to_ids[chars[nguyen_am_index[2]]]
        # chars[nguyen_am_index[2]] = bang_nguyen_am[x][0]
    return ''.join(chars)


def is_valid_vietnam_word(word):
    chars = list(word)
    nguyen_am_index = -1
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids.get(char, (-1, -1))
        if x != -1:
            if nguyen_am_index == -1:
                nguyen_am_index = index
            else:
                if index - nguyen_am_index != 1:
                    return False
                nguyen_am_index = index
    return True


def chuan_hoa_dau_cau_tieng_viet(sentence):
    """
        Chuyển câu tiếng việt về chuẩn gõ dấu kiểu cũ.
        :param sentence:
        :return:
        """
    sentence = sentence.lower()
    words = sentence.split()
    for index, word in enumerate(words):
        cw = re.sub(r'(^\p{P}*)([p{L}.]*\p{L}+)(\p{P}*$)', r'\1/\2/\3', word).split('/')
        # print(cw)
        if len(cw) == 3:
            cw[1] = chuan_hoa_dau_tu_tieng_viet(cw[1])
        words[index] = ''.join(cw)
    return ' '.join(words)

def remove_html(txt):
    return re.sub(r'<[^>]*>', '', txt)

In [19]:
!wget https://gist.githubusercontent.com/nguyenvanhieuvn/7d9441c10b3c2739499fc5a4d9ea06fb/raw/df939245b3e841b62af115be4dcb3516dadc9fc5/teencode.txt

--2023-10-23 09:54:00--  https://gist.githubusercontent.com/nguyenvanhieuvn/7d9441c10b3c2739499fc5a4d9ea06fb/raw/df939245b3e841b62af115be4dcb3516dadc9fc5/teencode.txt
Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5656 (5,5K) [text/plain]
Saving to: ‘teencode.txt’


2023-10-23 09:54:00 (6,58 MB/s) - ‘teencode.txt’ saved [5656/5656]



In [20]:
import emoji
replace_list = {
    'ô kêi': 'ok', 'okie': 'ok', 'o kê': 'ok', 'okey': 'ok', 'ôkê': 'ok', 'oki': 'ok', 'oke': 'ok', 'okay': 'ok', 'okê': 'ok',
    'tks': 'cảm ơn', 'thks': 'cảm ơn', 'thanks': 'cảm ơn', 'ths': 'cảm ơn', 'thank': 'cảm ơn',
    'kg': 'không', 'not': 'không', 'k': 'không', 'kh': 'không', 'kô': 'không', 'hok': 'không', 'ko': 'không', 'khong': 'không', 'kp': 'không phải',
    'he he': 'tích cực', 'hehe': 'tích cực', 'hihi': 'tích cực', 'haha': 'tích cực', 'hjhj': 'tích cực', 'thick': 'tích cực',
    'lol': 'tiêu cực', 'cc': 'tiêu cực', 'huhu': 'tiêu cực', 'cute': 'dễ thương',
     
    'sz': 'cỡ', 'size': 'cỡ', 
    'wa': 'quá', 'wá': 'quá', 'qá': 'quá', 
    'đx': 'được', 'dk': 'được', 'dc': 'được', 'đk': 'được', 'đc': 'được', 
    'vs': 'với', 'j': 'gì', '“': ' ', 'time': 'thời gian', 'm': 'mình', 'mik': 'mình', 'r': 'rồi', 'bjo': 'bao giờ', 'very': 'rất',

    'authentic': 'chuẩn chính hãng', 'aut': 'chuẩn chính hãng', 'auth': 'chuẩn chính hãng', 'date': 'hạn sử dụng', 'hsd': 'hạn sử dụng', 
    'store': 'cửa hàng', 'sop': 'cửa hàng', 'shopE': 'cửa hàng', 'shop': 'cửa hàng', 
    'sp': 'sản phẩm', 'product': 'sản phẩm', 'hàg': 'hàng', 
    'ship': 'giao hàng', 'delivery': 'giao hàng', 'síp': 'giao hàng', 'order': 'đặt hàng',

    'gud': 'tốt', 'wel done': 'tốt', 'good': 'tốt', 'gút': 'tốt', 'tot': 'tốt', 'nice': 'tốt', 'perfect': 'rất tốt', 
    'quality': 'chất lượng', 'chất lg': 'chất lượng', 'chat': 'chất', 'excelent': 'hoàn hảo', 'bt': 'bình thường',
    'sad': 'tệ', 'por': 'tệ', 'poor': 'tệ', 'bad': 'tệ', 
    'beautiful': 'đẹp tuyệt vời', 'dep': 'đẹp', 
    'xau': 'xấu', 'sấu': 'xấu', 
     
    'thik': 'thích', 'iu': 'yêu', 'fake': 'giả mạo', 
    'quickly': 'nhanh', 'quick': 'nhanh', 'fast': 'nhanh',
    'fresh': 'tươi', 'delicious': 'ngon',

    'dt': 'điện thoại', 'fb': 'facebook', 'face': 'facebook', 'ks': 'khách sạn', 'nv': 'nhân viên',
    'nt': 'nhắn tin', 'ib': 'nhắn tin', 'tl': 'trả lời', 'trl': 'trả lời', 'rep': 'trả lời',
    'fback': 'feedback', 'fedback': 'feedback',
    'sd': 'sử dụng', 'sài': 'xài', 

    '^_^': 'tích cực', ':)': 'tích cực', ':(': 'tiêu cực',
    '❤️': 'tích cực', '👍': 'tích cực', '🎉': 'tích cực', '😀': 'tích cực', '😍': 'tích cực', '😂': 'tích cực', '🤗': 'tích cực', '😙': 'tích cực', '🙂': 'tích cực', 
    '😔': 'tiêu cực', '😓': 'tiêu cực', 
    '⭐': 'star', '*': 'star', '🌟': 'star',
}

with open('teencode.txt', encoding='utf-8') as f:
    for pair in f.readlines():
        key, value = pair.split('\t')
        replace_list[key] = value.strip()


def normalize_acronyms(text):
    words = []
    for word in text.strip().split():
        # word = word.strip(string.punctuation)
        if word.lower() not in replace_list.keys(): words.append(word)
        else: words.append(replace_list[word.lower()])
    return emoji.demojize(' '.join(words)) # Remove Emojis


In [21]:
sample = "Nhân viên nhiệt tình Phòng sạch sẽ Nếu có dịp sẽ quay lại 👍👍"

In [22]:
def insert_space_between_emojis(sentence):
    result = []
    for i in range(len(sentence) - 1):
        result.append(sentence[i])
        if emoji.emoji_count(sentence[i:i+2]) == 2:
            result.append(' ')
    result.append(sentence[-1])
    return ''.join(result)

In [23]:
normalize_acronyms(sample)

'Nhân viên nhiệt tình Phòng sạch sẽ Nếu có dịp sẽ quay lại :thumbs_up::thumbs_up:'

In [24]:
normalize_acronyms(insert_space_between_emojis(sample))

'Nhân viên nhiệt tình Phòng sạch sẽ Nếu có dịp sẽ quay lại tích cực tích cực'

In [25]:
normalize_acronyms(insert_space_between_emojis("Nhân viên nhiệt tình Phòng sạch sẽ Nếu có dịp sẽ quay lại👍👍"))

'Nhân viên nhiệt tình Phòng sạch sẽ Nếu có dịp sẽ quay lại:thumbs_up: tích cực'

In [26]:
# https://github.com/trungtv/pyvi
from pyvi import ViTokenizer, ViPosTagger

In [27]:
!wget https://raw.githubusercontent.com/stopwords/vietnamese-stopwords/master/vietnamese-stopwords-dash.txt

--2023-10-23 09:54:00--  https://raw.githubusercontent.com/stopwords/vietnamese-stopwords/master/vietnamese-stopwords-dash.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20475 (20K) [text/plain]
Saving to: ‘vietnamese-stopwords-dash.txt’


2023-10-23 09:54:01 (3,57 MB/s) - ‘vietnamese-stopwords-dash.txt’ saved [20475/20475]



In [28]:
def remove_stop_words(text):
    with open('./vietnamese-stopwords-dash.txt', encoding='utf-8') as f:
        stopwords = set([w.strip()for w in f])
        
    words = text.split()
    filtered_words = [w for w in words if w not in stopwords]
    
    return ' '.join(filtered_words)

In [29]:
def text_preprocess(document):
    document = normalize_acronyms(insert_space_between_emojis(document))
    # xóa html code
    document = remove_html(document)
    # chuẩn hóa unicode
    document = convert_unicode(document)
    # chuẩn hóa cách gõ dấu tiếng Việt
    document = chuan_hoa_dau_cau_tieng_viet(document)
    # tách từ
    document = ViTokenizer.tokenize(document)
    # đưa về lower
    document = document.lower()
    # xóa các ký tự không cần thiết
    document = re.sub(r'[^\s\wáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ_]',' ',document)
    # xóa khoảng trắng thừa
    document = re.sub(r'\s+', ' ', document).strip()
    return document

In [30]:
sample = text_preprocess(sample)
sample

'nhân_viên nhiệt_tình phòng sạch_sẽ nếu có dịp sẽ quay lại tích_cực tích_cực'

In [31]:
tqdm.pandas()
df_train['cleaned_text'] = df_train.text.progress_apply(text_preprocess)
df_val['cleaned_text'] = df_val.text.progress_apply(text_preprocess)
df_test['cleaned_text'] = df_test.text.progress_apply(text_preprocess)

100%|██████████| 13954/13954 [00:17<00:00, 806.46it/s] 
100%|██████████| 7111/7111 [00:05<00:00, 1390.05it/s]
100%|██████████| 2584/2584 [00:02<00:00, 1211.92it/s]


In [32]:
tqdm.pandas()
df_aspect_train['cleaned_text'] = df_aspect_train.text.progress_apply(text_preprocess)
df_aspect_val['cleaned_text'] = df_aspect_val.text.progress_apply(text_preprocess)
df_aspect_test['cleaned_text'] = df_aspect_test.text.progress_apply(text_preprocess)

100%|██████████| 3001/3001 [00:03<00:00, 971.81it/s] 
100%|██████████| 2000/2000 [00:01<00:00, 1837.31it/s]
100%|██████████| 600/600 [00:00<00:00, 1475.54it/s]


In [33]:
df_train.cleaned_text.iloc[13]

'co view huong ho tay sach se nhan vien tan tinh'

# aspect term extraction

In [34]:
def extract_aspect_terms(text):
    text = re.sub(r'[^\w\s]', '', text)
    doc = ViPosTagger.postagging(text)
    aspect_terms = []
    #N - Common noun Nc - Noun Classifier Ny - Noun abbreviation Nu - Unit noun Np - Proper noun X - Unknown
    aspect_pos_patterns = ["N", "Np", "Nc", "Ny", "X", "V", "A"]
    
    for idx, pos_tag in enumerate(doc[1]):
        if pos_tag in aspect_pos_patterns:
            aspect_terms.append(doc[0][idx])
            
    return ' '.join(aspect_terms)

In [35]:
extract_aspect_terms(sample)

'nhân_viên nhiệt_tình phòng sạch_sẽ có dịp quay tích_cực tích_cực'

In [36]:
tqdm.pandas()
df_aspect_train['aspect_term'] = df_aspect_train.cleaned_text.progress_apply(extract_aspect_terms)
df_aspect_val['aspect_term'] = df_aspect_val.cleaned_text.progress_apply(extract_aspect_terms)
df_aspect_test['aspect_term'] = df_aspect_test.cleaned_text.progress_apply(extract_aspect_terms)

100%|██████████| 3001/3001 [00:00<00:00, 3237.33it/s]
100%|██████████| 2000/2000 [00:00<00:00, 6645.48it/s]
100%|██████████| 600/600 [00:00<00:00, 5129.85it/s]


# aspect category detection

In [37]:
text = "Phòng đẹp , sạch sẽ và gần biển . Nhân viên dễ thương . Phòng không có view biển như lúc đặt , Tv hay mất cáp ."
correct_aspect_cat = "ROOMS#DESIGN&FEATURES, ROOMS#CLEANLINESS, LOCATION#GENERAL, SERVICE#GENERAL, ROOMS#GENERAL, ROOM_AMENITIES#DESIGN&FEATURES"

In [38]:
def print_column_names(binary_list, column_names):
    for i, value in enumerate(binary_list):
        if value == 1:
            print(column_names[i])

In [39]:
import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from scipy.sparse import hstack
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

In [40]:
from sklearn.metrics import hamming_loss, accuracy_score, precision_score, recall_score, f1_score

def multi_label_metrics(true_labels, predicted_labels):
    # Calculate Hamming loss
    hamming_loss_value = hamming_loss(true_labels, predicted_labels)

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predicted_labels)

    # Calculate precision, recall, and F1-score using micro and macro averaging
    precision_micro = precision_score(true_labels, predicted_labels, average='micro')
    recall_micro = recall_score(true_labels, predicted_labels, average='micro')
    f1_micro = f1_score(true_labels, predicted_labels, average='micro')

    precision_macro = precision_score(true_labels, predicted_labels, average='macro')
    recall_macro = recall_score(true_labels, predicted_labels, average='macro')
    f1_macro = f1_score(true_labels, predicted_labels, average='macro')

    metrics = {
        "Hamming Loss": hamming_loss_value,
        "Accuracy": accuracy,
        "Precision (Micro)": precision_micro,
        "Recall (Micro)": recall_micro,
        "F1 Score (Micro)": f1_micro,
        "Precision (Macro)": precision_macro,
        "Recall (Macro)": recall_macro,
        "F1 Score (Macro)": f1_macro
    }

    return metrics

## base

In [41]:
df_aspect_train_base = df_aspect_train.drop(columns=['text', 'cleaned_text'])
df_aspect_val_base = df_aspect_val.drop(columns=['text', 'cleaned_text'])
df_aspect_test_base = df_aspect_test.drop(columns=['text', 'cleaned_text'])

In [42]:
df_aspect_train_base.head()

Unnamed: 0,FACILITIES#CLEANLINESS,FACILITIES#COMFORT,FACILITIES#DESIGN&FEATURES,FACILITIES#GENERAL,FACILITIES#MISCELLANEOUS,FACILITIES#PRICES,FACILITIES#QUALITY,FOOD&DRINKS#MISCELLANEOUS,FOOD&DRINKS#PRICES,FOOD&DRINKS#QUALITY,...,ROOMS#QUALITY,ROOM_AMENITIES#CLEANLINESS,ROOM_AMENITIES#COMFORT,ROOM_AMENITIES#DESIGN&FEATURES,ROOM_AMENITIES#GENERAL,ROOM_AMENITIES#MISCELLANEOUS,ROOM_AMENITIES#PRICES,ROOM_AMENITIES#QUALITY,SERVICE#GENERAL,aspect_term
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rộng_rãi khách_sạn vắng dịch_vụ chất_lượng cao...
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,địa_điểm thuận_tiện vòng bán_kính nhiều quán ă...
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,phục_vụ view đẹp vị_trí
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,thuận_tiện sạch_sẽ vui_vẻ hài_lòng
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,vị_trí đẹp có quán bar view đẹp nhân_viên thân...


In [43]:
x_aspect_train_base = df_aspect_train_base.aspect_term
y_aspect_train = df_aspect_train_base.drop(columns=['aspect_term'])

x_aspect_val_base = df_aspect_val_base.aspect_term
y_aspect_val = df_aspect_val_base.drop(columns=['aspect_term'])

x_aspect_test_base = df_aspect_test_base.aspect_term
y_aspect_test = df_aspect_test_base.drop(columns=['aspect_term'])

x_aspect_train_base.shape, y_aspect_train.shape, x_aspect_val_base.shape, y_aspect_val.shape, x_aspect_test_base.shape, y_aspect_test.shape

((3001,), (3001, 34), (2000,), (2000, 34), (600,), (600, 34))

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
vectorizer_base = TfidfVectorizer()
x_aspect_train_vect = vectorizer_base.fit_transform(x_aspect_train_base)
x_aspect_val_vect = vectorizer_base.transform(x_aspect_val_base)
x_aspect_test_vect = vectorizer_base.transform(x_aspect_test_base)
test_text = vectorizer_base.transform([extract_aspect_terms(text_preprocess(text))])

x_aspect_train_vect.shape

(3001, 5560)

In [46]:
clf_base = MultiOutputClassifier(LogisticRegression()).fit(x_aspect_train_vect, y_aspect_train)

In [47]:
train_res = multi_label_metrics(y_aspect_train, clf_base.predict(x_aspect_train_vect))
val_res = multi_label_metrics(y_aspect_val, clf_base.predict(x_aspect_val_vect))
test_res = multi_label_metrics(y_aspect_test, clf_base.predict(x_aspect_test_vect))

print("train")
for metric, value in train_res.items():
    print(f"{metric}: {value}")

print("\n\n validation")
for metric, value in val_res.items():
    print(f"{metric}: {value}")

train
Hamming Loss: 0.06738930160534724
Accuracy: 0.157947350883039
Precision (Micro): 0.9303174024078803
Recall (Micro): 0.5482691894216297
F1 Score (Micro): 0.6899350649350648
Precision (Macro): 0.6749370864842208
Recall (Macro): 0.2508173063350151
F1 Score (Macro): 0.3162240548105005


 validation
Hamming Loss: 0.06551470588235295
Accuracy: 0.1405
Precision (Micro): 0.828875681030213
Recall (Micro): 0.4706792293629588
F1 Score (Micro): 0.6004125930576734
Precision (Macro): 0.4356671256421353
Recall (Macro): 0.19348742609514696
F1 Score (Macro): 0.23804076933121465


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [48]:
print("\n test") 
for metric, value in test_res.items():
    print(f"{metric}: {value}")


 test
Hamming Loss: 0.07916666666666666
Accuracy: 0.08
Precision (Micro): 0.8419195483415667
Recall (Micro): 0.461687306501548
F1 Score (Micro): 0.5963509122719319
Precision (Macro): 0.4958716621846244
Recall (Macro): 0.20046229614435584
F1 Score (Macro): 0.2478286742413716


In [49]:
print_column_names(clf_base.predict(test_text)[0], column_names)
print("\n", correct_aspect_cat)

LOCATION#GENERAL
ROOMS#CLEANLINESS
ROOMS#DESIGN&FEATURES
SERVICE#GENERAL

 ROOMS#DESIGN&FEATURES, ROOMS#CLEANLINESS, LOCATION#GENERAL, SERVICE#GENERAL, ROOMS#GENERAL, ROOM_AMENITIES#DESIGN&FEATURES


In [50]:
from sklearn.metrics import classification_report

print(classification_report(y_aspect_test, clf_base.predict(x_aspect_test_vect), target_names=column_names))

                                precision    recall  f1-score   support

        FACILITIES#CLEANLINESS       1.00      0.20      0.33         5
            FACILITIES#COMFORT       0.00      0.00      0.00        26
    FACILITIES#DESIGN&FEATURES       1.00      0.11      0.19        65
            FACILITIES#GENERAL       0.00      0.00      0.00        21
      FACILITIES#MISCELLANEOUS       0.00      0.00      0.00         8
             FACILITIES#PRICES       0.00      0.00      0.00        13
            FACILITIES#QUALITY       1.00      0.08      0.15        51
     FOOD&DRINKS#MISCELLANEOUS       0.00      0.00      0.00         3
            FOOD&DRINKS#PRICES       0.00      0.00      0.00         9
           FOOD&DRINKS#QUALITY       0.83      0.74      0.78       129
     FOOD&DRINKS#STYLE&OPTIONS       0.80      0.59      0.68       124
             HOTEL#CLEANLINESS       0.92      0.16      0.28        67
                 HOTEL#COMFORT       0.64      0.41      0.50  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## test 1: use cleaned text instead of aspect term

In [51]:
df_aspect_train_test1 = df_aspect_train.drop(columns=['text', 'aspect_term'])
df_aspect_val_test1 = df_aspect_val.drop(columns=['text', 'aspect_term'])
df_aspect_test_test1 = df_aspect_test.drop(columns=['text', 'aspect_term'])

In [52]:
x_aspect_train_test1 = df_aspect_train_test1.cleaned_text

x_aspect_val_test1 = df_aspect_val_test1.cleaned_text

x_aspect_test_test1 = df_aspect_test_test1.cleaned_text

x_aspect_train_test1.shape, y_aspect_train.shape, x_aspect_val_test1.shape, y_aspect_val.shape, x_aspect_test_test1.shape, y_aspect_test.shape

((3001,), (3001, 34), (2000,), (2000, 34), (600,), (600, 34))

In [53]:
vectorizer_test1 = TfidfVectorizer()
x_aspect_train_vect = vectorizer_test1.fit_transform(x_aspect_train_test1)
x_aspect_val_vect = vectorizer_test1.transform(x_aspect_val_test1)
x_aspect_test_vect = vectorizer_test1.transform(x_aspect_test_test1)
test_text = vectorizer_test1.transform([text_preprocess(text)])

x_aspect_train_vect.shape

(3001, 5973)

In [54]:
clf_test1 = MultiOutputClassifier(LogisticRegression()).fit(x_aspect_train_vect, y_aspect_train)

In [55]:
train_res = multi_label_metrics(y_aspect_train, clf_test1.predict(x_aspect_train_vect))
val_res = multi_label_metrics(y_aspect_val, clf_test1.predict(x_aspect_val_vect))
test_res = multi_label_metrics(y_aspect_test, clf_test1.predict(x_aspect_test_vect))

print("train")
for metric, value in train_res.items():
    print(f"{metric}: {value}")

print("\n\n validation")
for metric, value in val_res.items():
    print(f"{metric}: {value}")

train
Hamming Loss: 0.06798714154105494
Accuracy: 0.15594801732755748
Precision (Micro): 0.9327658524549717
Recall (Micro): 0.541890632838816
F1 Score (Micro): 0.6855251824652069
Precision (Macro): 0.6251547623876699
Recall (Macro): 0.24398417650027207
F1 Score (Macro): 0.30736867951569463


 validation
Hamming Loss: 0.06576470588235295
Accuracy: 0.1415
Precision (Micro): 0.8440677966101695
Recall (Micro): 0.45521023765996343
F1 Score (Micro): 0.5914489311163895
Precision (Macro): 0.46312395951087926
Recall (Macro): 0.1834275017794106
F1 Score (Macro): 0.22916144316875897


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
print("test") 
for metric, value in test_res.items():
    print(f"{metric}: {value}")

test
Hamming Loss: 0.07857843137254902
Accuracy: 0.08833333333333333
Precision (Micro): 0.851109520400859
Recall (Micro): 0.4601393188854489
F1 Score (Micro): 0.597337352424014
Precision (Macro): 0.4958755062499653
Recall (Macro): 0.19622292112615086
F1 Score (Macro): 0.2435223215680511


In [57]:
print_column_names(clf_test1.predict(test_text)[0], column_names)
print("\n", correct_aspect_cat)

LOCATION#GENERAL
ROOMS#CLEANLINESS
ROOMS#DESIGN&FEATURES
SERVICE#GENERAL

 ROOMS#DESIGN&FEATURES, ROOMS#CLEANLINESS, LOCATION#GENERAL, SERVICE#GENERAL, ROOMS#GENERAL, ROOM_AMENITIES#DESIGN&FEATURES


In [58]:
print(classification_report(y_aspect_test, clf_test1.predict(x_aspect_test_vect), target_names=column_names))

                                precision    recall  f1-score   support

        FACILITIES#CLEANLINESS       0.00      0.00      0.00         5
            FACILITIES#COMFORT       0.00      0.00      0.00        26
    FACILITIES#DESIGN&FEATURES       1.00      0.12      0.22        65
            FACILITIES#GENERAL       1.00      0.05      0.09        21
      FACILITIES#MISCELLANEOUS       0.00      0.00      0.00         8
             FACILITIES#PRICES       0.00      0.00      0.00        13
            FACILITIES#QUALITY       1.00      0.08      0.15        51
     FOOD&DRINKS#MISCELLANEOUS       0.00      0.00      0.00         3
            FOOD&DRINKS#PRICES       0.00      0.00      0.00         9
           FOOD&DRINKS#QUALITY       0.84      0.71      0.77       129
     FOOD&DRINKS#STYLE&OPTIONS       0.79      0.56      0.65       124
             HOTEL#CLEANLINESS       0.92      0.16      0.28        67
                 HOTEL#COMFORT       0.62      0.32      0.42  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## test 2: use phobert instead of tfidf

## test2.1: phobert + base

In [59]:
from transformers import AutoTokenizer
PRETRAINED_MODEL = 'vinai/phobert-base'
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
tokenizer.max_model_input_sizes

  from .autonotebook import tqdm as notebook_tqdm


{'vinai/phobert-base': 256, 'vinai/phobert-large': 256}

In [60]:
tokens = tokenizer.encode('Tôi là sinh_viên trường đại_học Công_nghệ thông_tin .') # When use PhoBERT
print('Encode:', tokens)
print('Decode:', tokenizer.decode(tokens))

Encode: [0, 218, 8, 649, 212, 956, 2413, 195, 5, 2]
Decode: <s> Tôi là sinh_viên trường đại_học Công_nghệ thông_tin. </s>


In [61]:
x_aspect_train_vect = tokenizer.batch_encode_plus(x_aspect_train_base, pad_to_max_length=True, max_length=256, truncation=True)
x_aspect_val_vect = tokenizer.batch_encode_plus(x_aspect_val_base, pad_to_max_length=True, max_length=256, truncation=True)
x_aspect_test_vect = tokenizer.batch_encode_plus(x_aspect_test_base, pad_to_max_length=True, max_length=256, truncation=True)
test_text = tokenizer.batch_encode_plus([extract_aspect_terms(text_preprocess(text))], pad_to_max_length=True, max_length=256, truncation=True)



In [62]:
x_aspect_train_vect.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [63]:
clf_test21 = MultiOutputClassifier(LogisticRegression()).fit(x_aspect_train_vect['input_ids'], y_aspect_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [64]:
train_res = multi_label_metrics(y_aspect_train, clf_test21.predict(x_aspect_train_vect['input_ids']))
val_res = multi_label_metrics(y_aspect_val, clf_test21.predict(x_aspect_val_vect['input_ids']))
test_res = multi_label_metrics(y_aspect_test, clf_test21.predict(x_aspect_test_vect['input_ids']))

print("train")
for metric, value in train_res.items():
    print(f"{metric}: {value}")

print("\n\n validation")
for metric, value in val_res.items():
    print(f"{metric}: {value}")

train
Hamming Loss: 0.11870552952937256
Accuracy: 0.03365544818393869
Precision (Micro): 0.6469742934695832
Recall (Micro): 0.2904034974557443
F1 Score (Micro): 0.4008705975464978
Precision (Macro): 0.7023189482599674
Recall (Macro): 0.2792163874996363
F1 Score (Macro): 0.36663196156257827


 validation
Hamming Loss: 0.10442647058823529
Accuracy: 0.028
Precision (Micro): 0.5017972681524083
Recall (Micro): 0.1963155674307411
F1 Score (Micro): 0.28221975133933086
Precision (Macro): 0.1778037156766603
Recall (Macro): 0.05760101450010319
F1 Score (Macro): 0.07150117021487022


In [65]:
print("test") 
for metric, value in test_res.items():
    print(f"{metric}: {value}")

test
Hamming Loss: 0.1271078431372549
Accuracy: 0.005
Precision (Micro): 0.4957825679475164
Recall (Micro): 0.20472136222910217
F1 Score (Micro): 0.28978362092577375
Precision (Macro): 0.1955801649929933
Recall (Macro): 0.0724442084942529
F1 Score (Macro): 0.09077881192497139


In [66]:
print(classification_report(y_aspect_test, clf_test21.predict(x_aspect_test_vect['input_ids']), 
                            target_names=column_names))

                                precision    recall  f1-score   support

        FACILITIES#CLEANLINESS       0.00      0.00      0.00         5
            FACILITIES#COMFORT       0.11      0.04      0.06        26
    FACILITIES#DESIGN&FEATURES       0.34      0.15      0.21        65
            FACILITIES#GENERAL       0.00      0.00      0.00        21
      FACILITIES#MISCELLANEOUS       0.00      0.00      0.00         8
             FACILITIES#PRICES       0.00      0.00      0.00        13
            FACILITIES#QUALITY       0.08      0.02      0.03        51
     FOOD&DRINKS#MISCELLANEOUS       0.00      0.00      0.00         3
            FOOD&DRINKS#PRICES       0.29      0.22      0.25         9
           FOOD&DRINKS#QUALITY       0.20      0.05      0.08       129
     FOOD&DRINKS#STYLE&OPTIONS       0.36      0.08      0.13       124
             HOTEL#CLEANLINESS       0.00      0.00      0.00        67
                 HOTEL#COMFORT       0.16      0.10      0.12  

  _warn_prf(average, modifier, msg_start, len(result))


In [67]:
print_column_names(clf_test21.predict(test_text['input_ids'])[0], column_names)
print("\n", correct_aspect_cat)

SERVICE#GENERAL

 ROOMS#DESIGN&FEATURES, ROOMS#CLEANLINESS, LOCATION#GENERAL, SERVICE#GENERAL, ROOMS#GENERAL, ROOM_AMENITIES#DESIGN&FEATURES


## test 2.2: phobert + test 1

In [68]:
x_aspect_train_vect = tokenizer.batch_encode_plus(x_aspect_train_test1, pad_to_max_length=True, max_length=256, truncation=True)
x_aspect_val_vect = tokenizer.batch_encode_plus(x_aspect_val_test1, pad_to_max_length=True, max_length=256, truncation=True)
x_aspect_test_vect = tokenizer.batch_encode_plus(x_aspect_test_test1, pad_to_max_length=True, max_length=256, truncation=True)
test_text = tokenizer.batch_encode_plus(text_preprocess(text), pad_to_max_length=True, max_length=256, truncation=True)



In [69]:
clf_test22 = MultiOutputClassifier(LogisticRegression()).fit(x_aspect_train_vect['input_ids'], y_aspect_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [70]:
train_res = multi_label_metrics(y_aspect_train, clf_test22.predict(x_aspect_train_vect['input_ids']))
val_res = multi_label_metrics(y_aspect_val, clf_test22.predict(x_aspect_val_vect['input_ids']))
test_res = multi_label_metrics(y_aspect_test, clf_test22.predict(x_aspect_test_vect['input_ids']))


print("train")
for metric, value in train_res.items():
    print(f"{metric}: {value}")

print("\n\n validation")
for metric, value in val_res.items():
    print(f"{metric}: {value}")

train
Hamming Loss: 0.11596134621792736
Accuracy: 0.052649116961012994
Precision (Micro): 0.6552026928142837
Recall (Micro): 0.320862896868057
F1 Score (Micro): 0.4307707110555181
Precision (Macro): 0.7177762116591748
Recall (Macro): 0.34037816168390345
F1 Score (Macro): 0.4267571083534116


 validation
Hamming Loss: 0.10764705882352942
Accuracy: 0.024
Precision (Micro): 0.46485031954254963
Recall (Micro): 0.1943467866685417
F1 Score (Micro): 0.27409758032526776
Precision (Macro): 0.17041034418872786
Recall (Macro): 0.060102436833532426
F1 Score (Macro): 0.07640368214177706


In [71]:
print("test") 
for metric, value in test_res.items():
    print(f"{metric}: {value}")

test
Hamming Loss: 0.13357843137254902
Accuracy: 0.005
Precision (Micro): 0.4354986276303751
Recall (Micro): 0.18421052631578946
F1 Score (Micro): 0.2589067174326897
Precision (Macro): 0.16426088760140697
Recall (Macro): 0.06199958692763727
F1 Score (Macro): 0.0773191976277992


In [72]:
print(classification_report(y_aspect_test, clf_test22.predict(x_aspect_test_vect['input_ids']), 
                            target_names=column_names))

                                precision    recall  f1-score   support

        FACILITIES#CLEANLINESS       0.00      0.00      0.00         5
            FACILITIES#COMFORT       0.12      0.04      0.06        26
    FACILITIES#DESIGN&FEATURES       0.24      0.09      0.13        65
            FACILITIES#GENERAL       0.00      0.00      0.00        21
      FACILITIES#MISCELLANEOUS       0.00      0.00      0.00         8
             FACILITIES#PRICES       0.11      0.08      0.09        13
            FACILITIES#QUALITY       0.12      0.02      0.03        51
     FOOD&DRINKS#MISCELLANEOUS       0.00      0.00      0.00         3
            FOOD&DRINKS#PRICES       0.11      0.11      0.11         9
           FOOD&DRINKS#QUALITY       0.19      0.03      0.05       129
     FOOD&DRINKS#STYLE&OPTIONS       0.22      0.04      0.07       124
             HOTEL#CLEANLINESS       0.00      0.00      0.00        67
                 HOTEL#COMFORT       0.11      0.09      0.10  

  _warn_prf(average, modifier, msg_start, len(result))


In [73]:
print_column_names(clf_test21.predict(test_text['input_ids'])[0], column_names)
print("\n", correct_aspect_cat)

SERVICE#GENERAL

 ROOMS#DESIGN&FEATURES, ROOMS#CLEANLINESS, LOCATION#GENERAL, SERVICE#GENERAL, ROOMS#GENERAL, ROOM_AMENITIES#DESIGN&FEATURES


# aspect category polarity

In [74]:
x_train = df_train.cleaned_text + ' ' + df_train.aspect_cat
x_val  = df_val.cleaned_text + ' ' + df_val.aspect_cat
x_test = df_test.cleaned_text + ' ' + df_test.aspect_cat

x_train.shape, x_val.shape, x_test.shape

((13954,), (7111,), (2584,))

In [75]:
vectorizer = TfidfVectorizer()
x_train_vect = vectorizer.fit_transform(x_train)
x_val_vect = vectorizer.transform(x_val)
x_test_vect = vectorizer.transform(x_test)

x_train_vect.shape, x_val_vect.shape, x_test_vect.shape

((13954, 5987), (7111, 5987), (2584, 5987))

In [76]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [77]:
y_train = le.fit_transform(df_train.polarity)
y_val = le.transform(df_val.polarity)
y_test = le.transform(df_test.polarity)

In [78]:
le.classes_

array(['negative', 'neutral', 'positive'], dtype=object)

In [79]:
model = LogisticRegression(solver='lbfgs', multi_class='multinomial')

In [80]:
model.fit(x_train_vect, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [81]:
train_res = multi_label_metrics(y_train, model.predict(x_train_vect))
val_res = multi_label_metrics(y_val, model.predict(x_val_vect))
test_res = multi_label_metrics(y_test, model.predict(x_test_vect))

In [82]:
print("train")
for metric, value in train_res.items():
    print(f"{metric}: {value}")

print("\n\n validation")
for metric, value in val_res.items():
    print(f"{metric}: {value}")

train
Hamming Loss: 0.1411781568009173
Accuracy: 0.8588218431990827
Precision (Micro): 0.8588218431990827
Recall (Micro): 0.8588218431990827
F1 Score (Micro): 0.8588218431990827
Precision (Macro): 0.7886508131095876
Recall (Macro): 0.5635789518731895
F1 Score (Macro): 0.5783081099259779


 validation
Hamming Loss: 0.19969062016594008
Accuracy: 0.8003093798340599
Precision (Micro): 0.8003093798340599
Recall (Micro): 0.8003093798340599
F1 Score (Micro): 0.8003093798340599
Precision (Macro): 0.6693081426517143
Recall (Macro): 0.5057093111219956
F1 Score (Macro): 0.5112747761585582


In [83]:
print("\n test") 
for metric, value in test_res.items():
    print(f"{metric}: {value}")


 test
Hamming Loss: 0.21130030959752322
Accuracy: 0.7886996904024768
Precision (Micro): 0.7886996904024768
Recall (Micro): 0.7886996904024768
F1 Score (Micro): 0.7886996904024768
Precision (Macro): 0.8291635019163109
Recall (Macro): 0.5170095782090885
F1 Score (Macro): 0.5132607392780288


In [84]:
print(classification_report(y_test, model.predict(x_test_vect), target_names=le.classes_))

              precision    recall  f1-score   support

    negative       0.66      0.64      0.65       645
     neutral       1.00      0.02      0.03       133
    positive       0.83      0.90      0.86      1806

    accuracy                           0.79      2584
   macro avg       0.83      0.52      0.51      2584
weighted avg       0.80      0.79      0.77      2584



# usage

In [85]:
def get_aspect_category(binary_list, column_names):
    aspect_category = []
    for i, value in enumerate(binary_list):
        if value == 1:
            aspect_category.append(column_names[i])
    return aspect_category

def print_res(text, cat_sen):
    print(text)
    for cat, sen in cat_sen.items():
        print(f"{{{cat}, {sen}}}", end=' ')

def absa_test1(text, acd, acd_vectorizer, model, model_vectorizer):
    cleaned_text = text_preprocess(text)
    cat_sen = {}
    acd_vector = acd_vectorizer.transform([cleaned_text])
    aspect_cand = get_aspect_category(acd.predict(acd_vector)[0], column_names)
    for aspect in aspect_cand:
        sentiment = le.inverse_transform(model.predict(model_vectorizer.transform([cleaned_text + ' ' + aspect])))[0]
        cat_sen[aspect] = sentiment
        
    print_res(text, cat_sen)
    
def absa_base(text, acd, acd_vectorizer, model, model_vectorizer):
    cleaned_text = text_preprocess(text)
    aspect_text = extract_aspect_terms(cleaned_text)
    cat_sen = {}
    acd_vector = acd_vectorizer.transform([aspect_text])
    aspect_cand = get_aspect_category(acd.predict(acd_vector)[0], column_names)
    for aspect in aspect_cand:
        sentiment = le.inverse_transform(model.predict(model_vectorizer.transform([cleaned_text + ' ' + aspect])))[0]
        cat_sen[aspect] = sentiment
        
    print_res(text, cat_sen)

In [86]:
usr_rev = "Rộng rãi, sạch sẽ. Có chỗ trong phòng không bắt được wifi"

absa_test1(usr_rev, clf_test1, vectorizer_test1, model, vectorizer)

Rộng rãi, sạch sẽ. Có chỗ trong phòng không bắt được wifi
{FACILITIES#QUALITY, negative} {ROOMS#CLEANLINESS, positive} {ROOMS#DESIGN&FEATURES, positive} 

In [87]:
usr_rev = "Rộng rãi, sạch sẽ. Có chỗ trong phòng không bắt được wifi"

absa_base(usr_rev, clf_base, vectorizer_base, model, vectorizer)

Rộng rãi, sạch sẽ. Có chỗ trong phòng không bắt được wifi
{FACILITIES#QUALITY, negative} {ROOMS#CLEANLINESS, positive} {ROOMS#DESIGN&FEATURES, positive} 

In [88]:
text_preprocess(usr_rev)

'rộng_rãi sạch_sẽ có chỗ trong phòng không bắt được wifi'

In [89]:
extract_aspect_terms(text_preprocess(usr_rev))

'rộng_rãi sạch_sẽ có chỗ phòng bắt wifi'