In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Installation

## Install transformers

In [2]:
!git clone https://github.com/huggingface/transformers
!cd transformers
!pip install .

## Install fastBFE and fairseq

In [3]:
!pip install fastBPE
!pip install fairseq

## Install vncorenlp

In [4]:
# Install the vncorenlp python wrapper
!pip install vncorenlp

# Download VnCoreNLP-1.1.1.jar & its word segmentation component (i.e. RDRSegmenter) 
!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/ 
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/

## Install pyvi

In [5]:
# install 
!pip install pyvi

## Install Phobert

In [6]:
!wget https://public.vinai.io/PhoBERT_base_fairseq.tar.gz
!tar -xzvf PhoBERT_base_fairseq.tar.gz
!wget https://public.vinai.io/PhoBERT_base_transformers.tar.gz
!tar -xzvf PhoBERT_base_transformers.tar.gz

# Import 

In [7]:
# import
import re
import matplotlib.pyplot as plt
import seaborn as sns
from pyvi import ViTokenizer
from tqdm import tqdm
import pickle
import time

# Prepare data

## Make csv files

In [8]:
# from os import listdir
# import re
# import pandas as pd

# train_dir = './Train_Full'
# test_dir = './Test_Full'

# LABELS = ['Chinh tri Xa hoi', 'Doi song', 'Khoa hoc', 'Kinh doanh', 'Phap luat', 'Suc khoe', 'The gioi', 'The thao', 'Van hoa', 'Vi tinh']

# def create_dataset(file_name_output, directory):
#     labels = []
#     text = []
#     for label in LABELS:
#         dir_folder = '%s/%s' % (directory, label)
#         for filename in listdir(dir_folder):
#             full_filename = '%s/%s' % (dir_folder, filename)
#             with open(full_filename, "r", encoding='utf-16') as f:
#                 get_text = f.read().replace('\n', ' ')
#                 text.append(get_text)
#                 labels.append(label)

#     df = pd.DataFrame([text, labels], ['text', 'label']).T
#     df.to_csv(file_name_output, encoding='utf-8', index=False)


# # Train dataset
# create_dataset('train_data.csv', train_dir)
# # Test dataset
# create_dataset('test_data.csv', test_dir)

## Read train and test csv files

In [9]:
train_data = pd.read_csv('/kaggle/input/inputdata/train_data.csv')
test_data = pd.read_csv('/kaggle/input/inputdata/test_data.csv')

## Get stopwords to list

In [10]:
def get_stopwords_to_list(path):
    stopwords = []
    with open(path, 'r', encoding="utf-8") as f:
        stopwords_list = list(f)
        stopwords = [re.sub('\n', '', word) for word in stopwords_list]
    return stopwords

In [11]:
# get stopwords
stopwords = get_stopwords_to_list('/kaggle/input/inputdata/stopwords-nlp-vi.txt')

In [12]:
test_data

# Exploratory data analysis

## Check null

In [13]:
# check null values
print("Train data")
print(train_data.isnull().sum())
print("Test data")
print(test_data.isnull().sum())

## Count label

In [14]:
train_data['label'].value_counts()

In [15]:
test_data['label'].value_counts()

In [16]:
# count target unique
fig = plt.figure(figsize=(15, 6))
sns.countplot(train_data['label'])
plt.show()

# Funtions to processes

## Convert vietnamese word to telex type

In [17]:
# start preprocess
uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"
unsignChars = "aaaaaaaaaaaaaaaaaeeeeeeeeeeediiiiiooooooooooooooooouuuuuuuuuuuyyyyyAAAAAAAAAAAAAAAAAEEEEEEEEEEEDIIIOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUYYYYYAADOOU"


def loaddicchar():
    dic = {}
    char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split(
        '|')
    charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split(
        '|')
    for i in range(len(char1252)):
        dic[char1252[i]] = charutf8[i]
    return dic


dicchar = loaddicchar()

### Init wowel tabel

In [18]:
# woweb in vietnamese
vowel_table = [['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
                  ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
                  ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
                  ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
                  ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
                  ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
                  ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
                  ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
                  ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
                  ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
                  ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
                  ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y']]
start_word_table = ['', 'f', 's', 'r', 'x', 'j']

vowel_to_ids = {}

### Get index in word

In [19]:
for i in range(len(vowel_table)):
    for j in range(len(vowel_table[i]) - 1):
        vowel_to_ids[vowel_table[i][j]] = (i, j)

### Convert vietnamese word to telex type

In [20]:
def vn_word_to_telex_type(word):
    dau_cau = 0
    new_word = ''
    for char in word:
        x, y = vowel_to_ids.get(char, (-1, -1))
        if x == -1:
            new_word += char
            continue
        if y != 0:
            dau_cau = y
        new_word += vowel_table[x][-1]
    new_word += start_word_table[dau_cau]
    return new_word

### Convert vietnamese sentence to telex type

In [21]:
def vn_sentence_to_telex_type(sentence):
    """
    Chuyển câu tiếng việt có dấu về kiểu gõ telex.
    :param sentence:
    :return:
    """
    words = sentence.split()
    for index, word in enumerate(words):
        words[index] = vn_word_to_telex_type(word)
    return ' '.join(words)


### Standard vietnamese word punctuation

In [22]:
def standard_vietnam_word_marks(word):
    if not is_valid_vietnam_word(word):
        return word
 
    chars = list(word)
    dau_cau = 0
    vowel_index = []
    qu_or_gi = False
    for index, char in enumerate(chars):
        x, y = vowel_to_ids.get(char, (-1, -1))
        if x == -1:
            continue
        elif x == 9:  # check qu
            if index != 0 and chars[index - 1] == 'q':
                chars[index] = 'u'
                qu_or_gi = True
        elif x == 5:  # check gi
            if index != 0 and chars[index - 1] == 'g':
                chars[index] = 'i'
                qu_or_gi = True
        if y != 0:
            dau_cau = y
            chars[index] = vowel_table[x][0]
        if not qu_or_gi or index != 1:
            vowel_index.append(index)
    if len(vowel_index) < 2:
        if qu_or_gi:
            if len(chars) == 2:
                x, y = vowel_to_ids.get(chars[1])
                chars[1] = vowel_table[x][dau_cau]
            else:
                x, y = vowel_to_ids.get(chars[2], (-1, -1))
                if x != -1:
                    chars[2] = vowel_table[x][dau_cau]
                else:
                    chars[1] = vowel_table[5][dau_cau] if chars[1] == 'i' else vowel_table[9][dau_cau]
            return ''.join(chars)
        return word
 
    for index in vowel_index:
        x, y = vowel_to_ids[chars[index]]
        if x == 4 or x == 8:  # ê, ơ
            chars[index] = vowel_table[x][dau_cau]
            return ''.join(chars)
 
    if len(vowel_index) == 2:
        if vowel_index[-1] == len(chars) - 1:
            x, y = vowel_to_ids[chars[vowel_index[0]]]
            chars[vowel_index[0]] = vowel_table[x][dau_cau]
        else:
            x, y = vowel_to_ids[chars[vowel_index[1]]]
            chars[vowel_index[1]] = vowel_table[x][dau_cau]
    else:
        x, y = vowel_to_ids[chars[vowel_index[1]]]
        chars[vowel_index[1]] = vowel_table[x][dau_cau]
    return ''.join(chars)

### Check valid vietnamese word

In [23]:
def is_valid_vietnam_word(word):
    chars = list(word)
    vowel_index = -1
    for index, char in enumerate(chars):
        x, y = vowel_to_ids.get(char, (-1, -1))
        if x != -1:
            if vowel_index == -1:
                vowel_index = index
            else:
                if index - vowel_index != 1:
                    return False
                vowel_index = index
    return True

### Standard vietnamese sentence punctuation

In [24]:
def standard_vietnam_sentence_marks(sentence):
    sentence = sentence.lower()
    words = sentence.split()
    for index, word in enumerate(words):
        cw = re.sub(r'(^\\p{P}*)([p{L}.]*\\p{L}+)(\\p{P}*$)', r'\1/\2/\3', word).split('/')
        # print(cw)
        if len(cw) == 3:
            cw[1] = standard_vietnam_word_marks(cw[1])
        words[index] = ''.join(cw)
    return ' '.join(words)

### Convert normal unicode to standard unicode

In [25]:
def convert_unicode(txt):
    return re.sub(
        r'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ',
        lambda x: dicchar[x.group()], txt)

### Remove loop char

In [26]:
def remove_loop_char(txt):
    txt = re.sub(r'([A-Z])\1+', lambda m: m.group(1).upper(), str(txt), flags=re.IGNORECASE)
    txt = re.sub(r'[^\s\wáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ_]',' ',txt)
    return txt

### Remove stopwords

In [27]:
def remove_stopwords(txt):
    split_words = txt.split()
    words = []
    for char in split_words:
        if char not in stopwords:
            words.append(char)
            
    return " ".join(words)

### Load VnCoreNLP

In [28]:
from vncorenlp import VnCoreNLP
rdrsegmenter = VnCoreNLP("/kaggle/working/vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m') 
# rdrsegmenter = VnCoreNLP("/content/drive/My Drive/BERT/SA/vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m') 

text = "Đại học Bách Khoa Hà Nội."

word_segmented_text = rdrsegmenter.tokenize(text) 
print(word_segmented_text)

### Tokenizer

In [37]:
# tokenizer with pyvi
def tokenize_by_vi(txt):
    txt = ViTokenizer.tokenize(txt)
    return txt

# tokenizer with vncorenlp
def tokenize_by_vn(txt):
    txt = rdrsegmenter.tokenize(txt)
    txt = ' '.join([' '.join(x) for x in txt])
    return txt

### Remove number

In [30]:
def remove_number(txt):
    txt= re.sub(r'\d+',"",txt)
    return txt

### Remove single char and space

In [31]:
def remove_single_char_and_space(txt):
    # remove all single characters
    txt = re.sub(r'\s+[a-zA-Z]\s+', ' ', txt)
    
    # Remove single characters from the start
    txt = re.sub(r'\^[a-zA-Z]\s+', ' ', txt) 
    
    # remove space
    txt = re.sub(r'\s+', ' ', txt)
    return txt

### Convert to lowercase

In [32]:
def convert_to_lowercase(txt):
    txt = txt.lower()
    return txt

### Clean data

In [34]:
def clean_data(txt):
    # convert txt to lowercase
    txt = convert_to_lowercase(txt)
    # remove number
    txt = remove_number(txt)
    # remove single char and space
    txt = remove_single_char_and_space(txt)
    # convert txt from unicode 
    txt = convert_unicode(txt)
    # standard vietnam sentence marks
    txt = standard_vietnam_sentence_marks(txt)
    # remove loop char
    txt = remove_loop_char(txt)
    # remove stopwords
    txt = remove_stopwords(txt)
    return txt
    

# Preprocess train and test data

In [35]:
# preprocess
tqdm.pandas()
train_data['preprocess_text'] = train_data['text'].progress_apply(lambda s : clean_data(s))
tqdm.pandas()
test_data['preprocess_text'] = test_data['text'].progress_apply(lambda s : clean_data(s))

### Tokenize by vi

In [38]:
# tokenizer with pyvi
tqdm.pandas()
train_data['preprocess_text_by_vi'] = train_data['preprocess_text'].progress_apply(lambda s : tokenize_by_vi(s))
tqdm.pandas()
test_data['preprocess_text_by_vi'] = test_data['preprocess_text'].progress_apply(lambda s : tokenize_by_vi(s))

### Tokenize by vn

In [39]:
# tokenizer with vncorenlp
tqdm.pandas()
train_data['preprocess_text_by_vn'] = train_data['preprocess_text'].progress_apply(lambda s : tokenize_by_vn(s))
tqdm.pandas()
test_data['preprocess_text_by_vn'] = test_data['preprocess_text'].progress_apply(lambda s : tokenize_by_vn(s))

In [40]:
train_data

# Data after preprocess

## Save data

In [41]:
train_data.to_csv('train_data_clean.csv')
test_data.to_csv('test_data_clean.csv')

## Read data

In [None]:
# train_data = pd.read_csv('/kaggle/input/inputdata/train_data_clean.csv')
# test_data = pd.read_csv('/kaggle/input/inputdata/test_data_clean.csv')

## Shuffle data

In [42]:
from sklearn.utils import shuffle
train_data = shuffle(train_data)
test_data = shuffle(test_data)

In [43]:
test_data

# Train model

## Vectorizer data

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score

count_vect = TfidfVectorizer()
X_train = count_vect.fit_transform(train_data['preprocess_text_by_vi'])
X_test = count_vect.transform(test_data['preprocess_text_by_vi'])

## Save Tfidf model

In [None]:
with open('tfidf.pkl', 'wb') as file:
    pickle.dump(count_vect, file)

## SVM model

In [45]:
from sklearn.svm import SVC

# Train
print("Training...")
start_time = time.time()
svm_model = SVC(kernel='rbf')
svm_model.fit(X_train, train_data['label'])
train_time = time.time() - start_time
print(f'SVM train in {train_time} seconds')

# Predict
print("Predicting...")
y_pred = svm_model.predict(X_test)

# Accuracy
svm_acc = accuracy_score(test_data['label'], y_pred)
print(f'Accuracy is {svm_acc}')
# 0.9210886784586981

### Classification Report in SVM

In [46]:
from sklearn.metrics import classification_report
print(classification_report(test_data['label'], y_pred, target_names=list(test_data['label'].unique())))

### Save SVM model

In [None]:
# save model
with open('svm_model.pkl', 'wb') as file:
    pickle.dump(svm_model, file)

## MultinomialNB model

In [47]:
from sklearn.naive_bayes import MultinomialNB

# Train
print("Training...")
start_time = time.time()
mul_model = MultinomialNB()
mul_model.fit(X_train, train_data['label'])
train_time = time.time() - start_time
print(f'Naive Bayes train in {train_time} seconds')

# Predict
print("Predicting...")
y_mul_pred = mul_model.predict(X_test)

# Accuracy
mul_acc = accuracy_score(test_data['label'], y_mul_pred)
print(f'Accuarcy is {mul_acc}')
# 0.812677426399063

### Classification Report in Naive Bayes

In [48]:
from sklearn.metrics import classification_report
print(classification_report(test_data['label'], y_mul_pred, target_names=list(test_data['label'].unique())))

## Save MultinomiaNB Model

In [None]:
# save model
with open('multinomiaNB.pkl', 'wb') as file:
    pickle.dump(mul_model, file)

## DecisionTree model

### Classification Report in Decision Tree

In [49]:
from sklearn.tree import DecisionTreeClassifier

# Train
print("Training...")
start_time = time.time()
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, train_data['label'])
train_time = time.time() - start_time
print(f'Decision Tree train in {train_time} seconds')

# Predict
print("Predicting...")
y_clf_pred = clf.predict(X_test)

# Accuracy
clf_acc = accuracy_score(test_data['label'], y_clf_pred)
print(f'Accuracy is {clf_acc}')
# 0.723026224366228

In [50]:
from sklearn.metrics import classification_report
print(classification_report(test_data['label'], y_clf_pred, target_names=list(test_data['label'].unique())))

### Save Decision Tree model

In [None]:
# save model
with open('decision_tree.pkl', 'wb') as file:
    pickle.dump(clf, file)

## Logistic Regression model

In [51]:
from sklearn.linear_model import LogisticRegression

# Train
print("Training...")
start_time = time.time()
scikit_log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, 
                                    C=5, penalty='l2',max_iter=1000)
scikit_log_reg.fit(X_train, train_data['label'])
train_time = time.time() - start_time
print(f'Logistic Regression train in {train_time} seconds')

# Predict
print("Predicting...")
y_log_reg_pred = scikit_log_reg.predict(X_test)

# Accuracy
scikit_log_reg_acc = accuracy_score(test_data['label'], y_log_reg_pred)
print(f'Accuracy is {scikit_log_reg_acc}')
# 0.9220217179838406

### Classification Report in Logistic Regression

In [52]:
from sklearn.metrics import classification_report
print(classification_report(test_data['label'], y_log_reg_pred, target_names=list(test_data['label'].unique())))

### Save Logistic Regression Model

In [None]:
# save model
with open('logistic_regression.pkl', 'wb') as file:
    pickle.dump(scikit_log_reg, file)

## PhoBert Model

### Load PhoBERT model and bpe

In [53]:
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes', 
    default="/kaggle/working/PhoBERT_base_transformers/bpe.codes",
    required=False,
    type=str,
    help='path to fastBPE BPE'
)
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)

# Load the dictionary
vocab = Dictionary()
vocab.add_from_file("/kaggle/working/PhoBERT_base_transformers/dict.txt")

In [54]:
bpe.encode('Hôm_nay trời nóng quá nên tôi ở nhà viết Viblo!')
vocab.encode_line('<s> ' + 'Hôm_nay trời nóng quá nên tôi ở nhà viết Vi@@ blo@@ !' + ' </s>')

### Encode Labels

In [55]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
lb.fit(train_data['label'])
train_labels_lb = lb.fit_transform(train_data['label'])
test_labels_lb = lb.fit_transform(test_data['label'])
print(lb.classes_)
print('Top 5 classes indices: ', train_labels_lb)
print('Top 5 classes indices: ', test_labels_lb)

In [56]:
train_data['label_encode'] = train_labels_lb
test_data['label_encode'] = test_labels_lb

In [None]:
test_data

### Split data train and test

In [57]:
train_sents, train_labels = train_data['preprocess_text_by_vn'], train_data['label_encode'].to_list()
val_sents, val_labels = test_data['preprocess_text_by_vn'], test_data['label_encode'].to_list()

### Get train and test ids

In [58]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
MAX_LEN = 256

train_ids = []
for sent in train_sents:
    subwords = '<s> ' + bpe.encode(sent) + ' </s>'
    encoded_sent = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
    train_ids.append(encoded_sent)

val_ids = []
for sent in val_sents:
    subwords = '<s> ' + bpe.encode(sent) + ' </s>'
    encoded_sent = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
    val_ids.append(encoded_sent)
    
train_ids = pad_sequences(train_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
val_ids = pad_sequences(val_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

In [None]:
train_ids

### Get train and test marks

In [59]:
train_masks = []
for sent in train_ids:
    mask = [int(token_id > 0) for token_id in sent]
    train_masks.append(mask)

val_masks = []
for sent in val_ids:
    mask = [int(token_id > 0) for token_id in sent]
    val_masks.append(mask)

In [61]:
# train_masks

### Get DataLoader

In [62]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch

train_inputs = torch.tensor(train_ids)
val_inputs = torch.tensor(val_ids)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = SequentialSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=32)

### Load and config model

In [63]:
from transformers import RobertaForSequenceClassification, RobertaConfig, AdamW

config = RobertaConfig.from_pretrained(
    "/kaggle/working/PhoBERT_base_transformers/config.json", from_tf=False, num_labels = 10, output_hidden_states=False,
)
BERT_SA = RobertaForSequenceClassification.from_pretrained(
    "/kaggle/working/PhoBERT_base_transformers/model.bin",
    config=config
)
BERT_SA.cuda()

### Function get accuracy

In [64]:
def flat_accuracy(logits, labels):
    preds = []
    for line in logits:
        preds.append(torch.argmax(line).cpu().numpy())
    return accuracy_score(preds, labels.cpu().numpy())

### Function train model

In [65]:
def train_model(model):
    model.train()
    train_loss = 0
    acc = []
    for batch in train_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        BERT_SA.zero_grad()
        outputs = model(b_input_ids, 
            token_type_ids=None, 
            attention_mask=b_input_mask, 
            labels=b_labels)
        
        loss = outputs[0]
        logits = outputs[1]
        acc.append(flat_accuracy(logits, b_labels))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        train_loss += loss.item()

    return train_loss, round(np.mean(acc), 3)

### Training

In [66]:
import random
from tqdm import tqdm_notebook
device = 'cuda'
epochs = 10
import time

param_optimizer = list(BERT_SA.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=False)
train_loss_ = []
test_loss_ = []

print("Training start!")
for epoch in range(0, epochs):
    begin_time = time.time()
    loss, acc = train_model(BERT_SA)
    done_time = time.time() - begin_time
    print(f"Epoch {epoch + 1}: {round(done_time)} seconds - loss {round(loss, 3)} - accuracy {acc}")
    train_loss_.append([loss, acc])
    
print("Training complete!")

### Predicting

In [67]:
preds = []
for batch in val_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    with torch.no_grad():
        pred = BERT_SA(b_input_ids,token_type_ids=None, attention_mask=b_input_mask)
        preds.append(pred[0])

In [68]:
lab = []
for line in preds:
    for i in line:
        predicts = i.to('cpu').detach().numpy().copy()
        lab.append(np.argmax(predicts))

In [69]:
print(f'Accuracy BERT is {accuracy_score(val_labels.tolist(), lab)}')

### Save model

In [None]:
BERT_SA.save_pretrained("model_bert")

### Remove cache

In [70]:
import gc

gc.collect()
torch.cuda.empty_cache()
del BERT_SA