In [3]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import regex as re

## Chuẩn hoá bảng mã tiếng việt

In [4]:
def loaddicchar():
    dic = {}
    char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split(
        '|')
    charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split(
        '|')
    for i in range(len(char1252)):
        dic[char1252[i]] = charutf8[i]
    return dic

In [5]:
dicchar = loaddicchar()
def convert_unicode(txt):
    return re.sub(
        r'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ',
        lambda x: dicchar[x.group()], txt)

## Preprocessing Data

In [6]:
from underthesea import word_tokenize

In [7]:
def text_preprocess(data):
    # chuẩn hoá unicode
    data = convert_unicode(data)
    # tách từ
    data = word_tokenize(data, format="text")
    # xoá các kí tự "," "."  
    data = re.sub(r'[^\s\wáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ_]',' ',data)
    # xoá khoảng trắng
    data = re.sub(r'\s+', ' ', data).strip()
    return data
    

## Chuẩn bị Data

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## Preprocessing Label in Trainning data

In [9]:
file1 = open('trainning.txt', 'r') 


text = []
label = []
train_div = 0.8  # 8 phan cho train - 2 phan cho test
Lines = file1.readlines()
# count=0
# Xu li phan label cho data
for line in Lines:
    words = text_preprocess(line).strip().split()
    while words[0][-1] != "_":
        words[0]=words[0][:-1:]
    if words[0][-3] == "_":
        words[0]=words[0][:-1:]
    label.append(words[0])
    text.append(' '.join(words[1:]))


In [10]:
x_train, x_test, y_train, y_test = train_test_split(text, label, test_size=train_div, random_state=21)
df= pd.Series(label)
df.value_counts()


__KH__      500
__CTXH__    500
__SK__      500
__PL__      500
__KD__      500
__DS__      500
__TT__      500
__VT__      500
__VH__      500
__TG__      500
dtype: int64

## Encode label

In [11]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
print(list(label_encoder.classes_), '\n')

['__CTXH__', '__DS__', '__KD__', '__KH__', '__PL__', '__SK__', '__TG__', '__TT__', '__VH__', '__VT__'] 



In [12]:
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)
 

In [19]:
y_train

array([0, 3, 7, 1, 3, 0, 3, 7, 6, 9, 9, 6, 0, 6, 7, 8, 1, 1, 2, 8, 4, 4,
       2, 1, 5, 0, 4, 9, 8, 6, 4, 0, 9, 1, 2, 2, 2, 1, 7, 9, 4, 4, 9, 9,
       0, 3, 3, 1, 2, 1, 1, 2, 3, 1, 8, 9, 7, 4, 0, 0, 7, 6, 0, 9, 6, 7,
       1, 4, 8, 1, 1, 5, 6, 3, 6, 1, 3, 8, 9, 2, 2, 2, 5, 5, 8, 5, 9, 5,
       8, 4, 0, 8, 4, 9, 4, 0, 3, 8, 5, 3, 0, 1, 8, 6, 0, 3, 1, 5, 9, 4,
       1, 8, 0, 2, 3, 3, 7, 2, 7, 1, 3, 4, 5, 7, 1, 9, 6, 7, 8, 6, 5, 7,
       0, 1, 5, 0, 8, 7, 3, 8, 6, 4, 9, 5, 0, 0, 7, 1, 9, 7, 8, 7, 8, 2,
       0, 8, 5, 7, 9, 2, 9, 4, 6, 3, 8, 1, 7, 3, 3, 7, 5, 1, 5, 6, 9, 8,
       9, 7, 5, 1, 7, 6, 5, 4, 6, 7, 5, 9, 4, 8, 5, 2, 4, 0, 5, 4, 9, 7,
       8, 5, 7, 1, 3, 6, 5, 1, 6, 9, 9, 8, 3, 4, 0, 8, 3, 6, 7, 7, 0, 1,
       0, 7, 4, 8, 3, 7, 4, 0, 2, 8, 3, 0, 2, 4, 9, 5, 4, 6, 4, 8, 5, 6,
       6, 3, 2, 9, 2, 1, 7, 2, 5, 8, 2, 3, 3, 0, 8, 9, 3, 8, 1, 3, 7, 2,
       8, 3, 8, 7, 1, 9, 5, 4, 3, 0, 6, 5, 8, 5, 1, 4, 0, 2, 7, 3, 5, 7,
       5, 0, 2, 3, 8, 1, 8, 5, 2, 7, 4, 2, 9, 4, 5,

## SVM


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC

In [15]:
clf = Pipeline([('vect', CountVectorizer()), # vector hoá từ 
                     ('tfidf', TfidfTransformer()), #Tf-idf từ
                     ('clf-svm', SVC()) # SVC
                    ])

In [20]:
clf = clf.fit(x_train,y_train)

In [25]:
y_pre=clf.predict(x_test)

In [23]:
y_test

array([4, 0, 2, ..., 9, 8, 7])

In [24]:
from sklearn.metrics import accuracy_score

In [26]:
accuracy_score(y_pre,y_test)

0.8195

In [27]:
file2 = open('testing.txt','r')


text2 = []
Lines = file1.readlines()
Lines2 = file2.readlines()
# count=0
# Xu li phan label cho data
for line in Lines2:
    words = text_preprocess(line).strip().split()
    text2.append(' '.join(words[1:]))

In [28]:
y_predict = clf.predict(text2)

In [29]:
y_predict

array([2, 1, 4, ..., 5, 0, 0])

In [30]:
y_label_pre = label_encoder.inverse_transform(y_predict)

In [31]:
y_label_pre

array(['__KD__', '__DS__', '__PL__', ..., '__SK__', '__CTXH__',
       '__CTXH__'], dtype='<U8')

In [34]:
file3 = open('predicts.txt', 'w') 
for label in y_label_pre :
    file3.write(label)
    file3.write("\n")

In [35]:
from sklearn.metrics import confusion_matrix


In [36]:
confusion_matrix(y_test,y_pre)

array([[329,  13,  18,   5,   9,   9,   1,   0,   9,   1],
       [ 31, 314,   4,  13,   0,  13,   0,   3,  16,   0],
       [ 54,   2, 324,   5,   1,   0,   3,   0,   0,  12],
       [ 35,  27,   0, 316,   0,  16,   2,   0,   1,   4],
       [ 69,   9,   4,   0, 312,   2,   2,   1,   2,   1],
       [ 15,   6,   5,  10,   0, 356,   0,   0,   0,   1],
       [ 34,   5,  10,  12,   2,  10, 331,   0,   1,   1],
       [ 27,   2,   2,   0,   2,   2,   2, 353,   3,   1],
       [ 32,  13,   1,   4,   1,   0,   3,   0, 342,   0],
       [ 59,   8,  18,  23,   1,   1,   3,   0,   5, 301]])

## Decision Tree