In [16]:
import sys
sys.path.append('.')

In [17]:
import pandas as pd
import re
import numpy as np
import nltk
import emoji
import collections
from bpe import Encoder
import gensim
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
import math
import json
import os
from underthesea import word_tokenize
from emoticons import EMOTICONS

In [2]:
def no_accent_vietnamese(s):
    s = re.sub('[√°√†·∫£√£·∫°ƒÉ·∫Ø·∫±·∫≥·∫µ·∫∑√¢·∫•·∫ß·∫©·∫´·∫≠]', 'a', s)
    s = re.sub('[√Å√Ä·∫¢√É·∫†ƒÇ·∫Æ·∫∞·∫≤·∫¥·∫∂√Ç·∫§·∫¶·∫®·∫™·∫¨]', 'A', s)
    s = re.sub('[√©√®·∫ª·∫Ω·∫π√™·∫ø·ªÅ·ªÉ·ªÖ·ªá]', 'e', s)
    s = re.sub('[√â√à·∫∫·∫º·∫∏√ä·∫æ·ªÄ·ªÇ·ªÑ·ªÜ]', 'E', s)
    s = re.sub('[√≥√≤·ªè√µ·ªç√¥·ªë·ªì·ªï·ªó·ªô∆°·ªõ·ªù·ªü·ª°·ª£]', 'o', s)
    s = re.sub('[√ì√í·ªé√ï·ªå√î·ªê·ªí·ªî·ªñ·ªò∆†·ªö·ªú·ªû·ª†·ª¢]', 'O', s)
    s = re.sub('[√≠√¨·ªâƒ©·ªã]', 'i', s)
    s = re.sub('[√ç√å·ªàƒ®·ªä]', 'I', s)
    s = re.sub('[√∫√π·ªß≈©·ª•∆∞·ª©·ª´·ª≠·ªØ·ª±]', 'u', s)
    s = re.sub('[√ö√ô·ª¶≈®·ª§∆Ø·ª®·ª™·ª¨·ªÆ·ª∞]', 'U', s)
    s = re.sub('[√Ω·ª≥·ª∑·ªπ·ªµ]', 'y', s)
    s = re.sub('[√ù·ª≤·ª∂·ª∏·ª¥]', 'Y', s)
    s = re.sub('ƒë', 'd', s)
    s = re.sub('ƒê', 'D', s)
    return s

def find_accent_vietnamese(line):
    if re.findall('[√°√†·∫£√£·∫°ƒÉ·∫Ø·∫±·∫≥·∫µ·∫∑√¢·∫•·∫ß·∫©·∫´·∫≠]', line) != []:
        return True
    elif re.findall('[√Å√Ä·∫¢√É·∫†ƒÇ·∫Æ·∫∞·∫≤·∫¥·∫∂√Ç·∫§·∫¶·∫®·∫™·∫¨]', line) != []:
        return True
    elif re.findall('[√©√®·∫ª·∫Ω·∫π√™·∫ø·ªÅ·ªÉ·ªÖ·ªá]', line) != []:
        return True
    elif re.findall('[√â√à·∫∫·∫º·∫∏√ä·∫æ·ªÄ·ªÇ·ªÑ·ªÜ]', line) != []:
        return True
    elif re.findall('[√≥√≤·ªè√µ·ªç√¥·ªë·ªì·ªï·ªó·ªô∆°·ªõ·ªù·ªü·ª°·ª£]', line) != []:
        return True
    elif re.findall('[√ì√í·ªé√ï·ªå√î·ªê·ªí·ªî·ªñ·ªò∆†·ªö·ªú·ªû·ª†·ª¢]', line) != []:
        return True
    elif re.findall('[√≠√¨·ªâƒ©·ªã]', line) != []:
        return True
    elif re.findall('[√ç√å·ªàƒ®·ªä]', line) != []:
        return True
    elif re.findall('[√∫√π·ªß≈©·ª•∆∞·ª©·ª´·ª≠·ªØ·ª±]', line) != []:
        return True
    elif re.findall('[√ö√ô·ª¶≈®·ª§∆Ø·ª®·ª™·ª¨·ªÆ·ª∞]', line) != []:
        return True
    elif re.findall('[√Ω·ª≥·ª∑·ªπ·ªµ]', line) != []:
        return True
    elif re.findall('[√ù·ª≤·ª∂·ª∏·ª¥]', line) != []:
        return True
    elif re.findall('ƒë', line) != []:
        return True
    elif re.findall('ƒê', line) != []:
        return True
    return False

def reformat_unicode(line):
    line = re.sub('aÃÄ', '√†', line) # a\xcc\x80 -> \xc3\xa0
    line = re.sub('ƒÉÃÄ', '·∫±', line) # \xc4\x83\xcc\x80 -> \xe1\xba\xb1
    line = re.sub('√¢ÃÄ', '·∫ß', line) # \xc3\xa2\xcc\x80 -> \xe1\xba\xa7
    line = re.sub('eÃÄ', '√®', line) # e\xcc\x80 -> \xe1\xba\xb9
    line = re.sub('√™ÃÄ', '·ªÅ', line) # \xc3\xaa\xcc\x80 -> \xe1\xbb\x81
    line = re.sub('oÃÄ', '√≤', line) # o\xcc\x80 -> \xc3\xb2
    line = re.sub('√¥ÃÄ', '·ªì', line) # \xc3\xb4\xcc\x80 -> \xe1\xbb\x93
    line = re.sub('∆°ÃÄ', '·ªù', line) # \xc6\xa1\xcc\x80 -> \xe1\xbb\x9d
    line = re.sub('iÃÄ', '√¨', line) # i\xcc\x80 -> \xc3\xac
    line = re.sub('uÃÄ', '√π', line) # u\xcc\x80 -> \xc3\xb9
    line = re.sub('∆∞ÃÄ', '·ª´', line) # \xc6\xb0\xcc\x80 -> \xe1\xbb\xab
    line = re.sub('yÃÄ', '·ª≥', line) # y\xcc\x80 -> \xe1\xbb\xb3
    
    line = re.sub('aÃÅ', '√°', line) # a\xcc\x81 -> \xc3\xa1
    line = re.sub('ƒÉÃÅ', '·∫Ø', line) # \xc4\x83\xcc\x81 -> \xe1\xba\xaf
    line = re.sub('√¢ÃÅ', '·∫•', line) # \xc3\xa2\xcc\x81 -> \xe1\xba\xa5
    line = re.sub('eÃÅ', '√©', line) # e\xcc\x81 -> \xc3\xa9
    line = re.sub('√™ÃÅ', '·∫ø', line) # \xc3\xaa\xcc\x81 -> \xe1\xba\xbf
    line = re.sub('oÃÅ', '√≥', line) # o\xcc\x81 -> \xc3\xb3
    line = re.sub('√¥ÃÅ', '·ªë', line) # \xc3\xb4\xcc\x81 -> \xe1\xbb\x91
    line = re.sub('∆°ÃÅ', '·ªõ', line) # \xc6\xa1\xcc\x81 -> \xe1\xbb\x9b
    line = re.sub('iÃÅ', '√≠', line) # i\xcc\x81 -> \xc3\xad
    line = re.sub('uÃÅ', '√∫', line) # u\xcc\x81 -> \xc3\xba
    line = re.sub('∆∞ÃÅ', '·ª©', line) # \xc6\xb0\xcc\x81 -> \xe1\xbb\xa9
    line = re.sub('yÃÅ', '√Ω', line) # y\xcc\x81 -> \xc3\xbd
    
    line = re.sub('aÃâ', '·∫£', line) # a\xcc\x89 -> \xe1\xba\xa3
    line = re.sub('ƒÉÃâ', '·∫≥', line) # \xc4\x83\xcc\x89 -> \xe1\xba\xb3
    line = re.sub('√¢Ãâ', '·∫©', line) # \xc3\xa2\xcc\x89 -> \xe1\xba\xa9
    line = re.sub('eÃâ', '·∫ª', line) # e\xcc\x89 -> \xe1\xba\xbb
    line = re.sub('√™Ãâ', '·ªÉ', line) # \xc3\xaa\xcc\x89 -> \xe1\xbb\x83
    line = re.sub('oÃâ', '·ªè', line) # o\xcc\x89 -> \xe1\xbb\x8f
    line = re.sub('√¥Ãâ', '·ªï', line) # \xc3\xb4\xcc\x89 -> \xe1\xbb\x95
    line = re.sub('∆°Ãâ', '·ªü', line) # \xc6\xa1\xcc\x89 -> \xe1\xbb\x9f
    line = re.sub('iÃâ', '·ªâ', line) # i\xcc\x89 -> \xe1\xbb\x89
    line = re.sub('uÃâ', '·ªß', line) # u\xcc\x89 -> \xe1\xbb\xa7
    line = re.sub('∆∞Ãâ', '·ª≠', line) # \xc6\xb0\xcc\x89 -> \xe1\xbb\xad
    line = re.sub('yÃâ', '·ª∑', line) # y\xcc\x89 -> \xe1\xbb\xb7
    
    line = re.sub('aÃÉ', '√£', line) # a\xcc\x83 -> \xc3\xa3
    line = re.sub('ƒÉÃÉ', '·∫µ', line) # \xc4\x83\xcc\x83 -> \xe1\xba\xb5
    line = re.sub('√¢ÃÉ', '·∫´', line) # \xc3\xa2\xcc\x83 -> \xe1\xba\xab
    line = re.sub('eÃÉ', '·∫Ω', line) # e\xcc\x83 -> \xe1\xba\xbd
    line = re.sub('√™ÃÉ', '·ªÖ', line) # \xc3\xaa\xcc\x83 -> \xe1\xbb\x85
    line = re.sub('oÃÉ', '√µ', line) # o\xcc\x83 -> \xc3\xb5
    line = re.sub('√¥ÃÉ', '·ªó', line) # \xc3\xb4\xcc\x83 -> \xe1\xbb\x97
    line = re.sub('∆°ÃÉ', '·ª°', line) # \xc6\xa1\xcc\x83 -> \xe1\xbb\xa1
    line = re.sub('iÃÉ', 'ƒ©', line) # i\xcc\x83 -> \xc4\xa9
    line = re.sub('uÃÉ', '≈©', line) # u\xcc\x83 -> \xc5\xa9
    line = re.sub('∆∞ÃÉ', '·ªØ', line) # \xc6\xb0\xcc\x83 -> \xe1\xba\xb5
    line = re.sub('yÃÉ', '·ªπ', line) # y\xcc\x83 -> \xe1\xbb\xb9
    
    line = re.sub('aÃ£', '·∫°', line) # a\xcc\xa3 -> \xe1\xba\xa1
    line = re.sub('ƒÉÃ£', '·∫∑', line) # \xc4\x83\xcc\xa3 -> \xe1\xba\xb7
    line = re.sub('√¢Ã£', '·∫≠', line) # \xc3\xa2\xcc\xa3 -> \xe1\xba\xad
    line = re.sub('eÃ£', '·∫π', line) # e\xcc\xa3 -> \xe1\xba\xb9
    line = re.sub('√™Ã£', '·ªá', line) # \xc3\xaa\xcc\xa3 -> \xe1\xbb\x87
    line = re.sub('oÃ£', '·ªç', line) # o\xcc\xa3 -> \xe1\xbb\x8d
    line = re.sub('√¥Ã£', '·ªô', line) # \xc3\xb4\xcc\xa3 -> \xe1\xbb\x99
    line = re.sub('∆°Ã£', '·ª£', line) # \xc6\xa1\xcc\xa3 -> \xe1\xbb\xa3
    line = re.sub('iÃ£', '·ªã', line) # i\xcc\xa3 -> \xe1\xbb\x8b
    line = re.sub('uÃ£', '·ª•', line) # u\xcc\xa3 -> \xe1\xbb\xa5
    line = re.sub('∆∞Ã£', '·ª±', line) # \xc6\xb0\xcc\xa3 -> \xe1\xbb\xb1
    line = re.sub('yÃ£', '·ªµ', line) # y\xcc\xa3 -> \xe1\xbb\xb5
    
    return line



In [13]:
PUNCT_CHAR = r'([!‚Äù#$%&‚Äô()*+,-./:;<=>?@[\]^_`{|}~])' # r'["\'./,#$%&~{|}[\]`+]'
punct = re.compile(PUNCT_CHAR)

DIGIT_WITH_CHAR = r'\d+'  # r'([a-zA-Z]*)(\d+)([a-zA-A]*)'
digit = re.compile(DIGIT_WITH_CHAR)

URL = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?¬´¬ª‚Äú‚Äù‚Äò‚Äô]))'''
url = re.compile(URL)

def lowercase(data):
    return [line.lower() for line in data]

def remove_punct_char(data):
    return [punct.sub(' ', line) for line in data] 

def remove_accent(data):
    return [no_accent_vietnamese(line) for line in data]

def replace_emoticons(line):
    for reg, meanings in EMOTICONS.items():
        line = re.sub(reg, meanings, line)
    return line

def reformat_text(data):
    data = [reformat_unicode(line) for line in data]
    return [replace_emoticons(line) for line in data]

def remove_url(data):
    return [url.sub('', line) for line in data]

def remove_digit_char(data):
    return [digit.sub('', line) for line in data]

def strip_duplicate_char(word):
    if len(word) == 1:
        return word
    # Strip duplicate char at the end
    last_word = word[-1]
    word = word.rstrip(f'{last_word}')
    word = word + last_word
    # Strip duplicate char at the begining
    first_word = word[0]
    word = word.lstrip(f'{first_word}')
    word = first_word + word
    return word

def strip_head_tail(line):
    line = line.strip()
    return ' '.join([strip_duplicate_char(word) for word in line.split()])

def strip(data):
    return [strip_head_tail(line) for line in data]

def split_emoji(line):
    split_text = emoji.get_emoji_regexp().split(line)
    return [i for i in split_text if i != '' and i != 'Ô∏è'] # <= the second is for red-heart emoji


def remove_telex_error_in_word(word):
    word = re.sub('[wfjzx]', '', word)
    if re.findall('[a-z]s', word):
        word = re.sub('s', '', word)
    if re.findall('[a-z]x', word):
        word = re.sub('x', '', word)
    return word

def remove_telex_error_in_line(line):
    return ' '.join([remove_telex_error_in_word(word) for word in line.split()])

def remove_telex_error(data):
    return [remove_telex_error_in_line(line) for line in data]

In [3]:
def get_vocab(data):
    vocab = collections.defaultdict(int)
    data = [split_emoji(i) for i in data]
    for sent_emoji in data:
        for each in sent_emoji:
            for word in each.split():
                vocab[' '.join(list(word)) + ' </w>'] += 1
    return vocab

def add_end_token(line):
    line = split_emoji(line)
    new_arr = []
    for each in line:
        for word in each.split():
            new_arr.append(word + '</w>')
    return ' '.join([word for word in new_arr])

def get_stats(vocab):
        pairs = collections.defaultdict(int)
        for word, freq in vocab.items():
            symbols = word.split()
            for i in range(len(symbols)-1):
                pairs[symbols[i],symbols[i+1]] += freq
        return pairs
    
def merge_vocab(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        w_out = p.sub(''.join(pair), word)
        v_out[w_out] = v_in[word]
    return v_out

def get_tokens_from_vocab(vocab):
    tokens_frequencies = collections.defaultdict(int)
    vocab_tokenization = {}
    for word, freq in vocab.items():
        word_tokens = word.split()
        for token in word_tokens:
            tokens_frequencies[token] += freq
        vocab_tokenization[''.join(word_tokens)] = word_tokens
    return tokens_frequencies, vocab_tokenization

def get_tokens(vocab):
    tokens = collections.defaultdict(int)
    for word, freq in vocab.items():
        word_tokens = word.split(' ')
        for token in word_tokens:
            tokens[token] += freq
    return tokens

def measure_token_length(token):
    if token[-4:] == '</w>':
        return len(token[:-4]) + 1
    else:
        return len(token)

class BPETokenizer():
    def __init__(self, number_merge=1000, unknown_token='</u>'):
        self.number_merge = number_merge
        self.unknown_token = unknown_token
        
    def fit(self, data):
        vocab = get_vocab(data)
        for _ in range(self.number_merge):
            pairs = get_stats(vocab)
            if not pairs:
                break
            best_pair = max(pairs, key=pairs.get)
            vocab = merge_vocab(best_pair, vocab)
        self.tokens_frequencies, self.vocab_tokenization = get_tokens_from_vocab(vocab)
        self.sorted_tokens_tuple = sorted(self.tokens_frequencies.items(), key=lambda item: (measure_token_length(item[0]), item[1]), reverse=True)
        self.sorted_tokens = [token for (token, freq) in self.sorted_tokens_tuple]
        
    def tokenize(self, string, sorted_tokens):
        if string == '':
            return []
        if self.sorted_tokens == []:
            return [self.unknown_token]
        
        string_tokens = []
        is_tokenized = False
        for i in range(len(sorted_tokens)):
            token = sorted_tokens[i]
            token_reg = re.escape(token.replace('.', '[.]'))

            matched_positions = [(m.start(0), m.end(0)) for m in re.finditer(token_reg, string)]
            if len(matched_positions) == 0:
                continue

            substring_end_positions = [matched_position[0] for matched_position in matched_positions]
            
            substring_start_position = 0
            for substring_end_position in substring_end_positions:
                substring = string[substring_start_position:substring_end_position]
                string_tokens += self.tokenize(string=substring, sorted_tokens=sorted_tokens[i+1:])
                string_tokens += [token]
                substring_start_position = substring_end_position + len(token)
            remaining_substring = string[substring_start_position:]
            string_tokens += self.tokenize(string=remaining_substring, sorted_tokens=sorted_tokens[i+1:])
            break
            
        return string_tokens
    
    
    def tokenize_word(self, string):
        string = add_end_token(string)
        return self.tokenize(string, self.sorted_tokens)




In [4]:
def label(star):
    label = []
    for i in star:
        if i == 5 or i == 4:
            label.append('POS')
        elif i == 3:
            label.append('NEU')
        else:
            label.append('NEG')
    return label

def add_no_accent(data, label=None, random_state=52):
    no_accent = remove_accent(data)
    for i, sent in enumerate(data):
        if find_accent_vietnamese(sent):
            no_accent += [sent]
            if label:
                label += [label[i]]
    new_data = np.array(no_accent)
    new_label = np.array(label)
    
    np.random.seed(random_state)
    permutation = np.random.permutation(len(no_accent))
    new_data = new_data[permutation]
    
    if label:
        new_label = new_label[permutation]
        return new_data.tolist(), new_label.tolist()
    else:
        return new_data.tolist()

def remove_stopwords(data, stopwords):
    new_data = []
    for sent in data:
        sent = word_tokenize(sent, format='text')
        sent = [word for word in sent.split() if word not in stopwords]
        new_data.append(re.sub('_', ' ', ' '.join(sent)))
        break

    return new_data
    
def drop_uncommon_character(data, exception):
    new_data = []
    for sent in data:
        sent = [word for word in sent if (len(word) > 1 or 
                (word in exception) or
                (emoji.get_emoji_regexp().findall(word) != [])) and
                digit.findall(word) == [] ]
        new_data.append(sent)
    return new_data

In [5]:
def tokenize(data, encoder):
    data = [encoder.tokenize(i) for i in data]
    return remove_eow_sow(data)
    
def remove_eow_sow(data):
    new_data = []
    for sent in data:
        sent = [word for word in sent if word not in ['__eow', '__sow']]
        new_data.append(sent)
    return new_data

def add_padding(data, max_length=64, padding='__pad'):
    new_data = []
    for sent in data:
        if len(sent) > max_length:
            sent = sent[:max_length]
        else:
            sent += [padding]*(max_length - len(sent))
        new_data.append(sent)
    
    return new_data

def word2vec_embedding(data, model):
    new_data = []
    for sent in data:
        sent_len = len(sent)
        if sent_len == 0:
            word_vec = np.array(model.wv['__unk'])
        else:
            word_vec = np.zeros(model.vector_size)
            for word in sent:
                word_vec += np.array(model.wv[word if word in model.wv.index_to_key else '__unk'])
            word_vec = word_vec / sent_len
        new_data.append(word_vec)
    return np.array(new_data)

def create_vocab(data, min_feq=0):
    vocab = collections.defaultdict(int)
    for sent in data:
        for word in sent:
            vocab[word] += 1
    vocab = {k: v for k, v in vocab.items() if v >= min_feq}
    return {k : i for i, (k, v) in enumerate(vocab.items())}


def bow_vectorize(data, vocab):
    vocab_size = len(vocab)
    n_dim = len(data)
    new_data = np.zeros((n_dim, vocab_size))
    idf = np.zeros((n_dim, vocab_size))
    for line, sent in enumerate(data):
        for word in sent:
            x, y = line, vocab.get(word)
            if y:
                new_data[x, y] += 1
    return new_data

def tf_idf_vectorize(data, vocab):
    vocab_size = len(vocab)
    n_dim = len(data)
    new_data = np.zeros((n_dim, vocab_size))
    idf = np.zeros((n_dim, vocab_size))
    for line, sent in enumerate(data):
        sent_len = len(sent)
        for word in sent:
            x, y = line, vocab.get(word)
            if y:
                new_data[x, y] += 1
            idf[x, y] = 1
    idf = idf.sum(axis=0)
    with np.errstate(divide='ignore'):
        idf = np.where(idf == 0, 0, np.log(n_dim / idf))
    new_data = new_data * idf
    return new_data

In [6]:
raw_data = pd.read_csv('./data.csv')

In [7]:
raw_data.dropna(inplace=True)

In [8]:
def remove_empty(data, label):
    for i, sent in enumerate(data):
        if sent == '':
            data.pop(i)
            label.pop(i)
    return data, label

def preprocess(data):
    data_arr = lowercase(data)
    data_arr = reformat_text(data_arr)
    data_arr = remove_url(data_arr)
    data_arr = remove_telex_error(data_arr)
    data_arr = remove_punct_char(data_arr)
    #data_arr = remove_digit_char(data_arr)
    data_arr = strip(data_arr)
    return data_arr

def split_train_test(content, star, n_splits=1, test_size=0.2, random_state=52):
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=52)
    for train_index, test_index in split.split(content, star):
        content = np.array(content)
        star = np.array(star)
        content_train, content_test  = content[train_index].tolist(), content[test_index].tolist()
        star_train, star_test = star[train_index].tolist(), star[test_index].tolist()
        
    return content_train, content_test, star_train, star_test

In [9]:
content, star = raw_data.content, raw_data.start

In [10]:
content = content.to_list()
star = star.to_list()

In [11]:
content_train, content_test, star_train, star_test = split_train_test(content, star)

In [18]:
content_train = preprocess(content_train)
content_test = preprocess(content_test)

In [19]:
content_train

['d√¢y ƒëeo h∆°i m·ªèng nh∆∞ng c·∫£m th·∫•y ph√π h·ª£p gi√° ti·ªÅn ƒë√≥ng g√≥p c·∫©n th·∫≠n',
 'thi·∫øu 1 khuy tuy nhi√™n v·∫´n cho 5 sao v√¨ √°o ƒë·∫πp y h√¨nh th√≠ch',
 'c√°i n√†y ch·ªâ gi√†nh cho tu·ªïi m·ªõi l·ªõn m·ªõi m·∫∑c ƒëc ak',
 'nh∆∞ √† ph√≤ng ko m√πi th√¨ ƒë√∫ng h∆°n',
 'ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m ·ªïn',
 'h√†ng b·ªã l·ªói d√πng kh√¥ng ƒë∆∞·ª£c shop ph·∫£n h·ªìi ƒë·ªïi tr·∫£ ƒë·ªÉ m√¨nh s·ª≠a l·∫°i ƒë√°nh gi√° nh√©',
 'cute b√°nh b√®o ‚ù§',
 'l·∫ßn tr∆∞·ªõc mua th√¨ 3 ng√†y m·ªõi nh·∫≠n ƒëc h√†ng nh∆∞ng l·∫ßn n√†y m·ªõi ƒë·∫∑t hqua th√¨ hnay ƒë√£ nh·∫≠n ƒëc h√†ng r√πi d√π nh√† e ·ªü a hihi',
 'ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m tuy·ªát v·ªùi ƒë√≥ng g√≥i s·∫£n ph·∫©m r·∫•t ƒë·∫πp v√† ch·∫Øc ch·∫Øn',
 'mua 3 m√†u c√≥ m·ªói m√†u ƒëen chu·∫©n s',
 'ƒë√≥ng g√≥i ch·∫Øc ch·∫Øn v√† giao h√†ng nhanh',
 'ch√°n',
 'ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m tuy·ªát v·ªùi chun h∆°i r·ªông 1 ch√∫t',
 'n√™n mua r·ªông h∆°n 1 sie',
 'ƒë∆∞·ªùng may h∆°i ·∫©u ch·ªâ c√≥ v·∫Øt s·ªï',
 'r·∫•t h√†i l√≤ngtuy·ªát

In [20]:
content_train, star_train = remove_empty(content_train, star_train)

In [None]:
index = 0
for sent in content_test:
    if len(sent) == 0:
        print(index)
    index += 1

In [21]:
from sklearn.base import BaseEstimator, TransformerMixin
class Tokenizer(BaseEstimator, TransformerMixin):
    def __init__(self, vocab_size=2000, pct_bpe=1, ngram_min=2, ngram_max=7, min_feq=5, exceptions=['k', '·∫°'], return_vocab=False):
        self.vocab_size = vocab_size
        self.pct_bpe = pct_bpe
        self.ngram_min = ngram_min
        self.ngram_max = ngram_max
        self.min_feq = min_feq
        self.return_vocab = return_vocab
        self.exceptions = exceptions
    
    def fit(self, X, y=None):
#         print('token fit call')
        self.encoder = Encoder(self.vocab_size,
                               pct_bpe=self.pct_bpe,
                               ngram_min=self.ngram_min,
                               ngram_max=self.ngram_max)
        self.encoder.fit(X)
        token = drop_uncommon_character(tokenize(X, self.encoder), self.exceptions)
        return self
    
    def transform(self, X, *args, **kwargs):
#         print('token transfrom call')
        token = tokenize(X, self.encoder)
        if self.return_vocab:
            return token, self.encoder.bpe_vocab
        else:
            return token

    
class WithoutStopWord(BaseEstimator, TransformerMixin):
    def __init__(self, stopwords):
        self.stopwords = stopwords
        
    def fit(self, X, y=None):
        return self

    def transform(self, X, *args, **kwargs):
        data, vocab = X
        data = remove_stopwords(data, self.stopwords)
    
        vocab = {k : v for k, v in vocab.items() if k not in self.stopwords}
        vocab = {k : i for i, (k, v) in enumerate(vocab.items())}
        
        return data
        
    
class Padding(BaseEstimator, TransformerMixin):
    def __init__(self, max_length=20):
        self.max_length = max_length
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, *args, **kwargs): 
        data, vocab = X
        return add_padding(data, self.max_length)
    
class BowVectorizer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.vocab = create_vocab(X)
        return self
    
    def transform(self, X, *args, **kwargs):
        return bow_vectorize(X, self.vocab)
        
class TfIdfVectorizer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
#         print('tf fit call')
        self.vocab = create_vocab(X)
        return self
    
    def transform(self, X, *args, **kwargs):
#         print('tf transform call')
        return tf_idf_vectorize(X, self.vocab)

        
class W2VEmbedding(BaseEstimator, TransformerMixin):
    def __init__(self,  vector_size=100, min_count=5, sg=True, hs=True, window=10, workers=4):
        self.vector_size = vector_size
        self.min_count = min_count
        self.sg = sg
        self.hs = hs
        self.window = window
        self.workers = workers
        
    def fit(self, X, y=None):
        self.model = gensim.models.Word2Vec(X,
                                            vector_size=self.vector_size,
                                            min_count=self.min_count,
                                            sg=self.sg,
                                            hs=self.hs,
                                            window=self.window,
                                            workers=self.workers)
        return self
    
    def transform(self, X, *args, **kwargs):
        return word2vec_embedding(X, self.model)
            
class ReportResult(BaseEstimator, TransformerMixin):
    def __init__(self, estimator_name):
        self.estimator_name = estimator_name
        
    def fit(self, X, y=None):
        self.predict_label = label(X)
        self.true_label = label(y)
        return self

    def transform(self, X):
        report = classification_report(self.predict_label, self.true_label, output_dict=False)
#         if not os.path.exists('report.json'):
#             with open('report.json', 'w') as f:
#                 json.dump({self.estimator_name : [report]}, f)
#         else:            
#             with open('report.json', 'r') as f:
#                 data = json.load(f)
#                 data[self.estimator_name].append(report)
#                 json.dumps(data)
        return report

In [None]:
def load_stopwords():
    with open('./stopwords-dash.txt', 'r') as f:
        lines = f.readlines()
        stopwords = [line[:-1] for line in lines if ' ' not  in line]
    return stopwords

In [None]:
stopwords = load_stopwords()
stopwords

In [114]:
content_train, star_train = add_no_accent(content_train, star_train)

In [None]:
from sklearn.utils import resample
def upsample(data, label):
    data = np.array(data)
    label = np.array(label)
    data_arr = np.c_[data, label]
    label = data_arr[:, 1]
    pos_label = (label == '4') | (label == '5')
    neu_label = label == '3'
    neg_label = (label == '1') | (label == '2')
    pos_data = resample(data_arr[pos_label], replace=True, n_samples=len(pos_label), random_state=52)
    neu_data = resample(data_arr[neu_label], replace=True, n_samples= int(len(pos_label) / 2), random_state=52)
    neg_data = resample(data_arr[neg_label], replace=True, n_samples=int(len(pos_label) / 4), random_state=52)
    
    data_arr = np.concatenate((pos_data, neu_data, neg_data))
    data_arr = np.random.permutation(data_arr)
    return data_arr[:, 0].tolist(), data_arr[: ,1].astype(np.int8).tolist()

In [None]:
content_train, star_train = upsample(content_train, star_train)

In [106]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
bow_processing = Pipeline([('tokenizer', Tokenizer(8192, 0.5)), ('vectorizer', BowVectorizer())])
tf_idf_processing = Pipeline([('tokenizer', Tokenizer(8192, 0.5)), ('vectorizer', TfIdfVectorizer()), ('dimension_redudction', PCA(n_components=1500))])
w2v_processing = Pipeline([('tokenizer', Tokenizer(3500, 0.7)), ('embedding', W2VEmbedding(vector_size=1000))])

In [None]:
from sklearn.cluster import KMeans
kmeans = Pipeline([('tokenizer', Tokenizer(3500, 0.7)), ('embedding', TfIdfEmbedding()), ('cluster', KMeans(n_clusters=5, random_state=52))])
kmeans.fit(content_train, content_test)

In [None]:
report = ReportResult('Kmean')
print(report.fit_transform(kmeans.predict(content_test), star_test))

In [95]:
from sklearn.naive_bayes import MultinomialNB
multi_nb = Pipeline([('feature_processing', tf_idf_processing), ('estimator', MultinomialNB())])
multi_nb.fit(content_train, star_train)

ValueError: Negative values in data passed to MultinomialNB (input X)

In [None]:
report = ReportResult('MultiNB')
print(report.fit_transform(multi_nb.predict(content_test), star_test))

In [139]:
from sklearn.naive_bayes import BernoulliNB
bernoulli_nb = Pipeline([('feature_processing', bow_processing), ('estimator', BernoulliNB())])
bernoulli_nb.fit(content_train, star_train)

Pipeline(steps=[('feature_processing',
                 Pipeline(steps=[('tokenizer',
                                  Tokenizer(pct_bpe=0.5, vocab_size=8192)),
                                 ('embedding', BowEmbedding())])),
                ('estimator', BernoulliNB())])

In [141]:
report = ReportResult('BernoulliNB')
print(report.fit_transform(bernoulli_nb.predict(content_test), star_test))

              precision    recall  f1-score   support

         NEG       0.56      0.63      0.60      1189
         NEU       0.38      0.34      0.36      1049
         POS       0.85      0.85      0.85      4050

    accuracy                           0.72      6288
   macro avg       0.60      0.61      0.60      6288
weighted avg       0.72      0.72      0.72      6288



In [104]:
from sklearn.linear_model import LogisticRegression
logistic_clf = Pipeline([('feature_processing', tf_idf_processing), ('estimator', LogisticRegression(max_iter=2000))])
logistic_clf.fit(content_train, star_train)

Pipeline(steps=[('feature_processing',
                 Pipeline(steps=[('tokenizer',
                                  Tokenizer(pct_bpe=0.5, vocab_size=8192)),
                                 ('vectorizer', TfIdfVectorizer()),
                                 ('dimension_redudction',
                                  PCA(n_components=1500))])),
                ('estimator', LogisticRegression(max_iter=2000))])

In [101]:
logistic_clf.predict_proba(['x·∫•u v√£i l·ªìn'])

array([[0.14217951, 0.15276737, 0.2689083 , 0.19821674, 0.23792808]])

In [105]:
report = ReportResult('Softmax')
print(report.fit_transform(logistic_clf.predict(content_test), star_test))

              precision    recall  f1-score   support

         NEG       0.63      0.71      0.67      1181
         NEU       0.35      0.36      0.36       932
         POS       0.90      0.86      0.88      4175

    accuracy                           0.76      6288
   macro avg       0.63      0.64      0.64      6288
weighted avg       0.77      0.76      0.76      6288



In [None]:
from sklearn.svm import LinearSVC
multi_svm = Pipeline([('feature_preprocessing', w2v_processing), ('estimator', LinearSVC(multi_class='ovr', max_iter=2000))])
multi_svm.fit(content_train, star_train)

In [None]:
predict = multi_svm.predict(content_test)

In [None]:
report = ReportResult('LinearSVC')
report.fit_transform(predict, star_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier
d_tree_clf = Pipeline([('feature_processing', tf_idf_processing), ('estimator', DecisionTreeClassifier())])
d_tree_clf.fit(content_train, star_train)

In [None]:
report = ReportResult('Decision tree')
print(report.fit_transform(d_tree_clf.predict(content_test), star_test))

In [107]:
from sklearn.ensemble import RandomForestClassifier
ran_forest_clf = Pipeline([('feature_processing', w2v_processing), ('estimator', RandomForestClassifier(n_estimators=500, min_samples_leaf=2, n_jobs=-1))])
ran_forest_clf.fit(content_train, star_train)

Pipeline(steps=[('feature_processing',
                 Pipeline(steps=[('tokenizer',
                                  Tokenizer(pct_bpe=0.7, vocab_size=3500)),
                                 ('embedding',
                                  W2VEmbedding(vector_size=1000))])),
                ('estimator',
                 RandomForestClassifier(min_samples_leaf=2, n_estimators=500,
                                        n_jobs=-1))])

In [108]:
report = ReportResult('Random_forest')
print(report.fit_transform(ran_forest_clf.predict(content_test), star_test))

              precision    recall  f1-score   support

         NEG       0.60      0.71      0.65      1126
         NEU       0.35      0.37      0.36       895
         POS       0.90      0.85      0.88      4267

    accuracy                           0.76      6288
   macro avg       0.62      0.64      0.63      6288
weighted avg       0.77      0.76      0.76      6288



In [None]:
from sklearn.model_selection import GridSearchCV
grid_params = [{'padding__max_length': np.arange(10, 100, 10), 'embedding__vector_size': np.arange(10, 100, 10)}]
estimator = Pipeline([('tokenizer', Tokenizer()), ('padding', Padding()), ('embedding', W2VEmbedding()), ('ran_forest_clf', RandomForestClassifier(n_estimators=500, min_samples_leaf=4))])
gs_ran_forest = GridSearchCV(estimator, param_grid=grid_params, scoring='accuracy', cv=5)

In [None]:
gs_ran_forest.fit(content_train, star_train)

In [None]:
report = ReportResult('MultiNB')
print(report.fit_transform(gs_ran_forest.predict(content_test), star_test))

In [None]:
gs_ran_forest.best_params_

In [None]:
gs_mul_nb.best_params_

In [None]:
from sklearn.model_selection import GridSearchCV
grid_params = [{'tokenizer__vocab_size': np.arange(4000, , 100), 'tokenizer__pct_bpe': np.arange(0.5, 1, 0.1)}]
estimator = Pipeline([('tokenizer', Tokenizer()), ('embedding', TfIdfEmbedding()), ('multi_nb', MultinomialNB())])
gs_multi_nb = GridSearchCV(estimator, param_grid=grid_params, scoring='accuracy', cv=5)

In [None]:
gs_multi_nb.fit(content_train, star_train)

In [None]:
report = ReportResult('MultiNB')
print(report.fit_transform(gs_multi_nb.predict(content_test), star_test))

In [None]:
gs_multi_nb.best_params_

In [None]:
num = 10
count = 0
for i, sent in enumerate(content):
    if 'toi' in sent:
        print(sent)
        count += 1
#         if count > num:
#             break

In [1]:
from emoticons import EMOTICONS
EMOTICONS

{'[:;\\\'="]+-?([)>}3]o0^)+': 'th√≠ch',
 '[:;\\\'="]+-?([(<{)])+': 't·ªá',
 '[:;\\\'="]+v': 'b√¨nh th∆∞·ªùng',
 '<3': '‚ù§Ô∏è',
 '\\^[-_]?\\^': 'th√≠ch',
 "[:;\\']+-?x": 'b√¨nh th∆∞·ªùng',
 '~~': 'b√¨nh th∆∞·ªùng',
 '>.?<': 'th√≠ch',
 '=[_.]=': 't·ªá',
 'T[._-]?T': 't·ªá',
 '-[_.]?-': 't·ªá',
 '@[._-]?@': 't·ªá',
 '!+': 'tuy·ªát',
 '\\?[!?]?': 't·ªá'}

In [3]:
re.sub('@[.]?@', 'alo', line)

'Toi rat t·ªá'

In [None]:
sys.path

In [130]:
for i, (sent, star) in enumerate(zip(content, star)):
    if 'ch·∫•t' in sent:
        print(sent, star)

TypeError: 'int' object is not iterable

In [None]:
def remove_url(line):
    return re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?¬´¬ª‚Äú‚Äù‚Äò‚Äô]))''', '', line)

In [108]:
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
pca.fit(np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]))

PCA(n_components=1)

In [110]:
pca.transform([[3, 4]])

array([[-4.69513088]])

In [122]:
content

['√Åo bao ƒë·∫πp ·∫°!',
 'Tuy·ªát v·ªùi',
 '2day ao khong giong trong',
 'M√πi th∆°m,b√¥i l√™n da m·ªÅm da',
 'V·∫£i ƒë·∫πp, d√†y d·∫∑n',
 'H√†ng r·∫•t ƒë·∫πp, r·∫•t chi l√† ∆∞ng √Ω',
 'Ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m t·ªët, date d√†i',
 'ƒÇn n√≥i v√† th√°i ƒë·ªô ph·ª•c v·ª• t·ªët',
 'ƒê√≥ng g√≥i s·∫£n ph·∫©m ch·∫Øc ch·∫Øn',
 't·∫•t s·ªùn h·∫øt ca ch∆∞a d√πng m√† vay r',
 'Shop ph·ª•c v·ª• r·∫•t t·ªët',
 'M·∫∑c th√¨ c≈©ng ƒëc',
 'Ch·∫•t v·∫£i kh·ªèi ch√™',
 'Th·ªùi gian giao h√†ng r·∫•t nhanh',
 'Ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m tuy·ªát v·ªùi',
 'v·∫£i h∆°i th√¥ c·ª©ng Th·ªùi gian giao h√†ng nhanh',
 'Ch·∫•t l∆∞·ª£ng sp ch∆∞a th·∫≠t s·ª± ƒë·∫πp nhe shop',
 'R·∫•t ƒë√°ng ti·ªÅn Th·ªùi gian giao h√†ng r·∫•t nhanh',
 'Qu·∫ßn r·∫•t ƒë·∫πp m·∫∑c v·ª´a v·∫∑n',
 'C·∫£m gi√°c mua h√†ng b·ªã h·ªõ th·∫≠t t·ªá',
 'Khi mua v·ªÅ n√™n ƒëi s·ª≠a l·∫°i',
 'V·ªõi gi√° n√†y th√¨ s·∫£n ph·∫©m t·∫°m ·ªïn ch∆∞a ƒëc g·ªçi l√† ƒë·∫πp l·∫Øm',
 'R·∫•t ƒë√°ng ti·ªÅn Th·ªùi gian giao h√†ng r·∫•t nhanh Ch·∫•t l∆∞·ª£ng s

In [149]:
from nltk.util import ngrams
from nltk.lm.preprocessing import pad_both_ends

In [118]:
encoder = Encoder(4000, pct_bpe=0.8, ngram_max=7)
encoder.fit(content_train)

In [124]:
content_train

['d√¢y ƒëeo h∆°i m·ªèng nh∆∞ng c·∫£m th·∫•y ph√π h·ª£p gi√° ti·ªÅn ƒë√≥ng g√≥p c·∫©n th·∫≠n',
 'thi·∫øu 1 khuy tuy nhi√™n v·∫´n cho 5 sao v√¨ √°o ƒë·∫πp y h√¨nh th√≠ch',
 'c√°i n√†y ch·ªâ gi√†nh cho tu·ªïi m·ªõi l·ªõn m·ªõi m·∫∑c ƒëc ak',
 'nh∆∞ √† ph√≤ng ko m√πi th√¨ ƒë√∫ng h∆°n',
 'ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m ·ªïn',
 'h√†ng b·ªã l·ªói d√πng kh√¥ng ƒë∆∞·ª£c shop ph·∫£n h·ªìi ƒë·ªïi tr·∫£ ƒë·ªÉ m√¨nh s·ª≠a l·∫°i ƒë√°nh gi√° nh√©',
 'cute b√°nh b√®o ‚ù§',
 'l·∫ßn tr∆∞·ªõc mua th√¨ 3 ng√†y m·ªõi nh·∫≠n ƒëc h√†ng nh∆∞ng l·∫ßn n√†y m·ªõi ƒë·∫∑t hqua th√¨ hnay ƒë√£ nh·∫≠n ƒëc h√†ng r√πi d√π nh√† e ·ªü a hihi',
 'ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m tuy·ªát v·ªùi ƒë√≥ng g√≥i s·∫£n ph·∫©m r·∫•t ƒë·∫πp v√† ch·∫Øc ch·∫Øn',
 'mua 3 m√†u c√≥ m·ªói m√†u ƒëen chu·∫©n s',
 'ƒë√≥ng g√≥i ch·∫Øc ch·∫Øn v√† giao h√†ng nhanh',
 'ch√°n',
 'ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m tuy·ªát v·ªùi chun h∆°i r·ªông 1 ch√∫t',
 'n√™n mua r·ªông h∆°n 1 sie',
 'ƒë∆∞·ªùng may h∆°i ·∫©u ch·ªâ c√≥ v·∫Øt s·ªï',
 'r·∫•t h√†i l√≤ngtuy·ªát

In [125]:
encoder.tokenize('ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m tuy·ªát v·ªùi ƒë√≥ng g√≥i s·∫£n ph·∫©m r·∫•t ƒë·∫πp v√† ch·∫Øc ch·∫Øn')

['ch·∫•t',
 'l∆∞·ª£ng',
 's·∫£n',
 'ph·∫©m',
 'tuy·ªát',
 'v·ªùi',
 'ƒë√≥ng',
 'g√≥i',
 's·∫£n',
 'ph·∫©m',
 'r·∫•t',
 'ƒë·∫πp',
 'v√†',
 'ch·∫Øc',
 'ch·∫Øn']

In [162]:
def retrieve_ngrams(txt, n):
    return [tuple(txt[i:i+n]) for i in range(len(txt)-(n-1))]

In [120]:
token = tokenize(content_train, encoder)

In [185]:
token = [retrieve_ngrams(sent, n=2) for sent in token]

In [52]:
tf_idf = TfIdfVectorizer()

In [53]:
qwe = tf_idf.fit_transform(token)

In [54]:
qwe

array([[0.        , 6.41884228, 2.65198604, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [51]:
len(create_vocab(token))

2607

In [80]:
from sklearn.decomposition import PCA
pca = PCA(n_components=200)
pca.fit_transform(qwe)

array([[-0.70078761, -0.01557748,  0.16278588, ...,  0.14387661,
         0.47759624,  0.56090423],
       [-1.51401563,  0.28112652,  1.85482028, ..., -0.02688408,
         0.65530188,  0.69226693],
       [-1.65157098,  0.24040771,  1.74091094, ...,  0.68216447,
        -0.61166961,  0.09808958],
       ...,
       [-0.98935517, -0.5163185 ,  0.63236895, ...,  0.29055158,
         0.38132899, -0.04160633],
       [-1.27579119, -0.17026756,  2.0014487 , ..., -0.18733704,
        -0.49977641,  0.06103064],
       [-1.21451542, -0.46919771, -0.98330039, ..., -0.01138177,
        -0.18915744, -0.21099735]])

In [66]:
result = []
for feature in range(qwe.shape[-1]):
    result.append(np.corrcoef(qwe[:, feature], np.array(star_train)))
result = {k : v for k, v in enumerate(result)}


TypeError: 'builtin_function_or_method' object is not iterable

In [75]:
result = dict(sorted(result.items(), key=lambda item: item[1][0,1], reverse=True))

In [63]:
np.corrcoef(qwe[:, 0], star_train)

  c /= stddev[:, None]
  c /= stddev[None, :]


array([[nan, nan],
       [nan,  1.]])

In [76]:
result

{0: array([[nan, nan],
        [nan,  1.]]),
 26: array([[1.        , 0.29313892],
        [0.29313892, 1.        ]]),
 93: array([[1.        , 0.25333768],
        [0.25333768, 1.        ]]),
 94: array([[1.        , 0.25261839],
        [0.25261839, 1.        ]]),
 96: array([[1.        , 0.24485047],
        [0.24485047, 1.        ]]),
 51: array([[1.       , 0.2044893],
        [0.2044893, 1.       ]]),
 52: array([[1.       , 0.2038997],
        [0.2038997, 1.       ]]),
 107: array([[1.        , 0.20291119],
        [0.20291119, 1.        ]]),
 99: array([[1.       , 0.1996583],
        [0.1996583, 1.       ]]),
 98: array([[1.        , 0.18996207],
        [0.18996207, 1.        ]]),
 171: array([[1.        , 0.18580321],
        [0.18580321, 1.        ]]),
 11: array([[1.        , 0.17355873],
        [0.17355873, 1.        ]]),
 95: array([[1.      , 0.172162],
        [0.172162, 1.      ]]),
 50: array([[1.        , 0.17024626],
        [0.17024626, 1.        ]]),
 97: array(

In [79]:
tf_idf.vocab

{'d√¢y': 0,
 'ƒëeo': 1,
 'h∆°i': 2,
 'm·ªèng': 3,
 'nh∆∞ng': 4,
 'c·∫£m': 5,
 'th·∫•y': 6,
 'ph√π': 7,
 'h·ª£p': 8,
 'gi√°': 9,
 'ti·ªÅn': 10,
 'ƒë√≥ng': 11,
 'g√≥p': 12,
 'c·∫©n': 13,
 'th·∫≠n': 14,
 'thi·∫øu': 15,
 '1': 16,
 'khuy': 17,
 'tuy': 18,
 'nhi√™n': 19,
 'v·∫´n': 20,
 'cho': 21,
 '5': 22,
 'sao': 23,
 'v√¨': 24,
 '√°o': 25,
 'ƒë·∫πp': 26,
 'y': 27,
 'h√¨nh': 28,
 'th√≠ch': 29,
 'c√°i': 30,
 'n√†y': 31,
 'ch·ªâ': 32,
 'gi√†': 33,
 'nh': 34,
 'tu·ªïi': 35,
 'm·ªõi': 36,
 'l·ªõn': 37,
 'm·∫∑c': 38,
 'ƒëc': 39,
 'ak': 40,
 'nh∆∞': 41,
 '√†': 42,
 'ph√≤ng': 43,
 'ko': 44,
 'm√πi': 45,
 'th√¨': 46,
 'ƒë√∫ng': 47,
 'h∆°n': 48,
 'ch·∫•t': 49,
 'l∆∞·ª£ng': 50,
 's·∫£n': 51,
 'ph·∫©m': 52,
 '·ªïn': 53,
 'h√†ng': 54,
 'b·ªã': 55,
 'l·ªói': 56,
 'd√πng': 57,
 'kh√¥ng': 58,
 'ƒë∆∞·ª£c': 59,
 'shop': 60,
 'ph·∫£n': 61,
 'h·ªìi': 62,
 'ƒë·ªïi': 63,
 'tr·∫£': 64,
 'ƒë·ªÉ': 65,
 'm√¨nh': 66,
 's·ª≠a': 67,
 'l·∫°i': 68,
 'ƒë√°nh': 69,
 'nh√©': 70,
 'cute': 71,
 'b√°nh': 72,
 'b√®o': 73,
 '‚ù

In [85]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(qwe, star_train)

LinearDiscriminantAnalysis()

In [86]:
test = tokenize(content_test, encoder)
test = tf_idf.transform(test)

In [113]:
w2v = W2VEmbedding(vector_size=300)
w2v.fit(token)

W2VEmbedding(vector_size=300)

In [116]:
w2v.model.wv.most_similar('ƒë·∫πp')

[('v·ªù', 0.6161928772926331),
 ('ch·∫Øn', 0.6022834181785583),
 ('r·∫•t', 0.5944746136665344),
 ('ch·∫•t', 0.5929152965545654),
 ('v·ªùi', 0.5882195234298706),
 ('h·ªãn', 0.5881131887435913),
 ('tuy·ªát', 0.5857023000717163),
 ('d·∫πp', 0.5839597582817078),
 ('ch·∫Øc', 0.5815643072128296),
 ('‚ô•', 0.5734853148460388)]

In [127]:
for sent in token:
    if 'v·ªù' in sent:
        print(sent)

['ch·∫•t', 'l∆∞·ª£ng', 's·∫£n', 'ph·∫©m', 'tuy·ªát', 'v·ªù', 'i', 'r·∫•t', 'ƒë√°ng', 'ti·ªÅn']
['ch·∫•t', 'l∆∞·ª£ng', 's·∫£n', 'ph·∫©m', 'tuy·ªát', 'v·ªù', 'i', 'ƒë√≥ng', 'g√≥i', 's·∫£n', 'ph·∫©m', 'r·∫•t', 'ƒë·∫πp', 'v√†', 'ch·∫Øc', 'ch·∫Øn', 'th·ªùi', 'gian', 'giao', 'h√†ng', 'r·∫•t', 'nhanh']
['ch·∫•t', 'l∆∞·ª£ng', 's·∫£n', 'ph·∫©m', 'tuy·ªát', 'v·ªù', 'r·∫•t', 'ƒë√°ng', 'ti·ªÅn', 'th·ªùi', 'gian', 'giao', 'h√†ng', 'r·∫•t', 'nhanh', 'shop', 'ph·ª•c', 'v·ª•', 'r·∫•t', 't·ªët']
['mang', 'r·∫•t', '∆∞ng', 'gi√°', 'v·ª´a', 't√∫i', 'ti·ªÅn', 'ch·∫•t', 'l∆∞·ª£ng', 's·∫£n', 'ph·∫©m', 'tuy·ªát', 'v·ªù']
['ch·∫•t', 'l∆∞·ª£ng', 's·∫£n', 'ph·∫©m', 'tuy·ªát', 'v·ªùi', 'ƒë√≥ng', 'g√≥i', 's·∫£n', 'ph·∫©m', 'r·∫•t', 'ƒë·∫πp', 'v√†', 'ch·∫Øc', 'ch·∫Øn', 'ch·∫•t', 'l∆∞·ª£ng', 's·∫£n', 'ph·∫©m', 'tuy·ªát', 'v·ªù']
['ch·∫•t', 'l∆∞·ª£ng', 's·∫£n', 'ph·∫©m', 'tuy·ªát', 'v·ªù', 'i', 'ƒë√≥ng', 'g√≥i', 's·∫£n', 'ph·∫©m', 'r·∫•t', 'ƒë·∫πp', 'v√†', 'ch·∫Øc', 'ch·∫Øn', 'shop', 'ph·ª•c', 'v·ª•', 'r·∫•t', 't·ªët