<h1 style = 'text-align: center'> <b>Week 03: Text Feature Extraction</b> </h1>

- Name: Võ Nguyễn Hoàng Kim
- Mentee ID: 240103

In [1]:
pip install underthesea

Collecting underthesea
  Downloading underthesea-6.8.4-py3-none-any.whl.metadata (15 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl.metadata (1.7 kB)
Downloading underthesea-6.8.4-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hDownloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl (657 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m37.5

In [2]:
from __future__ import division  # Python 2 users only
import nltk
import re
from nltk.corpus import stopwords
import json
import underthesea
from underthesea import word_tokenize
from underthesea import text_normalize
import random
from sklearn.feature_extraction.text import TfidfTransformer

## 1. Read Data

In [3]:
def loadRawData_JSON(file_path, encoding = 'utf-8'):
    with open(file_path, 'r', encoding = encoding,) as file:
        return json.load(file)
    
def loadStopWords_TXT(file_path, encoding = 'utf-8'):
    with open(file_path, 'r', encoding = encoding) as file:
        return file.read().splitlines()
    
def getArticleContent(article):
    key_json = ['title', 'content']
    # get content and title of each article
    content = ''
    content = '\n'.join(str(article[key]) for key in key_json)
    return content

raw_data_path = "/kaggle/input/vietnamese-online-news-dataset/news_dataset.json"
stop_words_path = "/kaggle/input/vietnamese-stop-words/vietnamese-stopwords.txt"
    
# load raw data and stop words from files
raw_data = loadRawData_JSON(raw_data_path)
stop_words = loadStopWords_TXT(stop_words_path)
    
# get content from articles
articles_content = [getArticleContent(article) for article in raw_data]    

## 2. Preprocess Text 

In [4]:
# normalize text 
def normalizeText(article):
    return text_normalize(article)

# remove unnecessary punctuation 
def cleanText(article):    
    # remove several punctuation such as ; , ( ) [ ] ... 
    return re.sub(r'[^\w\s/:.-]', '', article)

# remove stop word
def removeStopWords(article, stop_words):
    # using regex with re.sub to remove stop word (better than using String replace)
    stop_words_pattern = r'\b(?:' + '|'.join(map(re.escape, stop_words)) + r')\b'
    cleaned_text = re.sub(stop_words_pattern, '', article)
    # remove redundant spaces
    cleaned_text = ' '.join(cleaned_text.split())
    return cleaned_text

# tokenize by underthesea
def tokenization(article):
    return word_tokenize(article)

def prepocessText(articles_content, stop_words):
    processed_articles = []
    for article in articles_content:
        # lower case
        article = article.lower()
        # normalize text
        normalized_text = normalizeText(article)
        # clean text
        cleaned_text = cleanText(normalized_text)
        # remove stop words
        removedStopWords_text = removeStopWords(cleaned_text, stop_words)
        # tokenize text
        tokens = tokenization(removedStopWords_text)        
        # re-clean tokens to remove unnecessary characters like '-', '.', etc.
        cleaned_tokens = [token for token in tokens if re.search(r'[a-zA-Z0-9]', token)]
        # add tokens in result list
        processed_articles.append(cleaned_tokens)
    return processed_articles

# get a number of article as sample to process (~1000 article)
sample_contents = random.sample(articles_content, 1000)

# preprocess samples
processedArticles = prepocessText(sample_contents, stop_words)

## 3. Text Feature Extraction

### a. Bag of Words with Unigram

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import numpy as np
from scipy.sparse import csr_matrix

def build_BagOfWords_Model(processed_articles):
    # flatten list of tokens from all articles
    # using Counter for creating vocab
    all_tokens = [token for article in processed_articles for token in article]
    
    # build vocab from all tokens
    vocabulary = list(set(all_tokens))      # remove duplicated tokens 
    
    total_bow_vetors = []
    
    # create bow vector for each article
    for article in processed_articles:
        # create a counter for tokens in the articles
        token_counter = Counter(article)
        
        # create bow vector (return the frequency of a token in vocabulary, 0 is default)
        bow_vector = [token_counter.get(token, 0) for token in vocabulary]
        
        # add bow vector into the total list
        total_bow_vetors.append(bow_vector)
        
        
    return np.array(total_bow_vetors), vocabulary 
        
def convertArrayToSparse(np_array):
    return csr_matrix(np_array)

bow_vectors, vocabulary = build_BagOfWords_Model(processedArticles)


In [6]:
print(bow_vectors.shape)
print(vocabulary[:100])
print(bow_vectors[:100])

(1000, 30941)
['mv tác phẩm', 'hầu quan văn võ lui', 'nghiêm nhờn', 'gia tài', 'phế khát', 'trọng thần', 'điện mê', 'hạ sinh', 'bọt khí', 'vịnh', '9999', 'https://phunuvietnam.vn/vi--dong-y-noi-an--cam-bo-phan-nao-phi-phan--dau-la-pha', 'trình báo', 'mv', 'địa vận động', 'bezos', 'lượn mặt', 'gái hữu', 'bantan', 'kích hoạt quy trình', 'phnom', 'quyết định', 'sh', 'lùm xùm', 'chất clc', 'chiến dân', '6', 'ubnd', 'sống thân', 'thuyên góp', 'thị giác', 'rơi trời', 'lo toan', '27/7 ronaldo', 'thủy lực', 'ea hiao', 'khai khoáng', 'báo hiệu', 'sagamihara', 'tập quán', 'truy thăng', 'bảo đảm', 'dự đoán', 'bỉ', '6815', 'liên lạc', 'đăng trang', 'trung quốc kích', 'công tố viên', 'ios', 'liệu tài', 'chậm trễ', 'trung đại', 'na-98968', 'ngoại hành tinh', 'e-contracts', 'đói', 'hoạch định', 'chí phát ngôn', 'trung văn', 'kiến đạo', 'di sản', 'thủ đông', '977', 'am kinh', 'đề nguyện vọng', 'găng độ', 'máy nhân', 'giải ổn', 'hoa hậu tâm huyết', 'florida', 'mỹ ao', 'hạn chế', '3.605', 'varbergs', 's

### b. TF-IDF

In [7]:
def build_TFIDF_Model(frequency_matrix):
    # init TF-IDF Transformer
    tfidf_transformer = TfidfTransformer()
    # fit & transform from the frequency matrix
    tfidf_matrix = tfidf_transformer.fit_transform(frequency_matrix)  
    return tfidf_matrix

In [8]:
tfidf_matrix = build_TFIDF_Model(bow_vectors)
tfidf_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### c. Word2Vec

In [9]:
from gensim.models import Word2Vec

# get all tokens from documents
doc_tokens = processedArticles
# doc_tokens[0]

# init Word2Vec model
model = Word2Vec(doc_tokens, vector_size=100, window=5, min_count=1, workers=4)

# apply the model on a doc_tokens to vectorize all its tokens
vector = model.wv[doc_tokens[0]]

In [10]:
# find similar word with the document
finding_word = 'công ty'
similar_words = model.wv.most_similar(finding_word)
print(similar_words)

[('cổ', 0.9993972182273865), ('tập đoàn', 0.9993129968643188), ('tài sản', 0.9992716908454895), ('hộ', 0.9992150068283081), ('hóa', 0.9991352558135986), ('dịch vụ', 0.9991027116775513), ('thu', 0.9990847110748291), ('thương mại', 0.9990014433860779), ('tặng', 0.9989170432090759), ('phát hành', 0.9989066123962402)]


### d. Bag of Words with Bigram

In [11]:
def build_BagOfWords_Model_Bigram(processed_articles, ngram=2):
    # flatten list of tokens from all articles and create n-grams
    all_ngrams = []
    
    for article in processed_articles:
        # create bigram word for each article
        ngrams = [tuple(article[i:i + ngram]) for i in range(len(article) - ngram + 1)]
        all_ngrams.extend(ngrams)

    # build vocab from all n-grams
    vocabulary = list(set(all_ngrams))  # remove duplicated n-grams
    
    total_bow_vectors = []
    
    # create bow vector for each article
    for article in processed_articles:
        # create bigram for each article
        token_counter = Counter(tuple(article[i:i + ngram]) for i in range(len(article) - ngram + 1))
        
        # create bow vector (return the frequency of a token in vocabulary, 0 is default)
        bow_vector = [token_counter.get(gram, 0) for gram in vocabulary]
        
        # add bow vector into the total list
        total_bow_vectors.append(bow_vector)
        
    return np.array(total_bow_vectors), vocabulary

bow_vectors_bigram, vocabulary_bigram = build_BagOfWords_Model_Bigram(processedArticles, ngram=2)


In [14]:
print(vocabulary_bigram[:500])
bow_vectors_bigram[0][:1000]

[('huân chương', 'hoa cúc'), ('bố', 'thành'), ('hồ', 'viết'), ('đất', 'san ủi'), ('bụng', 'buồn nôn'), ('nguy chìm', 'đắm'), ('phòng kiến', 'thiết kế'), ('trưởng', 'khoa học'), ('2-7 ngoại', 'đi'), ('mẹ', 'dặn dò'), ('sơn', 'hà nhạc'), ('kế hoạch', 'thân thông'), ('trở', 'đường'), ('thpt', 'trần'), ('phương thức', 'đăng ký'), ('trở đẹp', 'trừ'), ('phan hùng shb', 'đà nẵng'), ('yêu', 'thạc sĩ'), ('ứng phó', 'kinh nghiệm'), ('1', 'tiên'), ('mấy', 'giá'), ('lương', '18'), ('báo', 'ấp'), ('tư vấn', 'giải đáp'), ('độc thân', 'hai'), ('khóc mắt', 'chuyên trị'), ('ldp', 'hội trường'), ('tổng thống', 'zelensky 13/6'), ('căm thù', 'giặc'), ('bàng quang', 'trầm trọng'), ('the', 'medium'), ('hành', 'kiểm tra'), ('sân', 'chủ động'), ('hải', 'bí thư'), ('trải nghiệm', 'chút'), ('yếu tố', 'địa lý'), ('hai', 'đương nhiên chúa'), ('giáo dục', 'gia đình'), ('tổ chức', 'trái phép'), ('kiến thức', 'phân hóa'), ('khung thành', 'nam định'), ('hy vọng', 'u23'), ('tỉ lệ', 'bao phủ'), ('who', '7'), ('nghệ thu

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,