<h1 style = 'text-align: center'> <b>Week 04: Text Embedding</b> </h1>

- Name: Võ Nguyễn Hoàng Kim
- Mentee ID: 240103

In [1]:
!pip install underthesea
import underthesea
from gensim.models import Word2Vec
import random
from underthesea import word_tokenize
from underthesea import text_normalize
import re



# 1. Read data

In [2]:
# open data
def readData(file_path, encoding='utf-8'):
    with open(file_path, 'r', encoding = encoding) as file:
        return file.read().splitlines()
    
wiki_doc = readData("/kaggle/input/cleaned-viwik18-dataset/clean_viwik18.txt")
stop_words = readData("/kaggle/input/vietnamese-stop-words/vietnamese-stopwords.txt")

# 2. Preprocess text 

In [3]:
# normalize text 
def normalizeText(article):
    return text_normalize(article)

# remove unnecessary punctuation 
def cleanText(article):    
    # remove several punctuation such as ; , ( ) [ ] ... 
    return re.sub(r'[^\w\s/:.-]', '', article)

# remove stop word
def removeStopWords(article, stop_words):
    # using regex with re.sub to remove stop word (better than using String replace)
    stop_words_pattern = r'\b(?:' + '|'.join(map(re.escape, stop_words)) + r')\b'
    cleaned_text = re.sub(stop_words_pattern, '', article)
    # remove redundant spaces
    cleaned_text = ' '.join(cleaned_text.split())
    return cleaned_text

# tokenize by underthesea
def tokenization(article):
    return word_tokenize(article)

def prepocessText(articles_content, stop_words):
    processed_articles = []
    for article in articles_content:
        # lower case
        article = article.lower()
        # normalize text
        normalized_text = normalizeText(article)
        # clean text
        cleaned_text = cleanText(normalized_text)
        # remove stop words
        removedStopWords_text = removeStopWords(cleaned_text, stop_words)
        # tokenize text
        tokens = tokenization(removedStopWords_text)        
        # re-clean tokens to remove unnecessary characters like '-', '.', etc.
        cleaned_tokens = [token for token in tokens if re.search(r'[a-zA-Z0-9]', token)]
        # add tokens in result list
        processed_articles.append(cleaned_tokens)
    return processed_articles

In [4]:
# take 1000 random sentences to make the sample data
sample_sentence = wiki_doc[:1000]

# preprocess the sample data
sample_tokens = prepocessText(sample_sentence, stop_words)
sample_tokens[:10]

[['trang'],
 ['internet', 'society'],
 ['internet',
  'society isoc',
  'tổ chức',
  'quốc tế',
  'hoạt động',
  'phi lợi nhuận',
  'phi phủ',
  'bao',
  'thành viên',
  'trình độ',
  'chuyên ngành',
  'tổ chức',
  'trọng tiêu chuẩn',
  'giáo dục',
  'sách',
  'bốn',
  'tổ chức',
  'thành viên',
  'sáu',
  'thành viên',
  'isoc',
  'bao',
  'cộng đồng',
  'internet',
  'chi tiết thể',
  'website',
  'isoc'],
 ['internet',
  'society',
  'nằm',
  'thủ đô',
  'washington',
  'dc',
  'hoa',
  'kỳ geneva',
  'thụy sĩ',
  'hội viên',
  'bao',
  'bốn',
  'tổ chức',
  'thành viên',
  'sáu',
  'thành viên',
  'thể lập',
  'chi nhánh',
  'tổ chức',
  'tùy sở',
  'tổ chức',
  'chín',
  'chi nhánh',
  'toàn',
  'giới'],
 ['nhiệm vụ', 'mục đích', 'hoạt động'],
 ['bảo đảm',
  'cổ vũ',
  'phát triển',
  'rộng',
  'internet',
  'thuận lợi',
  'toàn',
  'giới'],
 [],
 ['lịch sử', 'internet'],
 ['liên kết'],
 ['isoc', 'việt nam'],
 ['trang', 'chủ', 'toàn cầu'],
 ['ietf',
  'and the',
  'internet',
  's

# 3. Word2Vec

In [5]:
# init the Word2Vec model with sample data
model = Word2Vec(sample_tokens, vector_size = 300, window = 5, min_count = 1, workers = 4)

In [6]:
# vectorlize words in a doc
word_vectors = model.wv[sample_tokens[3]]

# using average word2vec
avg_word2vec = word_vectors.mean(axis=0)

len(word_vectors), word_vectors, avg_word2vec

(25,
 array([[ 0.00707596,  0.01674215, -0.00166502, ..., -0.01390744,
          0.00977892, -0.00228943],
        [-0.00214355,  0.00294003,  0.00099316, ..., -0.00412652,
          0.00320609, -0.00089672],
        [ 0.0291292 ,  0.05989324, -0.00465175, ..., -0.04576759,
          0.03807134, -0.02072483],
        ...,
        [ 0.00172768,  0.00057977, -0.00138901, ..., -0.00563658,
         -0.00063227, -0.00150506],
        [ 0.01294287,  0.03052992, -0.00213772, ..., -0.02475156,
          0.01994115, -0.01044608],
        [ 0.0267437 ,  0.06290844, -0.00099102, ..., -0.05427884,
          0.04390847, -0.0238958 ]], dtype=float32),
 array([ 0.01710799,  0.03775597, -0.00203975,  0.01951701, -0.00117384,
        -0.03525405,  0.02620503,  0.0696658 ,  0.01434422, -0.01277342,
         0.00139536, -0.02434053, -0.00651818,  0.01014628, -0.02311089,
        -0.02029221,  0.01262633, -0.00881075,  0.01312512, -0.00689695,
        -0.02493167, -0.00075642,  0.02551161,  0.00065795,  

In [7]:
# find similar word in model
similar_words = model.wv.most_similar("thành phố")
print(similar_words)

[('hai', 0.9991716146469116), ('sài gòn', 0.9991375803947449), ('hoa kỳ', 0.9991369247436523), ('việt nam', 0.9991288185119629), ('đất', 0.9991065263748169), ('thành', 0.9990755319595337), ('thụy điển', 0.9990288019180298), ('tám', 0.9990015029907227), ('khu vực', 0.9989915490150452), ('bảy', 0.9989913702011108)]


In [8]:
# save model 
model.save('/kaggle/working/word2vec_viwik18.model')
# load pre-trained model
model = Word2Vec.load('/kaggle/working/word2vec_viwik18.model')