# 1. Setup

## Import and read dataset

In [None]:
# Download the news corpus archive from Google Drive and extract it.
# link: https://drive.google.com/file/d/1ydGNBdRVloX9rtxsKrMSnUNFG43Qv1sl/view?usp=sharing
# NOTE(review): newer gdown releases reportedly deprecate the `--id` flag
# (the file ID can be passed positionally) — confirm against the installed version.
!gdown --id 1ydGNBdRVloX9rtxsKrMSnUNFG43Qv1sl
# Presumably unpacks into the `news_corpus` directory that later cells read — verify.
!unzip news_corpus.zip

## Define text normalization functions and load stopwords

In [None]:
# Download the Vietnamese stopword list (zipped, hosted on Google Drive) and extract it.
# Presumably a copy of: https://raw.githubusercontent.com/stopwords/vietnamese-stopwords/master/vietnamese-stopwords.txt
!gdown --id 1W9zVRz--bHlbBXbCSmoWHBO_2Cs4EhPY
# Extracts vietnamese-stopwords.txt, which the next cell loads.
!unzip vn_stopwords.zip

Downloading...
From: https://drive.google.com/uc?id=1W9zVRz--bHlbBXbCSmoWHBO_2Cs4EhPY
To: /content/vn_stopwords.zip
100% 6.89k/6.89k [00:00<00:00, 9.77MB/s]
Archive:  vn_stopwords.zip
  inflating: vietnamese-stopwords.txt  


In [None]:
import string
import os
import re
import unicodedata
import numpy as np

from tqdm import tqdm

def remove_punctuations(text: str) -> str:
  """Replace punctuation and other symbols in `text` with spaces.

  Every character that is not a word character (`\\w`, which includes the
  underscore), whitespace, or one of the explicitly listed Vietnamese letters
  is substituted by a single space. Runs of spaces are not collapsed.
  """
  non_text_chars = re.compile(
      r'[^\w\sàáãạảăắằẳẵặâấầẩẫậèéẹẻẽêềếểễệđìíĩỉịòóõọỏôốồổỗộơớờởỡợùúũụủưứừửữựỳỵỷỹýÀÁÃẠẢĂẮẰẲẴẶÂẤẦẨẪẬÈÉẸẺẼÊỀẾỂỄỆĐÌÍĨỈỊÒÓÕỌỎÔỐỒỔỖỘƠỚỜỞỠỢÙÚŨỤỦƯỨỪỬỮỰỲỴỶỸÝ]'
  )
  return non_text_chars.sub(' ', text)

def remove_email(text: str) -> str:
  """Delete every whitespace-delimited chunk of `text` that contains an `@`.

  At most one whitespace character directly after the chunk is removed too.
  """
  email_like = re.compile(r'\S*@\S*\s?')
  return email_like.sub('', text)

def remove_url(text: str) -> str:
  """Delete every run of non-whitespace characters that starts with `http`.

  Surrounding whitespace is left in place.
  """
  url_like = re.compile(r'http\S+')
  return url_like.sub('', text)

# Load the stopword list, one entry per line.
# normalize_text() NFKC-normalizes the text before remove_stopwords() runs, so
# the stopwords get the same normalization here; otherwise an entry stored in
# a different Unicode form (e.g. decomposed accents) could never match.
# Surrounding whitespace is stripped and blank lines are skipped.
with open('vietnamese-stopwords.txt', 'r', encoding='utf8') as f:
  vn_stopwords = [
      unicodedata.normalize('NFKC', line.strip())
      for line in f.read().splitlines()
      if line.strip()
  ]

# Compiled stopword patterns, keyed by the tuple of stopwords they were built from.
_STOPWORD_PATTERNS = {}


def _stopword_pattern(stopwords):
  """Return a cached compiled regex matching any stopword as a whole unit.

  Returns None when `stopwords` holds no non-blank entry.
  """
  key = tuple(stopwords)
  if key not in _STOPWORD_PATTERNS:
    # Longest entries first, so a multi-word phrase wins over a shorter
    # stopword that is a prefix of it.
    phrases = sorted({w.strip() for w in key if w.strip()},
                     key=lambda p: (-len(p), p))
    if phrases:
      # re.escape: stopwords are literal text, not regex fragments.
      alternation = '|'.join(re.escape(p) for p in phrases)
      # The leading whitespace (or start of text) is consumed; the trailing
      # one is only looked at, so back-to-back stopwords are all matched.
      _STOPWORD_PATTERNS[key] = re.compile(
          r'(?:^|\s)(?:' + alternation + r')(?=\s|$)')
    else:
      _STOPWORD_PATTERNS[key] = None
  return _STOPWORD_PATTERNS[key]


def remove_stopwords(text: str, stopwords=None) -> str:
  """Remove every whole-word occurrence of a stopword from `text`.

  A stopword is removed when it is delimited by whitespace or by the start or
  end of the text. The whitespace in front of it is removed with it and the
  whitespace after it is kept, so neighbouring words stay separated.

  Args:
    text: the text to filter.
    stopwords: iterable of stopwords or phrases; defaults to the module-level
      `vn_stopwords` list.

  Returns:
    `text` without the stopwords (unchanged if there is nothing to remove).
  """
  if stopwords is None:
    stopwords = vn_stopwords
  pattern = _stopword_pattern(stopwords)
  if pattern is None:
    return text
  return pattern.sub('', text)

def normalize_text(text: str) -> str:
  """Run the full cleaning pipeline on `text` and return the result.

  Steps, in order: lowercase, Unicode NFKC normalization, then removal of
  e-mail addresses, URLs, punctuation and stopwords.
  """
  cleaned = unicodedata.normalize('NFKC', text.lower())
  # Order matters: e-mails and URLs must go before punctuation is stripped,
  # otherwise their '@' and '://' markers are already gone.
  for step in (remove_email, remove_url, remove_punctuations, remove_stopwords):
    cleaned = step(cleaned)
  return cleaned

## Create similarity measurement function using cosine similarity

$cosine\_similarity(a, b) = \frac{a \cdot b}{\|a\|\,\|b\|} = \frac{\sum_{i = 1}^{N}a_ib_i}{\sqrt{\sum_{i = 1}^{N}a_i^2}\sqrt{\sum_{i = 1}^{N}b_i^2}}$

In [None]:
def distance(a: np.ndarray, b: np.ndarray) -> np.float64:
  """Return the cosine similarity between vectors `a` and `b`.

  Despite its name, this is a similarity score: a larger value means the
  vectors point in more similar directions.

  Args:
    a: 1-D numeric vector.
    b: 1-D numeric vector of the same length as `a`.

  Returns:
    dot(a, b) / (|a| * |b|), or 0.0 when either vector has zero norm (the
    cosine is undefined there; 0.0 avoids propagating NaN into rankings).
  """
  denominator = np.linalg.norm(a) * np.linalg.norm(b)
  if denominator == 0:
    return np.float64(0.0)

  return np.dot(a, b) / denominator

## Create vectorize function using bag-of-words on a provided vocab

In [None]:
def vectorize(text: str, vocab: list) -> np.ndarray:
  """Turn `text` into a bag-of-words count vector over `vocab`.

  The text is normalized with normalize_text() and split on single spaces
  (the same tokenization used to build the vocabulary). Whole tokens are
  counted, so a vocabulary word is not counted when it merely appears inside
  a longer word.

  Args:
    text: raw or already-normalized text.
    vocab: list of vocabulary tokens; entries are expected to be single
      tokens (no spaces).

  Returns:
    Array whose i-th entry is the number of times vocab[i] occurs as a token
    in the normalized text.
  """
  token_counts = {}
  for token in normalize_text(text).split(' '):
    if token:  # consecutive spaces yield empty strings; skip them
      token_counts[token] = token_counts.get(token, 0) + 1

  return np.array([token_counts.get(word, 0) for word in vocab])

# 2. Building Text Retrieval system using Vector Space Model



## 2.1. Create vocab

In [None]:
# Build the document list and the vocabulary from a sample of the corpus.
# Each file is read as: first non-empty line = title, remaining lines = article.
SAMPLE_DIVISOR = 200  # only the first 1/SAMPLE_DIVISOR of the files is indexed

doc_lists = []       # unique (title, normalized article) pairs, in first-seen order
vocab = []           # unique tokens, in first-seen order
seen_docs = set()    # set mirrors of the two lists, for O(1) membership tests
seen_tokens = set()
dataset_root_path = 'news_corpus'
# os.listdir() returns entries in arbitrary order; sort so that the sampled
# subset (and therefore the vocabulary) is the same on every run.
filenames = sorted(os.listdir(dataset_root_path))
for filename in tqdm(filenames[:len(filenames) // SAMPLE_DIVISOR]):
  filepath = os.path.join(dataset_root_path, filename)
  with open(filepath, 'r', encoding='utf8') as f:
    lines = list(filter(None, f.read().splitlines()))
  if not lines:
    # Empty file: there is no title line to read.
    continue
  title = unicodedata.normalize('NFKC', lines[0].strip())
  article = normalize_text(' '.join(lines[1:]).strip())
  if article == '':
    continue
  doc = (title, article)
  if doc not in seen_docs:
    seen_docs.add(doc)
    doc_lists.append(doc)
  for token in filter(None, article.split(' ')):
    if token not in seen_tokens:
      seen_tokens.add(token)
      vocab.append(token)

100%|██████████| 922/922 [03:12<00:00,  4.79it/s]


In [None]:
# Report how many distinct tokens and documents were collected.
print(
    f'Vocab size: {len(vocab)}',
    f'Number of docs: {len(doc_lists)}',
    sep='\n',
)

Vocab size: 9368
Number of docs: 855


## 2.2. Create document-term matrix

In [None]:
# Map each (title, article) document to its bag-of-words vector over `vocab`.
doc_term_matrix = {}
for doc in tqdm(doc_lists):
  # doc[1] is the article text; the whole (title, article) pair is the key.
  doc_term_matrix[doc] = vectorize(doc[1], vocab)

100%|██████████| 855/855 [02:51<00:00,  4.97it/s]


## 2.3. Ranking

In [None]:
def ranking(query: str, doc_term_matrix: dict, print_top_10: bool = True) -> list:
  """Rank all documents against `query` by cosine similarity.

  The query is vectorized over the module-level `vocab` and compared with
  every document vector using distance().

  Args:
    query: free-text search query.
    doc_term_matrix: dict mapping (title, article) to the document's vector.
    print_top_10: when true, print the ten best (score, title) pairs.

  Returns:
    List of (score, title) tuples sorted from highest to lowest score.
  """
  query_vec = vectorize(query, vocab)
  rankings = []
  for doc_info, vec in doc_term_matrix.items():
    score = distance(query_vec, vec)
    # An all-zero vector (e.g. a query with no in-vocabulary word) can make
    # the cosine NaN; NaN breaks sort ordering, so treat it as no similarity.
    if np.isnan(score):
      score = np.float64(0.0)
    rankings.append((score, doc_info[0]))
  rankings.sort(reverse=True)

  if print_top_10:
    for rank in rankings[:10]:
      print(rank)

  return rankings

In [None]:
# Example search: rank the indexed articles for a sample query and print the ten best matches.
query = "điểm thi đại học"
rankings = ranking(query=query, doc_term_matrix=doc_term_matrix, print_top_10=True)

(0.6670716806549295, 'Đảm bảo cơ sở vật chất trước ngày thi tốt nghiệp')
(0.6546724335902323, "Sẽ xử lý nghiêm vụ nữ sinh dùng thiết bị điện tử 'tuồn' đề thi tốt nghiệp THPT")
(0.6388767231972037, 'Cả nước có 38 thí sinh là F0 dự thi tốt nghiệp THPT năm 2022')
(0.6342339280242492, 'Bộ trưởng Nguyễn Kim Sơn thị sát thi tốt nghiệp THPT tại Huế | Báo Dân trí')
(0.6315782475591162, 'Đề thi môn Giáo dục công dân kỳ thi tốt nghiệp THPT 2022 | VTV.VN')
(0.6314463180720822, 'Vì sao Lệnh Hồ Xung có Độc Cô Cửu Kiếm vẫn bại trước Đông Phương Bất Bại?')
(0.6284002689891311, 'U19 Việt Nam 4-1 U19 Philippines (H2): Cú sút panenka tuyệt vời')
(0.6281683710970668, 'Bất chấp bão giá và nợ nần, giới trẻ chi tiền nhiều hơn để đi chơi: Mua nhà, mua xe đã hết "vui"?')
(0.6056558126751928, "Will Smith: 'Chris Rock chưa sẵn sàng nói chuyện với tôi'")
(0.6029601607415258, "Đại diện Việt Nam phối hợp đăng cai tổ chức cuộc thi tranh biện quốc tế World Scholar's Cup")
