<a href="https://colab.research.google.com/github/HungPham2002/Text-Retrieval-Using-Pretrained-Embedding/blob/main/Text_Retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download MS MARCO Dataset

In [None]:
!pip install datasets==2.13.1



# Load MS_MARCO

In [None]:
from datasets import load_dataset

In [None]:
# Load the 'v1.1' variant of the 'ms_marco' dataset through `datasets.load_dataset`.
dataset = load_dataset('ms_marco', 'v1.1')



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Work only with the 'test' entry of the loaded dataset (presumably the test split);
# the corpus-extraction loop iterates over this object.
subnet = dataset['test']

# Extract text

In [None]:
# Build the retrieval corpus: collect the passage texts of every sample
# whose query_type is 'entity'; all other samples are skipped.
corpus = []
for sample in subnet:
  if sample['query_type'] != 'entity':
    continue
  # 'passages' holds parallel lists; only the raw passage texts are needed here.
  corpus.extend(sample['passages']['passage_text'])

In [None]:
# Load text and append to corpus

In [None]:
# query_id = sample['query_id']
# query_str = sample['query']
# passage_dict = sample['passages']
# is_selected_lst = passage_dict['is_selected']
# passage_text_lst = passage_dict['passage_text']

# corpus  += passage_text_lst

# Normalization

In [None]:
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:

# lowercase
def lowercase(text):
  """Return a copy of `text` with all cased characters converted to lower case."""
  lowered = text.lower()
  return lowered

# punctuation
remove_chars = string.punctuation
def remove_punctuation(text):
  """Return `text` with every character listed in `remove_chars` deleted.

  Characters are removed outright (not replaced by a space), so words joined
  only by punctuation are merged, e.g. "a-b" becomes "ab".
  """
  # A single translate pass replaces one full str.replace scan per punctuation character.
  return text.translate(str.maketrans('', '', remove_chars))

# stopwords
stopwords_lst = stopwords.words('english')
# Frozen copy used for O(1) membership tests; it is built once, so later
# changes to `stopwords_lst` are not reflected in it.
_stopwords_set = frozenset(stopwords_lst)
def remove_stopwords(text):
  """Drop English stop words from `text`.

  The text is split with `tokenize`, tokens found in the stop-word list are
  discarded, and the remaining tokens are joined back with single spaces.
  Matching is exact and case-sensitive.
  """
  kept_tokens = [token for token in tokenize(text) if token not in _stopwords_set]
  return ' '.join(kept_tokens)

# stemming
stemmer = PorterStemmer()
def stemming(text):
  """Stem each token of `text` with the module-level Porter stemmer.

  Tokens come from `tokenize`; the stemmed tokens are joined back into one
  string separated by single spaces.
  """
  return ' '.join(stemmer.stem(word) for word in tokenize(text))

In [None]:
def text_normalize(text):
  """Run the full normalization pipeline on `text` and return the result.

  Steps are applied in this order: lowercase, remove_punctuation,
  remove_stopwords, stemming.
  """
  # NOTE(review): punctuation is stripped before stop-word filtering, so any
  # stop-word entries that contain an apostrophe can no longer match - confirm
  # this is intended.
  for step in (lowercase, remove_punctuation, remove_stopwords, stemming):
    text = step(text)
  return text

# Create Dictionary (bag-of-words)

In [None]:
def tokenize(text):
  """Split `text` on runs of whitespace and return the list of tokens."""
  tokens = text.split()
  return tokens

def create_dictionary(corpus):
  """Build the vocabulary of a corpus.

  Each document is normalized with `text_normalize` and split with `tokenize`.

  Args:
    corpus: iterable of document strings.

  Returns:
    List of unique tokens in order of first appearance.
  """
  dictionary = []
  # Companion set: O(1) "already seen?" checks instead of scanning the growing
  # list for every token (which made the build quadratic in vocabulary size).
  seen = set()
  for doc in corpus:
    for token in tokenize(text_normalize(doc)):
      if token not in seen:
        seen.add(token)
        dictionary.append(token)
  return dictionary

In [None]:
# Build the vocabulary (unique normalized tokens) once for the whole corpus.
dictionary = create_dictionary(corpus)

# Vectorizer

In [None]:
def vectorize(text, dictionary):
  """Turn `text` into a term-count vector over `dictionary`.

  Args:
    text: raw string; it is normalized with `text_normalize` before counting.
    dictionary: sequence of vocabulary tokens defining the vector positions.

  Returns:
    List of integer counts, one per dictionary entry, in dictionary order.
    Tokens that are not in the dictionary are ignored.
  """
  word_count_dict = {word: 0 for word in dictionary}
  for token in tokenize(text_normalize(text)):
    # Out-of-vocabulary tokens are skipped on purpose. The explicit membership
    # test replaces a bare `except: pass` that also hid unrelated errors.
    if token in word_count_dict:
      word_count_dict[token] += 1
  return list(word_count_dict.values())

# Indexing Code

In [None]:
def create_doc_term_matrix(corpus, dictionary):
  """Vectorize every document of `corpus` against `dictionary`.

  Returns a dict mapping `(document_text, position_in_corpus)` to the vector
  produced by `vectorize`. The position is part of the key, so identical
  texts at different positions get separate entries.
  """
  return {
      (passage, position): vectorize(passage, dictionary)
      for position, passage in enumerate(corpus)
  }

In [None]:
# Pre-compute one term-count vector per corpus passage so that ranking only has to vectorize the query.
doc_term_matrix = create_doc_term_matrix(corpus, dictionary)

# Cosine Similarity

In [None]:
from scipy import spatial

In [None]:
def similarity(a, b):
  """Cosine similarity between two equal-length 1-D numeric vectors.

  Returns 1 minus SciPy's cosine distance. If either vector has no non-zero
  component, 0.0 is returned instead.
  """
  # Cosine is undefined for a zero-norm vector (SciPy may yield NaN), and NaN
  # scores cannot be ordered meaningfully. Treat "no shared vocabulary" as
  # zero similarity.
  if not any(a) or not any(b):
    return 0.0
  return 1 - spatial.distance.cosine(a, b)

In [None]:
def ranking(query, dictionary, doc_term_matrix):
  """Score every indexed document against `query`, best match first.

  The query is vectorized with `vectorize` and compared to each stored
  document vector with `similarity`.

  Returns:
    List of `(score, doc_info)` tuples sorted in descending order, where
    `doc_info` is the key of `doc_term_matrix`. Whole tuples are compared, so
    equal scores are ordered by `doc_info`.
  """
  query_vec = vectorize(query, dictionary)
  scored = [
      (similarity(query_vec, doc_vec), doc_info)
      for doc_info, doc_vec in doc_term_matrix.items()
  ]
  return sorted(scored, reverse=True)


In [None]:
# Demo: print the top_k highest-scoring passages for each query.
query_lst = ['what condition leads to rain']
top_k = 10
for query in query_lst:
  scores = ranking(query, dictionary, doc_term_matrix)
  print(f'Query: {query}')
  print('===== Relevant Docs ====')
  # Slice rather than index by range(top_k): a corpus with fewer than top_k
  # passages then prints what is available instead of raising IndexError.
  for rank, (doc_score, doc_info) in enumerate(scores[:top_k], start=1):
    doc_content = doc_info[0]  # doc_info is (passage_text, corpus_index)

    print(f'Top {rank}; Score: {doc_score: .4f}')
    print(doc_content)
    print('\n')

Query: what condition leads to rain
===== Relevant Docs ====
Top 1; Score:  0.7107
SectionsLatest Update: 07/21/97 Rain and Hail Atmospheric conditions that lead to the development of rain and hail. Freezing Rain A detailed look at freezing rain, associated dangers and the conditions that lead to its development. Sleet Atmospheric conditions that lead to the development of sleet. 


Top 2; Score:  0.3897
Hypothermia occurs when your body loses heat faster than it produces it. The most common causes of hypothermia are exposure to cold-weather conditions or cold water. But prolonged exposure to any environment colder than your body can lead to hypothermia if you aren't dressed appropriately or can't control the conditions. Specific conditions leading to hypothermia include: 1  Wearing clothes that aren't warm enough for weather conditions. 2  Staying out in the cold too long.


Top 3; Score:  0.3586
• PB (noun). The noun PB has 1 sense: 1. a soft heavy toxic malleable metallic element; b