<a href="https://colab.research.google.com/github/Kalit31/IR-Assignment/blob/main/test_queries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [137]:
# Mount Google Drive under /content/drive; the index files loaded further down
# are read from /content/drive/MyDrive/IR_Files.
# NOTE(review): force_remount=True presumably remounts even when the drive is
# already mounted - confirm against the Colab docs.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [138]:
!pip install jsonpickle



In [141]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import json
import jsonpickle
import string
# Fetch the 'punkt' tokenizer data.
# NOTE(review): presumably required by nltk's word_tokenize, which clean_text() calls - confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [142]:
class document:
  '''
    A single document of the corpus.

    Document class structure:
    id:  doc ID
    doc_name: document title
    url: document url
    tokens: document text split into tokens
    tf: term frequency vector for the document, shape (len(vocab), 1)

    The class reads the notebook-level names clean_text, vocab, rev_vocab
    and inv_index.
    NOTE(review): the stored documents are rebuilt through jsonpickle, which
    presumably resolves the class by name - keep the class and attribute
    names unchanged.
  '''

  def __init__(self,tag,id):
    '''
      tag: object providing tag["title"], tag["url"] and tag.get_text()
           (presumably an HTML/XML tag from the corpus parser - confirm)
      id:  doc ID assigned to this document
    '''
    self.id = id
    self.doc_name = tag["title"]
    self.url = tag["url"]
    self.tokens = clean_text(tag.get_text())
    # One counter per vocabulary term; the counts are filled in by create().
    self.tf = np.zeros((len(vocab),1))

  def create(self):
    '''
      Counts every token of the document into self.tf and registers the
      document in the inverted index.

      Raises KeyError if a token is missing from rev_vocab (or from
      inv_index, when that is a plain dict).
    '''
    for token in self.tokens:
      token_id = rev_vocab[token].id
      self.tf[token_id] += 1
      postings = inv_index[token]
      # A repeated token of this document finds its own id at the end of the
      # posting list, so the id is appended only once per document.
      if postings and postings[-1] == self.id:
        continue
      postings.append(self.id)

In [143]:
class term:
  '''
    A vocabulary entry.

    Term class structure:
    id: id assigned to word
    word: original word
  '''

  def __init__(self,id,word):
    self.id = id
    self.word = word
    

In [144]:
def openFile(filePath):
  '''
    Reads one stored index file and rebuilds the Python objects inside it.

    The file content is parsed with json.load and the result is handed to
    jsonpickle.decode, i.e. the file is expected to contain a JSON-encoded
    string that is itself a jsonpickle document.

    filePath: path of the file to read
    returns:  the object reconstructed by jsonpickle
    raises:   OSError if the file cannot be opened,
              json.JSONDecodeError if the content is not valid JSON
  '''
  # The context manager closes the file handle even if parsing fails.
  with open(filePath) as f:
    thawed = json.load(f)
  # NOTE(security): jsonpickle.decode can instantiate arbitrary classes from
  # its input; only call this on files you created yourself.
  return jsonpickle.decode(thawed)

In [145]:
# Load the pre-built index artefacts from Drive (presumably produced by a separate indexing notebook).
# all_docs:  mapping whose values are document objects (later read via .values(), .tf and .id)
# vocab:     term objects keyed by the string form of the term id (used as vocab[str(i)].word)
# rev_vocab: term objects keyed by the word itself (used as rev_vocab[token].id)
# inv_index: for each word, the list of ids of the documents that contain it
all_docs=openFile('/content/drive/MyDrive/IR_Files/documents.json')
vocab=openFile('/content/drive/MyDrive/IR_Files/vocabulary.json')
rev_vocab=openFile('/content/drive/MyDrive/IR_Files/reverse-vocabulary.json')
inv_index=openFile('/content/drive/MyDrive/IR_Files/inverted-index.json')

In [146]:
def clean_text(file_text):
  '''
    Tokenises a text with nltk's word_tokenize and returns the lower-cased
    tokens, leaving out punctuation tokens.

    A token counts as punctuation when it occurs inside string.punctuation
    (a substring test), so single symbols such as '$' or '.' are dropped
    while tokens like '...' are kept.

    eg: s='Good muffins cost $3.88\nin New York.  Please buy me two of them.\n\n Thanks.'
        returns ['good', 'muffins', 'cost', '3.88', 'in', 'new', 'york', 'please', 'buy', 'me', 'two', 'of', 'them', 'thanks']
  '''
  return [
    raw_token.lower()
    for raw_token in nltk.tokenize.word_tokenize(file_text)
    if raw_token not in string.punctuation
  ]

In [147]:
class query:
  '''
    A free-text query represented over the vocabulary.

    Query class structure:
    query_tokens:  tokens in the query text
    query_vector: term frequency vector of the query, shape (len(vocab), 1)

    The class reads the notebook-level names clean_text, vocab and rev_vocab.
  '''

  def __init__(self,phrase):
    '''
      phrase: the raw query text
    '''
    self.query_tokens = clean_text(phrase)
    self.query_vector = np.zeros((len(vocab),1))
    self.get_query_vector()

  def get_query_vector(self):
    '''
      Adds 1 to query_vector at the id of every query token that is present
      in rev_vocab; tokens outside the vocabulary are ignored.

      Called once from __init__. The counts are accumulated into the existing
      vector, so calling it again adds the same counts a second time.
    '''
    for token in self.query_tokens:
      if token in rev_vocab:
        self.query_vector[rev_vocab[token].id] += 1


In [148]:
def calc_idf():
  '''
    Builds the idf column vector for the whole vocabulary.

    For the term with id i:
      idf[i] = log10( (total number of documents) / (number of documents in which the term is present) )
    where the document count of a term is the length of its inverted-index entry.

    Reads the notebook-level names vocab, all_docs and inv_index.
    returns: numpy array of shape (len(vocab), 1)
    raises:  ZeroDivisionError if a vocabulary term has an empty inverted-index entry
  '''
  vocab_size = len(vocab)
  total_docs = len(all_docs)
  # vocab is keyed by the string form of the term id.
  ratios = np.array(
    [total_docs / len(inv_index[vocab[str(term_id)].word]) for term_id in range(vocab_size)],
    dtype=float,
  )
  return np.log10(ratios.reshape((vocab_size, 1)))

In [149]:
# Computed once up front; cal_lnc_ltc() reads this notebook-level vector for every score.
idf_vector = calc_idf()

In [150]:
def calc_i(vector):
  '''
    Logarithmic term-frequency weighting (the "l" of lnc.ltc).

    Each entry tf becomes max(1 + log10(tf), 0), so a count of 0 maps to
    weight 0 and a count of 1 maps to weight 1.

    vector:  numpy array of term counts
    returns: numpy array of the same shape holding the weights
  '''
  # log10(0) is -inf, which the maximum() below clamps to 0. Only that expected
  # divide-by-zero warning is silenced; the computed values are unchanged.
  with np.errstate(divide='ignore'):
    log_weights = 1 + np.log10(vector)
  return np.maximum(log_weights, 0.0)

In [151]:
def cal_lnc_ltc(doc_vector,query_vector):
  '''
    Scores one document against one query with the lnc.ltc scheme.

    document (lnc): calc_i -> cosine normalise
    query    (ltc): calc_i -> multiply by idf_vector -> cosine normalise
    score         : dot product of the two normalised vectors

    Reads the notebook-level names calc_i and idf_vector.

    doc_vector:   term frequency vector of the document, shape (len(vocab), 1)
    query_vector: term frequency vector of the query, shape (len(vocab), 1)
    returns:      the similarity score; 0.0 when either weighted vector is
                  all zeros (e.g. no query term is in the vocabulary)
  '''
  l_doc_vector = calc_i(doc_vector)
  doc_norm = np.linalg.norm(l_doc_vector)

  l_query_vector = calc_i(query_vector)
  t_query_vector = np.multiply(l_query_vector,idf_vector)
  query_norm = np.linalg.norm(t_query_vector)

  # Dividing by a zero norm would give NaN, and NaN scores cannot be ranked
  # reliably; a vector without any weight simply matches nothing.
  if doc_norm == 0 or query_norm == 0:
    return 0.0

  c_doc_vector = l_doc_vector/doc_norm
  c_query_vector = t_query_vector/query_norm

  # Flatten the column vectors so that np.dot returns a scalar.
  return np.dot(c_doc_vector.reshape(-1),c_query_vector.reshape(-1))

In [152]:
# Example query; ret_ranked_docs() scores every document against q1.query_vector.
q1 = query('Ocean')

In [153]:
def ret_ranked_docs(search_query=None, top_k=10):
  '''
    Retrieves the top documents with respect to the lnc.ltc scoring scheme

    Reads the notebook-level names all_docs and cal_lnc_ltc (and q1 when no
    query is passed).

    search_query: object whose query_vector is scored against every document;
                  defaults to the notebook-level q1
    top_k:        maximum number of results to return (default 10)
    returns:      list of [score, doc id] pairs, best score first; equal
                  scores are ordered by descending doc id, because the pairs
                  are compared element-wise in a reversed sort
  '''
  if search_query is None:
    search_query = q1

  scored_docs = [
    [cal_lnc_ltc(doc.tf, search_query.query_vector), doc.id]
    for doc in all_docs.values()
  ]
  scored_docs.sort(reverse=True)
  # Slicing already copes with fewer than top_k documents; max() guards
  # against a negative cutoff.
  return scored_docs[:max(top_k, 0)]

In [154]:
# Display the ranked [score, doc id] pairs for the query q1.
ret_ranked_docs()

  


[[0.04984691878042628, 0],
 [0.0, 56],
 [0.0, 55],
 [0.0, 54],
 [0.0, 53],
 [0.0, 52],
 [0.0, 51],
 [0.0, 50],
 [0.0, 49],
 [0.0, 48]]