<a href="https://colab.research.google.com/github/Kalit31/IR-Assignment/blob/main/index_creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
from google.colab import drive
# Mount Google Drive under /content/drive; the input file and the saved
# outputs used later in this notebook live under this mount point.
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [16]:
# Explicit %pip magic: does not depend on IPython automagic being enabled
# and installs into the environment of the running kernel.
%pip install beautifulsoup4



In [17]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install jsonpickle



In [27]:
import bs4 as bs 
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import json
import jsonpickle
import string
nltk.download('punkt')
# Newer NLTK releases load the tokenizer models used by word_tokenize from
# the 'punkt_tab' package instead of 'punkt'; fetch both so tokenization
# works regardless of the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
'''
Initialize data structures

vocab{
  0 : term0  //bits
  1: term1
}

rev_vocab{
  term0.word :term0
  term1.word :term1
}

all_docs: stores all the documents found in the file

inv_index: stores the inverted index for the corpus

'''

# Global corpus state, all starting empty:
#   vocab     : term id   -> term object   (filled by create_vocab_dicts)
#   rev_vocab : term word -> term object   (filled by create_vocab_dicts)
#   all_docs  : doc id    -> document      (filled by parse_docs)
#   inv_index : term word -> list of doc ids containing the word
vocab, rev_vocab, all_docs, inv_index = {}, {}, {}, {}

In [29]:
def clean_text(file_text):
  '''
    Takes a text as input and returns its tokens in lower case, excluding punctuation.

    The text is split with nltk's word_tokenize. A token is dropped when every
    one of its characters is a punctuation symbol (string.punctuation), so
    multi-character punctuation tokens such as '...' or '--' are removed as
    well as single symbols. Tokens that only contain some punctuation
    (e.g. '3.88') are kept.

    Args:
      file_text: the raw text to tokenize

    Returns:
      list of lower-cased token strings, in their original order

    eg: s='Good muffins cost $3.88\nin New York.  Please buy me two of them.\n\n Thanks.'
        returns ['good', 'muffins', 'cost', '3.88', 'in', 'new', 'york', 'please', 'buy', 'me', 'two', 'of', 'them', 'thanks']
  '''

  punctuation = set(string.punctuation)
  final_tokens = []
  # Split text into tokens
  for token in nltk.tokenize.word_tokenize(file_text):
    # Character-wise test: a plain `token in string.punctuation` is a substring
    # check and lets tokens like '...' through. all() is True for an empty
    # token, so empty strings are dropped as well.
    if all(ch in punctuation for ch in token):
      continue
    final_tokens.append(token.lower())
  return final_tokens

In [30]:
'''
  Document class structure:
  id:  doc ID
  doc_name: document title
  url: document url
  tokens: document text split into tokens
  tf: term frequency vector for the document
'''

class document:
  '''
    One document of the corpus, built from a parsed tag.

    Attributes:
      id: doc ID
      doc_name: value of the tag's "title" attribute
      url: value of the tag's "url" attribute
      tokens: the tag's text after clean_text
      tf: array of shape (len(vocab), 1) holding term counts; all zeros
          until create() is called
  '''
  def __init__(self,tag,id):
    self.id = id
    self.doc_name, self.url = tag["title"], tag["url"]
    self.tokens = clean_text(tag.get_text())
    # One row per term present in the global vocab at construction time
    self.tf = np.zeros((len(vocab),1))

  def create(self):
    '''
      Counts this document's tokens into self.tf and adds the document id
      to the inv_index posting list of every token it contains.
    '''
    for word in self.tokens:
      row = rev_vocab[word].id
      self.tf[row] += 1
      postings = inv_index[word]
      # Append the id only once per document: skip when it is already the
      # last entry of the posting list
      if not postings or postings[-1] != self.id:
        postings.append(self.id)

In [31]:
'''
  Term class structure:
  id: id assigned to word
  word: original word
'''

class term:
  '''
    Vocabulary entry.

    Attributes:
      id: id assigned to the word
      word: the original word
  '''
  def __init__(self,id,word):
    self.id, self.word = id, word
    

In [32]:
def create_vocab_dicts(doc_text):
  '''
  Adds every not-yet-seen token of doc_text to the global vocabulary,
  reverse-vocabulary and inverted index.

  Tokens come from the clean_text function. A new token receives the next
  free id (the current size of vocab) and an empty posting list:
    vocab{
      term.id : term
    }

    rev_vocab{
      term.word : term
    }

    inv_index{
      term.word : []
    }
  '''
  for word in clean_text(doc_text):
    # Only the first occurrence of a word creates an entry
    if word not in rev_vocab:
      entry = term(len(vocab), word)
      vocab[entry.id] = entry
      rev_vocab[entry.word] = entry
      inv_index[entry.word] = []

In [33]:
def parse_docs(filename, read_bytes=100000):
  '''
    Takes filename as input, extracts text from it.
    Parses the <doc> elements found into document objects.

    Args:
      filename: path of the file to read
      read_bytes: maximum number of characters read from the start of the
        file (it is opened in text mode). Defaults to 100000, the limit that
        used to be hard-coded; pass None to read the whole file. A limit may
        cut the last document short.

    Side effects:
      Prints the number of documents found, extends the global vocab,
      rev_vocab and inv_index, and stores each document in all_docs under
      its position in the file. Ids start at 0 on every call, so a second
      call overwrites the all_docs entries of the first.
  '''

  # 'with' closes the file handle once the text has been read, even on error
  with open(filename, "r") as file:
    # read(None) reads to the end of the file; an integer caps the characters read
    data = file.read(read_bytes)

  soup = bs.BeautifulSoup(data,'html.parser')
  all_doc_tags = soup.find_all('doc')

  print(str(len(all_doc_tags))+" documents found in the file.")

  # First pass: the vocabulary must be complete before any document is built,
  # because each document sizes its tf vector from len(vocab)
  for each_tag in all_doc_tags:
    create_vocab_dicts(each_tag.get_text())

  # Second pass: store each document object, keyed by its position in the file
  for i,each_tag in enumerate(all_doc_tags):
    all_docs[i] = document(each_tag,i)
    all_docs[i].create()
  

In [34]:
# Corpus file on the mounted Drive that is passed to parse_docs
filePath = "/content/drive/MyDrive/IR_Files/wiki_00"

In [35]:
# Fill vocab, rev_vocab, inv_index and all_docs from the corpus file
# (prints the number of documents found)
parse_docs(filePath)

57 documents found in the file.


In [36]:
#Save files
def saveFiles(obj,fileName):
  '''
    Serialises obj with jsonpickle and writes the result to fileName.

    The string produced by jsonpickle is itself dumped as JSON, so the file
    holds one JSON string literal; a reader has to JSON-decode the file
    before handing the result to jsonpickle.
  '''
  encoded = jsonpickle.encode(obj)
  with open(fileName, 'w') as out:
    json.dump(encoded, out)

In [37]:
# Persist the four corpus structures to Drive, one JSON file each
# Parsed document objects, keyed by doc id
saveFiles(all_docs,'/content/drive/MyDrive/IR_Files/documents.json')
# term id -> term object
saveFiles(vocab,'/content/drive/MyDrive/IR_Files/vocabulary.json')
# term word -> term object
saveFiles(rev_vocab,'/content/drive/MyDrive/IR_Files/reverse-vocabulary.json')
# term word -> list of ids of the documents containing it
saveFiles(inv_index,'/content/drive/MyDrive/IR_Files/inverted-index.json')