<a href="https://colab.research.google.com/github/JonasVerbickas/test-jupyter/blob/main/to_rename.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

In [2]:
# Colab-only setup: mount Google Drive at /content/drive so files stored there
# can be opened through ordinary file paths under that directory.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
class StringWithDocId:
    """A piece of text paired with the id of the document it came from.

    Instances order by ``string`` first and by ``doc_id`` second, so sorting a
    list of them groups equal strings together with their doc ids ascending.
    """

    def __init__(self, string, doc_id):
        self.doc_id = doc_id
        self.string = string

    def __lt__(self, other):
        """Return whether *self* sorts before *other* (string, then doc id)."""
        # Different strings decide the order on their own ...
        if self.string != other.string:
            return self.string < other.string
        # ... otherwise the doc id breaks the tie.
        return self.doc_id < other.doc_id

    def _as_text(self):
        """Render the pair as ``"<string>: <doc_id>"``."""
        return "{}: {}".format(self.string, self.doc_id)

    def __str__(self):
        return self._as_text()

    def __repr__(self):
        return self._as_text()

In [10]:
# Terms that are dropped during pre-processing (compared against lower-cased tokens).
STOP_WORDS = ["the", "is", "am", "are", "will", "was", "were"]
# Intended size of the proximity window.
# NOTE(review): nothing in this block reads WINDOW_SIZE yet -- the index stores
# document ids only, not token positions, so no window can be enforced.
WINDOW_SIZE = 5


class InvertedIndex:
    """Inverted index mapping each term to the ids of the documents containing it.

    ``self.inverted_index`` is a dict ``term -> list of doc ids``. Every list is
    free of duplicates and kept in ascending order, which is what the merge in
    ``proximity_search`` relies on.
    """

    def __init__(self):
        # term -> sorted list of doc ids in which the term occurs
        self.inverted_index = {}

    def __getitem__(self, key):
        """Return the posting list of *key*; raises ``KeyError`` if not indexed."""
        return self.inverted_index[key]

    def __contains__(self, key):
        """Return whether *key* is an indexed term.

        Without this method ``term in index`` would fall back to calling
        ``__getitem__`` with 0, 1, 2, ... and fail with ``KeyError``.
        """
        return key in self.inverted_index

    def keys(self):
        """Return a view of all indexed terms."""
        return self.inverted_index.keys()

    def read_data(self, path: str) -> list:
        """Read every regular file located directly inside *path*.

        Returns a list with one ``StringWithDocId`` per file, holding the file's
        whole text and the file name as doc id. Entries that are not regular
        files (for example sub-directories) are skipped because they cannot be
        read as text. Files are visited in sorted name order so the result does
        not depend on the order ``os.listdir`` happens to use. The number of
        files read is printed.
        """
        output = []
        for file in sorted(os.listdir(path)):
            full_path = os.path.join(path, file)
            if not os.path.isfile(full_path):
                continue
            with open(full_path, 'r') as f:
                output.append(StringWithDocId(f.read(), file))
        print(len(output))
        return output

    def process_document(self, document: str) -> list:
        """Pre-process *document* and return the list of its terms.

        The text is split on any run of whitespace (spaces, tabs, newlines),
        each word is lower-cased, and words listed in ``STOP_WORDS`` are
        dropped. An empty or whitespace-only document yields ``[]``.
        """
        # split() with no separator never produces empty strings, so repeated,
        # leading or trailing whitespace is harmless.
        lowered_words = (word.lower() for word in document.split())
        return [word for word in lowered_words if word not in STOP_WORDS]

    def index_corpus(self, documents: list) -> None:
        """Add *documents* to the index.

        Each element of *documents* must expose ``string`` (the text) and
        ``doc_id``. May be called several times; posting lists stay sorted and
        duplicate-free across calls.
        """
        # (term, doc_id) tuples sort by term first and doc id second.
        postings_to_add = sorted(
            (term, doc.doc_id)
            for doc in documents
            for term in self.process_document(doc.string)
        )

        touched_terms = set()
        for term, doc_id in postings_to_add:
            posting_list = self.inverted_index.setdefault(term, [])
            if doc_id not in posting_list:
                posting_list.append(doc_id)
                touched_terms.add(term)

        # Appending to a list filled by an earlier call can break its order;
        # re-sort the lists that changed so the sorted invariant always holds.
        for term in touched_terms:
            self.inverted_index[term].sort()

    def proximity_search(self, term1: str, term2: str) -> list:
        """Return the ids of the documents that contain both *term1* and *term2*.

        The result is a list of doc ids in ascending order; it is empty when
        either term is not indexed.

        NOTE(review): despite the name, no proximity window is checked and no
        per-document match count is produced -- the index holds no token
        positions. This is a document-level AND of the two posting lists.
        """
        first_postings = self.inverted_index.get(term1, [])
        second_postings = self.inverted_index.get(term2, [])

        documents_containing_both_terms = []
        term1_i = 0
        term2_i = 0
        # Linear merge of two sorted lists: advance whichever side is smaller.
        while term1_i < len(first_postings) and term2_i < len(second_postings):
            first_doc = first_postings[term1_i]
            second_doc = second_postings[term2_i]
            if first_doc == second_doc:
                documents_containing_both_terms.append(first_doc)
                term1_i += 1
                term2_i += 1
            elif first_doc < second_doc:
                term1_i += 1
            else:
                term2_i += 1
        return documents_containing_both_terms
    
    

In [11]:
def main(corpus_path: str = '/content/drive/MyDrive/Colab Notebooks/Simpsons2022'):
    """Build an index over *corpus_path*, answer one typed query, return the index.

    The query is read from standard input, lower-cased and split on whitespace:

    * one term   -> prints how many documents contain it and their ids;
    * two terms  -> prints how many documents ``index.proximity_search``
      returns for the pair and their ids (an unindexed term gives 0 documents);
    * otherwise  -> prints a short usage hint.

    Args:
        corpus_path: directory whose files form the corpus.

    Returns:
        The populated ``InvertedIndex``.
    """
    index = InvertedIndex()  # initialise the index
    corpus = index.read_data(corpus_path)  # load every document of the corpus
    index.index_corpus(corpus)  # index documents/corpus

    search_term = input("Enter your query: ")  # insert a query
    # Index keys are lower-cased during pre-processing, so the query has to be
    # lower-cased as well. split() without a separator ignores repeated,
    # leading and trailing whitespace instead of producing empty terms.
    split_into_words = search_term.lower().split()

    if len(split_into_words) == 1:
        # 1) one term: report
        #    a) the number of documents in which the term appears,
        #    b) all document ids in which the term appears.
        print(split_into_words)
        single_term = split_into_words[0]
        if single_term in index.keys():
            documents_matching_the_query = index[single_term]
            print(f"The term appears in *{len(documents_matching_the_query)}* documents")
            print("Document IDs are:", documents_matching_the_query)
        else:
            print("Index does not contain the value")
    elif len(split_into_words) == 2:
        # 2) two terms: report the documents returned by proximity_search.
        # Checking membership first keeps an unindexed term from raising
        # KeyError inside the search.
        if all(term in index.keys() for term in split_into_words):
            documents_matching_the_query = index.proximity_search(*split_into_words)
        else:
            documents_matching_the_query = []
        print(f"The combination of these two term appears in *{len(documents_matching_the_query)}* documents")
        print("Document IDs are:", documents_matching_the_query)
    else:
        print("Please enter one or two search terms")

    return index
    
# Run the interactive demo once and keep the populated index in a
# module-level name so it remains available after main() returns.
index = main()

120
Enter your query: stylish
['stylish']
Index does not contain the value
