In [None]:
import os
import random
import lxml
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
from typing import List, Dict, Tuple, Union
import string
# nltk.download('punkt')
# nltk.download('stopwords')

# Absolute path of the 'Dataset' folder under the current working directory
# (presumably the unmodified document collection - confirm against usage).
ORIGINAL_PATH = os.path.join(os.getcwd(), 'Dataset')
# Absolute path of the 'DatasetAlter' folder under the current working directory
# (presumably an altered copy of the collection - confirm against usage).
ALTERED_PATH = os.path.join(os.getcwd(), 'DatasetAlter')


In [None]:
import os
import random
# Vocabulary that the synthetic test documents are sampled from (it contains repeated words).
word_list = [
	'Number', 'of', 'words', 'in', 'a', 'document', 'is', 'not', 'fixed', 'and', 'can', 'vary', 'from', 'document', 'to', 'document', 'but', 'the',
	'average', 'number', 'of', 'words', 'in', 'a', 'document', 'is', 'around', '200', 'to', '300', 'words', 'and', 'the', 'maximum', 'number', 'of', 'words', 'in', 'a', 'document', 'is', 'around', '1000', 'words',
]
path = os.path.join(os.getcwd(), 'DS2')
# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs(path, exist_ok=True)

# Write ten files named a01 .. a10, each holding ten words picked at distinct positions of word_list.
for i2 in range(10):
	# Seeding with the loop index makes the content of every file reproducible across runs.
	random.seed(i2)
	random_indexes = random.sample(range(0, len(word_list)), 10)
	filename1 = "a" + str(i2+1).zfill(2)

	with open(os.path.join(path, filename1), 'w') as f:
		# Every word is followed by one space, including the last word of the file.
		f.write("".join(word_list[j2] + " " for j2 in random_indexes))

In [None]:
class BigramIndex:
	"""
	Biword (bigram) inverted index over a directory of text documents.

	``self.index`` maps a biword ``"w1 w2"`` (two consecutive normalized tokens of a
	document, joined by one space) to a tuple ``(frequency, postings)`` where
	``postings`` is the sorted list of distinct file names containing the biword and
	``frequency`` is the length of that list.
	"""

	def __init__(self, path: str) -> None:
		"""
		Initialize the BigramIndex object and build the index right away.
		:param path: The path to the collection of documents.
		"""

		self.docs: int = 0
		self.index: Dict[str, Tuple[int, List[str]]] = {}
		self.PATH: str = path
		self.buildIndex()


	def buildIndex(self) -> None:
		"""
		Build the bigram inverted index by processing each document in the collection.
		Process includes lower-casing, tokenizing, removing stopwords, punctuations and blank spaces.
		Any previously built index is replaced, so calling this method again rebuilds from scratch.
		"""

		# Loop invariants: build the stopword set and the punctuation table once
		# instead of once per token; the set also makes membership tests O(1).
		stop_words = set(stopwords.words("english"))
		strip_punctuation = str.maketrans("", "", string.punctuation)

		# biword -> set of file names; a set removes duplicates as they are added.
		postings: Dict[str, set] = {}
		docs = 0

		for filename in os.listdir(self.PATH):
			file_path = os.path.join(self.PATH, filename)
			# Skip sub-directories (e.g. notebook checkpoint folders): open() fails on them.
			if not os.path.isfile(file_path):
				continue
			docs += 1

			with open(file_path, 'r') as f:
				content = f.read()

			# Tokenize and lower case
			tokens = word_tokenize(content.lower())
			# Remove stopwords
			tokens = [token for token in tokens if token not in stop_words]
			# Remove punctuation
			tokens = [token.translate(strip_punctuation) for token in tokens]
			# Remove tokens that are empty or blank once punctuation is stripped
			tokens = [token for token in tokens if token.strip()]

			# Add biwords (pairs of consecutive tokens) to the index
			for first, second in zip(tokens, tokens[1:]):
				postings.setdefault(first + " " + second, set()).add(filename)

		# Finalize once, after all documents are read: sorted unique postings plus
		# their count. Doing this per document made the build quadratic and left a
		# count that depended on the order in which files were listed.
		self.docs = docs
		self.index = {biword: (len(names), sorted(names)) for biword, names in postings.items()}

		print("Finished Building Index")


	def getPostingList(self, term: str) -> Tuple[int, List[str]]:
		"""
		Get the posting list for a biword.
		:param term: The biword ("w1 w2") to get the posting list for.
		:return: The stored (frequency, posting list) tuple, or (0, []) if the biword is not indexed.
		"""

		return self.index.get(term, (0, []))

	def singleWordPostingList(self, term: str) -> Tuple[int, List[str]]:
		"""
		Get the posting list for a single word.
		:param term: The word to look up; it must equal one of the two words of a biword to match.
		:return: The number of matching documents and the sorted list of their names.
		"""

		matches = set()
		# Collect the documents of every biword that has the term as one of its words.
		# Whole-word comparison keeps "fox" from matching "foxy".
		for biword, (_, names) in self.index.items():
			if term in biword.split(" "):
				# update() merges the names; adding the list itself would be unhashable.
				matches.update(names)

		# Sort unique postings
		posting_list = sorted(matches)

		return len(posting_list), posting_list


	def getTotalDocs(self) -> int:
		"""
		Get the total number of documents in the collection.
		:return: The number of documents read by the last index build.
		"""

		return self.docs


	def _resolvePostings(self, term: Union[str, List[str]]) -> List[str]:
		"""
		Turn a queryAND operand into a posting list.
		:param term: Either a posting list (returned as is) or a biword to look up.
		:return: The posting list, empty when the biword is not indexed.
		"""

		if isinstance(term, list):
			return term
		return self.index.get(term, (0, []))[1]


	def queryAND(self, term1: Union[str, List[str]], term2: Union[str, List[str]]) -> Tuple[List[str], int]:
		"""
		Perform a boolean AND query on the bigram inverted index.
		:param term1: The first biword to query or a sorted posting list.
		:param term2: The second biword to query or a sorted posting list.
		:return: A list of document names that contain both terms and the number of comparisons made.
		"""

		postings1 = self._resolvePostings(term1)
		postings2 = self._resolvePostings(term2)

		# An unknown biword or an empty list cannot intersect with anything.
		if not postings1 or not postings2:
			return [], 0

		# Two-pointer merge; relies on both lists being sorted in ascending order.
		comparisons = 0
		result = []
		i = 0
		j = 0
		while i < len(postings1) and j < len(postings2):
			comparisons += 1
			if postings1[i] == postings2[j]:
				result.append(postings1[i])
				i += 1
				j += 1
			elif postings1[i] < postings2[j]:
				i += 1
			else:
				j += 1

		return result, comparisons


	def queryProcess(self, query: str) -> Tuple[List[str], int]:
		"""
		Run a phrase query against the index. Only AND semantics are supported.
		The query is lower-cased and split on whitespace; a query "A B C D" is evaluated as
		"A B" AND "B C" AND "C D". A one-word query falls back to singleWordPostingList.
		:param query: The words to search for, separated by whitespace.
		:return: List of document names that match the query and the number of comparisons made.
		"""

		# The index is built from lower-cased text, so normalize the query the same way.
		# split() without arguments also ignores repeated, leading and trailing blanks.
		words = query.lower().split()

		if not words:
			return [], 0
		if len(words) == 1:
			return self.singleWordPostingList(words[0])[1], 0

		result = None
		comparisons = 0

		for first, second in zip(words, words[1:]):
			biword = first + " " + second
			if result is None:
				# First biword: start from a copy of its postings so callers cannot
				# mutate the list stored in the index.
				result = list(self.getPostingList(biword)[1])
			else:
				result, subcomparisons = self.queryAND(result, biword)
				comparisons += subcomparisons

			# Once the intersection is empty it stays empty; the remaining biwords
			# must not restart the query.
			if not result:
				return [], comparisons

		return result, comparisons

