In [1]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pickle
from itertools import product

# Ensure NLTK resources are downloaded.
# 'punkt' and 'stopwords' are presumably the data packages behind
# word_tokenize and stopwords.words used below -- confirm against the
# installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nalishjain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nalishjain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def preprocess_text(text):
    """Normalise raw text into a space-separated string of index terms.

    The text is lower-cased and tokenised with ``word_tokenize``. English stop
    words, tokens made up solely of punctuation characters, and blank tokens
    are then discarded.

    Args:
        text: Raw document or query string.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    kept_tokens = []
    # The text is lower-cased once up front, so tokens need no further
    # lower-casing before the stop-word lookup.
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        # Check character by character: a substring test against
        # string.punctuation lets multi-character punctuation tokens such as
        # '...' or '--' through. all() is True for an empty token, so empty
        # tokens are dropped here as well.
        if all(char in punctuation_chars for char in token):
            continue
        if token.strip() == '':
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

# Pre-process every input file and write the result next to it in out_files.
# The first few files are also echoed before/after as a visual sanity check.
text_files_prefix = "/Users/nalishjain/Documents/GitHub/CSE508_Winter2024_A1_2021543/text_files/file"
out_files_prefix = "/Users/nalishjain/Documents/GitHub/CSE508_Winter2024_A1_2021543/out_files/file"
preview_count = 5  # number of leading files whose text is printed

# `doc_id` rather than `id`: a module-level `id` would shadow the builtin for
# every later cell in the kernel session.
for doc_id in range(1, 1000):
    input_file_path = text_files_prefix + str(doc_id) + ".txt"
    output_file_path = out_files_prefix + str(doc_id) + "processed" + ".txt"
    show_preview = doc_id <= preview_count

    with open(input_file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    if show_preview:
        print("Text before pre-processing")
        print(text)

    preprocessed_text = preprocess_text(text)
    if show_preview:
        print("Text after pre-processing")
        print(preprocessed_text)
        print()

    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(preprocessed_text)

Text before pre-processing
Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
Text after pre-processing
loving vintage springs vintage strat good tension great stability floating bridge want springs way go

Text before pre-processing
Works great as a guitar bench mat. Not rugged enough for abuse but if you take care of it, it will take care of you. Makes organization of workspace much easier because screws won't roll around. Color is good too.
Text after pre-processing
works great guitar bench mat rugged enough abuse take care take care makes organization workspace much easier screws wo n't roll around color good

Text before pre-processing
We use these for everything from our acoustic bass down to our ukuleles. I know there is a smaller model available for ukes, violins, etc.; we haven't yet ordered those, but these will work on smaller instr

In [3]:
def create_index(file_path, doc_ids=range(1, 1000)):
    """Build an inverted index mapping each term to the documents containing it.

    Args:
        file_path: Path prefix of the pre-processed files; document ``n`` is
            read from ``file_path + str(n) + "processed.txt"``.
        doc_ids: Iterable of document ids to index. Defaults to 1..999, the
            range that was previously hard-coded.

    Returns:
        dict mapping term -> set of ids of the documents containing that term.
    """
    index_dict = {}
    for doc_id in doc_ids:
        processed_file_path = file_path + str(doc_id) + "processed" + ".txt"
        with open(processed_file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        # split() with no argument returns [] for an empty document, whereas
        # split(" ") returns [''] and would index the empty string as a term.
        for word in set(text.split()):
            index_dict.setdefault(word, set()).add(doc_id)
    return index_dict

def create_positional_index(file_path, doc_ids=range(1, 1000)):
    """Build a positional index: term -> {document id -> [token positions]}.

    Positions are 0-based offsets into the whitespace-split pre-processed text.

    Args:
        file_path: Path prefix of the pre-processed files; document ``n`` is
            read from ``file_path + str(n) + "processed.txt"``.
        doc_ids: Iterable of document ids to index. Defaults to 1..999, the
            range that was previously hard-coded.

    Returns:
        dict mapping term -> dict mapping document id -> list of positions, in
        increasing order, at which the term occurs in that document.
    """
    position_index_dict = {}
    for doc_id in doc_ids:
        processed_file_path = file_path + str(doc_id) + "processed" + ".txt"
        with open(processed_file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        # split() with no argument returns [] for an empty document, whereas
        # split(" ") returns [''] and would record '' as a term at position 0.
        for position, word in enumerate(text.split()):
            postings = position_index_dict.setdefault(word, {})
            postings.setdefault(doc_id, []).append(position)

    return position_index_dict
    

In [4]:
# Build both indexes from the pre-processed files, persist each one with
# pickle, then read the pickle straight back and print what was loaded.
processed_prefix = "/Users/nalishjain/Documents/GitHub/CSE508_Winter2024_A1_2021543/out_files/file"

# Inverted index: term -> set of document ids.
index_dict = create_index(file_path=processed_prefix)
file_path = 'index_dictionary.pkl'
with open(file_path, 'wb') as pickle_file:
    pickle.dump(index_dict, pickle_file)
with open(file_path, 'rb') as pickle_file:
    loaded_dict = pickle.load(pickle_file)
print("Loaded Dictionary:", loaded_dict)

# Positional index: term -> {document id -> [positions]}.
position_index_dict = create_positional_index(processed_prefix)
file_path = 'position_index_dictionary.pkl'
with open(file_path, 'wb') as pickle_file:
    pickle.dump(position_index_dict, pickle_file)
with open(file_path, 'rb') as pickle_file:
    loaded_dict = pickle.load(pickle_file)
print("Loaded Dictionary:", loaded_dict)



In [5]:
def take_input(n):
    """Prompt on stdin for ``n`` boolean queries.

    For each query the user enters a sentence, which is pre-processed into
    terms, followed by a comma-separated list of operators.

    Args:
        n: Number of queries to read.

    Returns:
        ``(terms_list, operators_list)``: two parallel lists with one entry
        per query. Each entry of ``terms_list`` is a list of term strings and
        each entry of ``operators_list`` is a list of operator strings.
    """
    terms_list = []
    operators_list = []
    for _ in range(n):
        sentence = input("Enter sentence : ")
        terms_list.append(preprocess_text(sentence).split(" "))

        raw_operators = input("Enter operator : ")
        # Clean each operator once, as it is read (previously every stored
        # list was re-stripped on every iteration). Upper-casing and
        # collapsing inner whitespace lets "and not" or "AND  NOT" match the
        # upper-case operator names execute_query compares against.
        operators_list.append(
            [" ".join(operator.split()).upper() for operator in raw_operators.split(",")]
        )
    return (terms_list, operators_list)
        
def take_input_positional(n):
    """Prompt on stdin for ``n`` phrase queries.

    Each entered sentence is run through ``preprocess_text`` and split on
    single spaces into its terms.

    Args:
        n: Number of queries to read.

    Returns:
        A list with one entry per query, each entry being a list of terms.
    """
    return [
        preprocess_text(input("Enter sentence : ")).split(" ")
        for _ in range(n)
    ]

In [6]:
def execute_query(terms, operators, inverted_index, total_doc_num):
    """Evaluate a boolean query strictly left to right and print the matches.

    ``operators[i]`` combines the running result with ``terms[i + 1]``.

    Args:
        terms: List of query terms.
        operators: List of operator strings, each one of 'AND', 'OR',
            'AND NOT' or 'OR NOT' (matched ignoring case and extra
            whitespace). At least ``len(terms) - 1`` entries are required;
            any further entries are ignored.
        inverted_index: dict mapping term -> set of document ids.
        total_doc_num: Exclusive upper bound of the document ids; the NOT
            operators complement against ``range(1, total_doc_num)``.

    Returns:
        The set of matching document ids (also printed). Empty when ``terms``
        is empty.

    Raises:
        ValueError: If too few operators are supplied, or an operator that is
            needed for evaluation is not one of the four recognised ones.
    """
    recognised = ('AND', 'OR', 'AND NOT', 'OR NOT')
    needed = max(len(terms) - 1, 0)
    if len(operators) < needed:
        raise ValueError(
            f"{len(terms)} terms need {needed} operators, got {len(operators)}"
        )
    # Only the operators that actually join two terms are validated, so a
    # placeholder entry for a single-term query is still accepted.
    normalised = [" ".join(str(op).split()).upper() for op in operators[:needed]]
    unknown = [op for op in normalised if op not in recognised]
    if unknown:
        raise ValueError(f"Unsupported operator(s): {unknown}")

    # Human-readable form of the query, operators shown as they were given.
    query_parts = []
    for position, term in enumerate(terms):
        query_parts.append(term)
        if position < needed:
            query_parts.append(f' {operators[position]} ')
    query_string = "".join(query_parts)

    # Universe for the NOT operators, built once instead of once per operator.
    all_docs = set(range(1, total_doc_num))

    result = set()
    for position, term in enumerate(terms):
        postings = inverted_index.get(term, set())
        if position == 0:
            # Copy so the returned set never aliases the index's own postings.
            result = set(postings)
            continue

        operator = normalised[position - 1]
        if operator == 'AND':
            result = result.intersection(postings)
        elif operator == 'OR':
            result = result.union(postings)
        elif operator == 'AND NOT':
            result = result.intersection(all_docs.difference(postings))
        else:  # 'OR NOT' -- the only remaining validated operator
            result = result.union(all_docs.difference(postings))

    print("*"*20)
    print("Query is : ", query_string)
    print(f"Number of documents retrieved: {len(result)}")
    print("Names of the documents retrieved:", ["file" + str(doc_id) + ".txt" for doc_id in sorted(result)])
    print("*"*20)
    return result
    
def cartesian_product(*lists):
    """Return the Cartesian product of the given lists as a list of tuples.

    The first list varies fastest and the last list slowest. With no
    arguments the result is ``[()]``; if any list is empty the result is
    ``[]``.
    """
    combos = [()]
    # Work from the last list towards the first, prepending one element at a
    # time, so that earlier lists end up cycling fastest.
    for pool in reversed(lists):
        combos = [(item,) + tail for tail in combos for item in pool]
    return combos

def generate_combinations(lists):
    """Return every run of consecutive positions drawn one from each list.

    A tuple ``(p0, p1, ..., pk)`` qualifies when ``p[i]`` comes from
    ``lists[i]`` and every element is exactly one more than its predecessor.

    A run is fully determined by its starting position, so each start in
    ``lists[0]`` is checked directly against the later lists. This avoids
    materialising the full Cartesian product of the position lists, whose
    size grows exponentially with the number of lists.

    Args:
        lists: Sequence of position lists, one per phrase term, in phrase
            order. Positions within a list are expected to be distinct.

    Returns:
        List of qualifying tuples, in the order their starting positions
        appear in ``lists[0]``. ``[()]`` when ``lists`` is empty.
    """
    if not lists:
        return [()]

    first, *rest = lists
    later_positions = [set(positions) for positions in rest]
    run_length = len(lists)

    return [
        tuple(start + offset for offset in range(run_length))
        for start in first
        if all(
            start + offset in candidates
            for offset, candidates in enumerate(later_positions, start=1)
        )
    ]

def execute_positional_query(terms, positional_index):
    """Find the documents containing ``terms`` as a consecutive phrase.

    A document matches when it contains every term and there is at least one
    set of positions, one per term in order, that are consecutive.

    Args:
        terms: List of phrase terms, in phrase order.
        positional_index: dict mapping term -> {document id -> [positions]}.

    Returns:
        The set of matching document ids (also printed). Empty when ``terms``
        is empty.
    """
    # Candidate documents must contain every term of the phrase.
    candidates = None
    for term in terms:
        docs_with_term = set(positional_index.get(term, {}).keys())
        if candidates is None:
            candidates = docs_with_term
        else:
            candidates = candidates.intersection(docs_with_term)
    if candidates is None:
        # No terms at all: nothing can match (previously this crashed).
        candidates = set()

    # Keep only the candidates where the terms occur back to back.
    result = set()
    for doc_id in candidates:
        positions = [positional_index[term][doc_id] for term in terms]
        if len(generate_combinations(positions)) != 0:
            result.add(doc_id)

    print("*"*20)
    print("Query : ",  terms)
    print(f"Number of documents retrieved: {len(result)}")
    print("Names of the documents retrieved:", ["file" + str(doc_id) + ".txt" for doc_id in sorted(result)])
    print("*"*20)
    return result

In [10]:
# Ask how many boolean queries to run, collect them, then evaluate each one
# against the inverted index (document ids are taken from range(1, 1000)).
n = int(input("Enter the number of queries : "))
terms_list, operators_list = take_input(n)
for query_terms, query_operators in zip(terms_list, operators_list):
    execute_query(query_terms, query_operators, index_dict, 1000)

********************
Query is :  great AND boy
Number of documents retrieved: 1
Names of the documents retrieved: ['file461.txt']
********************
********************
Query is :  nalish OR NOT dhruv
Number of documents retrieved: 999
Names of the documents retrieved: ['file1.txt', 'file2.txt', 'file3.txt', 'file4.txt', 'file5.txt', 'file6.txt', 'file7.txt', 'file8.txt', 'file9.txt', 'file10.txt', 'file11.txt', 'file12.txt', 'file13.txt', 'file14.txt', 'file15.txt', 'file16.txt', 'file17.txt', 'file18.txt', 'file19.txt', 'file20.txt', 'file21.txt', 'file22.txt', 'file23.txt', 'file24.txt', 'file25.txt', 'file26.txt', 'file27.txt', 'file28.txt', 'file29.txt', 'file30.txt', 'file31.txt', 'file32.txt', 'file33.txt', 'file34.txt', 'file35.txt', 'file36.txt', 'file37.txt', 'file38.txt', 'file39.txt', 'file40.txt', 'file41.txt', 'file42.txt', 'file43.txt', 'file44.txt', 'file45.txt', 'file46.txt', 'file47.txt', 'file48.txt', 'file49.txt', 'file50.txt', 'file51.txt', 'file52.txt', 'file53

In [17]:
# Ask how many phrase queries to run, collect them, then evaluate each one
# against the positional index.
n = int(input("Enter the number of queries : "))
terms_list = take_input_positional(n)
for phrase_terms in terms_list:
    execute_positional_query(phrase_terms, position_index_dict)

********************
Queey :  ['excellent']
Number of documents retrieved: 42
Names of the documents retrieved: ['file5.txt', 'file648.txt', 'file521.txt', 'file907.txt', 'file656.txt', 'file913.txt', 'file274.txt', 'file786.txt', 'file918.txt', 'file412.txt', 'file796.txt', 'file297.txt', 'file553.txt', 'file682.txt', 'file172.txt', 'file429.txt', 'file48.txt', 'file944.txt', 'file690.txt', 'file819.txt', 'file180.txt', 'file441.txt', 'file706.txt', 'file962.txt', 'file326.txt', 'file202.txt', 'file459.txt', 'file76.txt', 'file332.txt', 'file335.txt', 'file726.txt', 'file348.txt', 'file604.txt', 'file738.txt', 'file99.txt', 'file230.txt', 'file614.txt', 'file108.txt', 'file501.txt', 'file631.txt', 'file762.txt', 'file127.txt']
********************


In [134]:
# One-off phrase lookup with hand-written terms; prints whatever the query
# function returns.
phrase_terms = ['works', 'great', 'guitar']
result = execute_positional_query(phrase_terms, position_index_dict)
print(result)

[[0], [1], [2]]
[(0, 1, 2)]
{2}
