Q1

In [1]:
import os
import nltk
import random
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stopword list, materialised once as a set so the membership tests in
# preprocess_text and create_positional_index are constant-time.
# NOTE(review): stopwords.words() presumably needs the NLTK 'stopwords' corpus
# to be installed locally (e.g. nltk.download('stopwords')) — confirm on a
# fresh environment.
stop_words = set(stopwords.words('english'))

def preprocess_text(text, print_steps=False):
    """Reduce raw text to a space-separated string of lowercase content words.

    Stages, in order: lowercase, tokenize with ``word_tokenize``, keep only
    purely alphabetic tokens, drop tokens found in the module-level
    ``stop_words`` set.

    Args:
        text: The raw text to process.
        print_steps: When true, print the text after every stage.

    Returns:
        The surviving tokens joined by single spaces.
    """
    def show(label, value):
        # Verbose mode only; token lists are rendered space-joined.
        if print_steps:
            print(label, value if isinstance(value, str) else ' '.join(value))

    show("Original text:\n", text)

    lowered = text.lower()
    show("\nAfter lowercase:\n", lowered)

    raw_tokens = word_tokenize(lowered)
    show("\nAfter tokenization:\n", raw_tokens)

    # isalpha() rejects punctuation, numbers and mixed tokens in one pass.
    alphabetic = [tok for tok in raw_tokens if tok.isalpha()]
    show("\nAfter removing punctuation and blank space tokens:\n", alphabetic)

    content_words = [tok for tok in alphabetic if tok not in stop_words]
    show("\nAfter removing stopwords:\n", content_words)

    return ' '.join(content_words)

# Source directory of the raw documents and destination directory for the
# preprocessed copies.
folder_path = '/Users/mj/Desktop/Work/Sem6/IR/Ass1/text_files'
new_folder_path = '/Users/mj/Desktop/Work/Sem6/IR/Ass1/newtextfiles'

# Five distinct file numbers in 1..999 whose intermediate preprocessing stages
# are printed. No seed is set, so the selection differs from run to run.
sample_files = random.sample(range(1, 1000), 5)

for i in range(1, 1000):
    file_path = os.path.join(folder_path, f'file{i}.txt')
    new_file_path = os.path.join(new_folder_path, f'newfile{i}.txt')
    if not os.path.isfile(file_path):
        # Missing numbers in the file sequence are skipped silently.
        continue

    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    print_steps = i in sample_files
    if print_steps:
        print(f"\nProcessing and printing transitions for file{i}.txt...")
    processed_text = preprocess_text(text, print_steps=print_steps)

    # Create the destination on demand: opening a file for writing inside a
    # directory that does not exist raises FileNotFoundError.
    os.makedirs(new_folder_path, exist_ok=True)
    with open(new_file_path, 'w', encoding='utf-8') as new_file:
        new_file.write(processed_text)



Processing and printing transitions for file136.txt...
Original text:
 Just Hung up one of my Strats. Quickly mounted on the wall above my piano and it seems to be quite secure. I have some heavier guitars and for them I would probably use Toggle bolts instead of the included Screws and mollies but for most of my lighter weight Fenders this is perfect.
I just ordered a second one to go over my bar, I a glad to have found this item.

After lowercase:
 just hung up one of my strats. quickly mounted on the wall above my piano and it seems to be quite secure. i have some heavier guitars and for them i would probably use toggle bolts instead of the included screws and mollies but for most of my lighter weight fenders this is perfect.
i just ordered a second one to go over my bar, i a glad to have found this item.

After tokenization:
 just hung up one of my strats . quickly mounted on the wall above my piano and it seems to be quite secure . i have some heavier guitars and for them i would

Q2

In [2]:
import os
import pickle

def create_inverted_index(folder_path):
    """Build a term -> list-of-filenames inverted index from a folder.

    Every file in ``folder_path`` whose name ends in ``.txt`` is read as UTF-8
    and split on whitespace; each distinct word is mapped to the names of the
    files it occurs in.

    Args:
        folder_path: Directory containing the (already preprocessed) documents.

    Returns:
        dict mapping each word to a list of filenames. A filename appears at
        most once per word; filenames are listed in the order ``os.listdir``
        yields them.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be read.
    """
    inverted_index = {}
    for filename in os.listdir(folder_path):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            words = file.read().split()
        # dict.fromkeys de-duplicates while keeping first-occurrence order, so
        # each filename is appended exactly once per word without scanning the
        # (potentially long) posting list for every token.
        for word in dict.fromkeys(words):
            inverted_index.setdefault(word, []).append(filename)
    return inverted_index

# Rebind folder_path to the directory holding the preprocessed documents and
# build the in-memory inverted index over it.
folder_path = '/Users/mj/Desktop/Work/Sem6/IR/Ass1/newtextfiles'
inverted_index = create_inverted_index(folder_path)


In [3]:
# Serialise the inverted index to inverted_index.pkl; the relative path is
# resolved against the current working directory.
with open('inverted_index.pkl', 'wb') as outfile:
    pickle.dump(inverted_index, outfile)


In [4]:
# Read the pickled index back into a separate name (loaded_inverted_index).
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load pickles this notebook produced itself.
with open('inverted_index.pkl', 'rb') as infile:
    loaded_inverted_index = pickle.load(infile)


In [5]:
def query_and(doc_list1, doc_list2):
    """Boolean AND: documents that appear in both posting lists.

    Duplicates are collapsed; the order of the returned list is unspecified.
    """
    left = set(doc_list1)
    right = set(doc_list2)
    return list(left.intersection(right))

def query_or(doc_list1, doc_list2):
    """Boolean OR: documents that appear in at least one of the posting lists.

    Duplicates are collapsed; the order of the returned list is unspecified.
    """
    left = set(doc_list1)
    right = set(doc_list2)
    return list(left.union(right))

def query_and_not(doc_list1, doc_list2):
    """Boolean AND NOT: documents in the first posting list but not the second.

    Duplicates are collapsed; the order of the returned list is unspecified.
    """
    kept = set(doc_list1)
    excluded = set(doc_list2)
    return list(kept.difference(excluded))

def query_or_not(doc_list1, doc_list2, all_docs):
    """Boolean OR NOT: the first posting list, plus every document outside the second.

    ``all_docs`` is the universe against which the complement of
    ``doc_list2`` is taken. Duplicates are collapsed; the order of the
    returned list is unspecified.
    """
    universe = set(all_docs)
    complement = universe.difference(set(doc_list2))
    return list(complement.union(set(doc_list1)))


In [6]:
def execute_query(inverted_index, query_terms, operations):
    """Evaluate a boolean query strictly left to right over an inverted index.

    The query is ``query_terms[0] operations[0] query_terms[1] operations[1]
    ...`` with no operator precedence: each operator combines the running
    result with the posting list of the next term.

    Args:
        inverted_index: Mapping of term -> list of document names.
        query_terms: Query terms in order; terms missing from the index are
            treated as matching no documents.
        operations: Operators between consecutive terms. Recognised values
            are 'AND', 'OR', 'AND NOT' and 'OR NOT'; any other value leaves
            the running result unchanged. Operators without a following term
            are ignored.

    Returns:
        A new list of matching document names (order unspecified). An empty
        ``query_terms`` yields an empty list.
    """
    if not query_terms:
        return []

    # Copy so callers can never mutate the index's own posting list through
    # the returned value (relevant when no operator is applied).
    result = list(inverted_index.get(query_terms[0], []))

    # Universe for 'OR NOT': every document named anywhere in the index.
    # Built lazily, and as a set, because it is only needed for that operator
    # and a flat list would repeat each document once per term it contains.
    all_docs = None

    # zip() pairs each operator with the term on its right and stops at the
    # shorter sequence, so surplus operators cannot index past the terms.
    for operation, term in zip(operations, query_terms[1:]):
        next_term_docs = inverted_index.get(term, [])
        if operation == 'AND':
            result = query_and(result, next_term_docs)
        elif operation == 'OR':
            result = query_or(result, next_term_docs)
        elif operation == 'AND NOT':
            result = query_and_not(result, next_term_docs)
        elif operation == 'OR NOT':
            if all_docs is None:
                all_docs = {
                    doc
                    for postings in inverted_index.values()
                    for doc in postings
                }
            result = query_or_not(result, next_term_docs, all_docs)
    return result


In [10]:
import pickle

# Rebind the global inverted_index to the copy stored in inverted_index.pkl.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load pickles this notebook produced itself.
with open('inverted_index.pkl', 'rb') as infile:
    inverted_index = pickle.load(infile)


def main():
    """Prompt for N boolean queries and print the matching documents for each.

    For every query the user enters the query text and a comma-separated list
    of operators (e.g. ``AND, OR NOT``). The text is run through
    ``preprocess_text`` and the surviving terms are combined left to right
    with the operators by ``execute_query`` against the global
    ``inverted_index``.

    Raises:
        ValueError: If the number of queries is not an integer.
    """
    N = int(input("Enter number of queries: "))
    for i in range(N):
        query = input("Enter query: ")
        raw_operations = input("Enter operations: ")
        # Split on the comma alone and normalise each operator, so "AND,OR NOT",
        # "and, or  not" and a blank line are all accepted. The original
        # ", " split turned a blank line into [''].
        operations = [
            ' '.join(part.split()).upper()
            for part in raw_operations.split(',')
            if part.strip()
        ]

        preprocessed_query = preprocess_text(query)
        query_terms = preprocessed_query.split()

        if not query_terms:
            # Every token was punctuation, a number or a stopword: nothing can
            # be looked up, so report an empty result instead of failing on
            # query_terms[0].
            print(f"Query {i+1}: ")
            print(f"Number of documents retrieved for query {i+1}: 0")
            print(f"Names of the documents retrieved for query {i+1}: ")
            continue

        # Preprocessing may drop terms; an operator with no term to its right
        # cannot be applied, so keep only as many operators as there are gaps.
        operations = operations[:len(query_terms) - 1]

        results = execute_query(inverted_index, query_terms, operations)
        results = sorted(results)

        print(f"Query {i+1}: {' '.join([query_terms[0]] + [op + ' ' + term for op, term in zip(operations, query_terms[1:])])}")
        print(f"Number of documents retrieved for query {i+1}: {len(results)}")
        print(f"Names of the documents retrieved for query {i+1}: {', '.join(results)}")

# Start the interactive boolean-query loop when this code is executed as the
# top-level module (not when it is imported).
if __name__ == "__main__":
    main()

Query 1: coffee AND brewing OR NOT techniques OR cookbook
Number of documents retrieved for query 1: 999
Names of the documents retrieved for query 1: newfile1.txt, newfile10.txt, newfile100.txt, newfile101.txt, newfile102.txt, newfile103.txt, newfile104.txt, newfile105.txt, newfile106.txt, newfile107.txt, newfile108.txt, newfile109.txt, newfile11.txt, newfile110.txt, newfile111.txt, newfile112.txt, newfile113.txt, newfile114.txt, newfile115.txt, newfile116.txt, newfile117.txt, newfile118.txt, newfile119.txt, newfile12.txt, newfile120.txt, newfile121.txt, newfile122.txt, newfile123.txt, newfile124.txt, newfile125.txt, newfile126.txt, newfile127.txt, newfile128.txt, newfile129.txt, newfile13.txt, newfile130.txt, newfile131.txt, newfile132.txt, newfile133.txt, newfile134.txt, newfile135.txt, newfile136.txt, newfile137.txt, newfile138.txt, newfile139.txt, newfile14.txt, newfile140.txt, newfile141.txt, newfile142.txt, newfile143.txt, newfile144.txt, newfile145.txt, newfile146.txt, newfile1

Q3

In [18]:
import os
import pickle
from nltk.tokenize import word_tokenize
from collections import defaultdict

def create_positional_index(folder_path):
    """Build a word -> {filename -> [positions]} index from a folder of .txt files.

    Each ``.txt`` file is read as UTF-8, lowercased and tokenized with
    ``word_tokenize``; tokens that are not purely alphabetic or that are in
    the module-level ``stop_words`` set are discarded. Positions are 0-based
    offsets into the filtered token sequence of the file.

    Args:
        folder_path: Directory containing the documents.

    Returns:
        A ``defaultdict(dict)`` whose values are ``defaultdict(list)`` objects
        mapping a filename to the ascending positions of the word in it.
    """
    positional_index = defaultdict(dict)
    for filename in os.listdir(folder_path):
        if not filename.endswith('.txt'):
            continue
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            content = file.read()
        kept_tokens = [
            token
            for token in word_tokenize(content.lower())
            if token.isalpha() and token not in stop_words
        ]
        for position, word in enumerate(kept_tokens):
            # .get() avoids triggering the outer default factory, so a new
            # word always receives a defaultdict(list) rather than a dict.
            postings = positional_index.get(word)
            if postings is None:
                postings = positional_index[word] = defaultdict(list)
            postings[filename].append(position)
    return positional_index

def save_positional_index(index, file_name='positional_index.pkl'):
    """Pickle ``index`` into ``file_name``, replacing any existing file.

    Args:
        index: The object to serialise.
        file_name: Destination path; defaults to 'positional_index.pkl'.
    """
    with open(file_name, mode='wb') as sink:
        pickle.dump(index, sink)

def load_positional_index(file_name='positional_index.pkl'):
    """Unpickle and return the object stored in ``file_name``.

    Args:
        file_name: Path of the pickle file; defaults to 'positional_index.pkl'.

    Returns:
        Whatever object the file contains.
    """
    # Unpickling can run arbitrary code: only use this on files that were
    # written by save_positional_index.
    with open(file_name, mode='rb') as source:
        restored = pickle.load(source)
    return restored

def phrase_query_search(index, query):
    """Return the documents in which the query words occur consecutively, in order.

    The query is lowercased and tokenized with ``word_tokenize``. A document
    matches when there is a single start position ``p`` such that the k-th
    query word (0-based) occurs at position ``p + k`` for every k.

    Args:
        index: Positional index mapping word -> {document -> [positions]}.
            Both plain dicts and the defaultdict-based structure produced by
            ``create_positional_index`` are accepted; the index is never
            modified by a search.
        query: The phrase to look for.

    Returns:
        A list of matching document names (empty if the query has no tokens
        or nothing matches).
    """
    query_words = word_tokenize(query.lower())
    if not query_words:
        return []

    # .get() keeps every lookup read-only. Subscripting a defaultdict-based
    # index would insert empty entries for unknown words and documents, which
    # then show up as false hits in later single-word queries.
    first_postings = index.get(query_words[0], {})
    if len(query_words) == 1:
        # Skip documents whose position list is empty: they do not actually
        # contain the word.
        return [doc for doc, positions in first_postings.items() if positions]

    # Candidate phrase start positions per document, seeded by the first word.
    candidates = {
        doc: set(positions)
        for doc, positions in first_postings.items()
        if positions
    }

    for offset, word in enumerate(query_words[1:], 1):
        word_postings = index.get(word, {})
        narrowed = {}
        for doc, starts in candidates.items():
            positions = word_postings.get(doc)
            if not positions:
                continue
            # Keep only the starts that this word also confirms. Narrowing the
            # same start set for every word guarantees that all words line up
            # on one occurrence of the phrase rather than on unrelated
            # occurrences of the first word.
            surviving = starts.intersection(pos - offset for pos in positions)
            if surviving:
                narrowed[doc] = surviving
        candidates = narrowed
        if not candidates:
            break
    return list(candidates)

def main():
    """Build, persist and reload the positional index, then answer phrase queries.

    The index is created from the preprocessed corpus directory, written with
    ``save_positional_index`` and read back with ``load_positional_index``.
    The user is then asked for a number of phrase queries; each one is passed
    through ``preprocess_text`` before being searched, and the matching
    document names are printed.
    """
    folder_path = '/Users/mj/Desktop/Work/Sem6/IR/Ass1/newtextfiles'
    save_positional_index(create_positional_index(folder_path))
    loaded_index = load_positional_index()

    N = int(input("Enter number of phrase queries: "))
    for i in range(N):
        # Search for the preprocessed phrase, not the raw input.
        preprocessed_query = preprocess_text(input(f"Enter phrase query {i+1}: "))
        results = phrase_query_search(loaded_index, preprocessed_query)

        print(f"Query {i+1}: '{preprocessed_query}'")
        print(f"Number of documents retrieved: {len(results)}")
        if not results:
            print("No documents contain the given phrase.")
            continue
        print(f"Documents containing the phrase: {', '.join(results)}")

# Start the interactive phrase-query loop when this code is executed as the
# top-level module (not when it is imported).
if __name__ == "__main__":
    main()


Query 1: 'acoustic bass'
Number of documents retrieved: 2
Documents containing the phrase: newfile3.txt, newfile279.txt
