In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

In [3]:
folder_path ='/content/drive/MyDrive/IR dataset/text_files'
os.chdir(folder_path)

In [4]:
# List all text files in the folder
file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.txt')]


In [5]:
# Display the list of file paths
#file_paths

In [6]:
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re


In [7]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# Create a new directory for preprocessed files
preprocessed_folder = '/content/drive/MyDrive/IR dataset/preprocessed_files'
os.makedirs(preprocessed_folder, exist_ok=True)

In [9]:
# Preprocessing function
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove ellipses ("...") and words with apostrophes attached ("'m", "'ve", etc.)
    text = re.sub(r'\.\.\.', '', text)
    text = re.sub(r'\s*\'[a-z]*\s*', ' ', text)


    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]

    # Remove blank space tokens
    tokens = [token for token in tokens if token.strip() != '']

    # Join the tokens back into a string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text



In [None]:
# Function to preprocessing files
def preprocess_all_files(file_paths, preprocessing_function):

    processed_count = 0

    for file_path in file_paths:
        with open(file_path, 'r') as file:
            content_before = file.read()

            if processed_count < 5:
              print(f"File Content Before Preprocessing ({file_path}):")
              print(content_before)

            # Perform preprocessing
            preprocessed_content = preprocessing_function(content_before)

            # Save the preprocessed content to a new file in the preprocessed folder
            output_file_name = os.path.basename(file_path).replace('.txt', '_preprocessed.txt')
            output_path = os.path.join(preprocessed_folder, output_file_name)

            with open(output_path, 'w') as output_file:
                output_file.write(preprocessed_content)

            if processed_count < 5:
              print(f"\nFile Content After Preprocessing ({output_path}):")
              print(preprocessed_content)
              print("\n" + "="*50 + "\n")

            processed_count += 1

# Call the function to preprocessing
preprocess_all_files(file_paths, preprocess_text)

File Content Before Preprocessing (/content/drive/MyDrive/IR dataset/text_files/file890.txt):
My 3rd Joyo Pedal, I'm falling in love with that company, solid great sounding pedals for a fraction of other brands.
Be advised, the effect of this pedal is very subtle... I use it with my mustang V, which already models other amps, this pedal just makes it sound way more realistic and adds some dynamics to your playing.
 Totally worth it for anybody that wants to improve their tone... You have to have a good ear though.
I've read some reviews of people hooking this pedal before their amp input... since this pedal has it's own pre amp, this way of hooking it up will produce some noise.
 Connect it to your FX send return.
I also own the American sound .... both great... depending on my mood I play them both equally... You just can't go wrong with Joyo

File Content After Preprocessing (/content/drive/MyDrive/IR dataset/preprocessed_files/file890_preprocessed.txt):
3rd joyo pedal falling love c

In [10]:
import pickle

In [11]:
# Set paths
original_files_path = '/content/drive/MyDrive/IR dataset/text_files'
preprocessed_files_path = '/content/drive/MyDrive/IR dataset/preprocessed_files'

In [12]:
def create_inverted_index(folder_path):
    inverted_index = {}
    file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.txt')]

    for file_path in file_paths:
        with open(file_path, 'r') as file:
            content = file.read()
            tokens = word_tokenize(content)

            for token in set(tokens):  # Use set to remove duplicate tokens in a document
                file_name = os.path.basename(file_path).replace('_preprocessed.txt', '.txt')

                if token not in inverted_index:
                    inverted_index[token] = {file_name}
                else:
                    inverted_index[token].add(file_name)

    return inverted_index

In [13]:
# Function to save inverted index using pickle
def save_inverted_index(inverted_index, filename):
    with open(filename, 'wb') as file:
        pickle.dump(inverted_index, file)


In [15]:
# Function to load inverted index using pickle
def load_inverted_index(filename):
    with open(filename, 'rb') as file:
        inverted_index = pickle.load(file)
    return inverted_index

In [14]:
# Create unigram inverted index and save it
inverted_index = create_inverted_index(preprocessed_files_path)
save_inverted_index(inverted_index, '/content/drive/MyDrive/IR dataset/inverted_index.pkl')


KeyboardInterrupt: 

In [16]:
# Load the inverted index
loaded_inverted_index = load_inverted_index('/content/drive/MyDrive/IR dataset/inverted_index.pkl')

In [17]:

# Print the first few entries of the loaded inverted index for verification
for term, postings in list(loaded_inverted_index.items())[:5]:
    print(f"{term}: {postings}")

mat: {'file523.txt', 'file2.txt'}
around: {'file796.txt', 'file244.txt', 'file559.txt', 'file268.txt', 'file174.txt', 'file73.txt', 'file14.txt', 'file825.txt', 'file118.txt', 'file187.txt', 'file578.txt', 'file163.txt', 'file934.txt', 'file686.txt', 'file159.txt', 'file901.txt', 'file957.txt', 'file712.txt', 'file738.txt', 'file790.txt', 'file994.txt', 'file802.txt', 'file196.txt', 'file325.txt', 'file326.txt', 'file986.txt', 'file821.txt', 'file404.txt', 'file5.txt', 'file47.txt', 'file849.txt', 'file31.txt', 'file839.txt', 'file558.txt', 'file911.txt', 'file915.txt', 'file953.txt', 'file886.txt', 'file109.txt', 'file880.txt', 'file541.txt', 'file43.txt', 'file70.txt', 'file170.txt', 'file366.txt', 'file124.txt', 'file100.txt', 'file833.txt', 'file256.txt', 'file2.txt', 'file363.txt', 'file6.txt', 'file248.txt', 'file524.txt', 'file830.txt', 'file294.txt', 'file343.txt', 'file951.txt', 'file679.txt', 'file35.txt', 'file993.txt'}
enough: {'file815.txt', 'file930.txt', 'file302.txt', '

In [18]:
def preprocess_query(query):
    return preprocess_text(query)

In [19]:
def convert_to_boolean_query(query_terms, logical_operators):
    boolean_query = ''

    for i, term in enumerate(query_terms):
        if i > 0:
            boolean_query += f' {logical_operators[i-1]} '

        boolean_query += term

    return boolean_query

In [20]:
# Function to perform AND operation between two sets of document IDs
def perform_and_operation(set1, set2):
    return set1.intersection(set2)

In [21]:
# Function to perform OR operation between two sets of document IDs
def perform_or_operation(set1, set2):
    return set1.union(set2)


In [22]:
# Function to perform AND NOT operation between two sets of document IDs
def perform_and_not_operation(set1, set2):
    return set1.difference(set2)

In [23]:
# Function to perform OR NOT operation between two sets of document IDs
def perform_or_not_operation(set1, set2, all_documents):
    return all_documents.difference(set2).union(set1)

In [33]:
def execute_query(query, inverted_index, all_documents):
    # Split the query into terms and operators
    query_parts = re.findall(r'\b\w+\b|[^\w\s]', query)
# Split the query into terms and operators
    query_parts = re.findall(r'\b\w+\b|[^\w\s]', query)

    # Combine consecutive operator terms into a single entity
    combined_query_parts = []
    current_operator = None
    for part in query_parts:
        if part.isupper():
            # Capitalized term, likely an operator
            if current_operator is None:
                current_operator = part
            else:
                current_operator += ' ' + part
        else:
            # Lowercase term, add current_operator if present
            if current_operator is not None:
                combined_query_parts.append(current_operator)
                current_operator = None
            combined_query_parts.append(part)

    # If an operator is present at the end, add it
    if current_operator is not None:
        combined_query_parts.append(current_operator)

    #print(combined_query_parts)

    # Initialize the result set with all documents
    result_set = set(all_documents)

    operator = None
    temp_result_set = set()  # Temporary set for intermediate results

    for part in combined_query_parts:#['coffee', 'AND', 'brewing', 'OR NOT', 'techniques', 'OR', 'cookbook']
        if part in ['AND', 'OR', 'NOT','AND NOT', 'OR NOT']:
            operator = part
        else:
            # Retrieve the inverted index for the next term
            term = part
            term_set = set(inverted_index.get(term, {}))


            #print(term_set)

            # Perform operations based on the operator
            if operator == 'AND':
                temp_result_set = perform_and_operation(result_set, term_set)
            elif operator == 'OR':
                temp_result_set = perform_or_operation(result_set, term_set)
            elif operator == 'NOT':
                temp_result_set = perform_and_not_operation(result_set, term_set)
            elif operator == 'AND NOT':
                temp_result_set = perform_and_not_operation(result_set, term_set)
            elif operator == 'OR NOT':
                temp_result_set = perform_or_not_operation(result_set, term_set,set(all_documents))
            else:
                 temp_result_set=term_set

            # Update the main result set
            result_set = temp_result_set.copy()
            #print(result_set)

    # Store the results for this query
    result = {
        'query_text': query,
        'num_documents_retrieved': len(result_set),
        'documents_retrieved': list(result_set)
    }

    return result


In [None]:
# Sample queries
# queries = [
#     'They are as good quality wise as much more expensive',#good quality wise much expensive
#     'AND,AND,AND,AND',
#     'Car bag in a canister',
#     'OR, AND NOT'
# ]

In [26]:
# Take user input
N = int(input("Enter the number of queries (N): "))
queries = []

Enter the number of queries (N): 2


In [27]:
for _ in range(N):
    query_sequence = input("Enter the input sequence: ")
    operations = input("Enter operations separated by comma: ")
    queries.extend([query_sequence, operations])

Enter the input sequence: They are as good quality wise as much more expensive
Enter operations separated by comma: AND,AND,AND,AND
Enter the input sequence: Car bag in a canister
Enter operations separated by comma: OR, AND NOT


In [36]:
# Preprocess and execute queries
for i in range(0, len(queries), 2):
    preprocessed_query = preprocess_query(queries[i])

    # Split the preprocessed query into terms
    query_terms = preprocessed_query.split()

    # Split the logical operators
    logical_operators = queries[i + 1].split(',')

    # Convert to boolean query
    boolean_query = convert_to_boolean_query(query_terms, logical_operators)

    #print(f"\nConverted boolean query for query {i // 2 + 1}: {boolean_query}")

     # Get the list of all documents from the inverted index
    all_documents = set()
    for term, document_set in loaded_inverted_index.items():
        all_documents.update(document_set)

    # Execute query
    query_result = execute_query(boolean_query, loaded_inverted_index, all_documents)

    # Print the result
    print(f"Query: {query_result['query_text']}")
    print(f"Number of documents retrieved: {query_result['num_documents_retrieved']}")
    print(f"Names of the documents retrieved: {', '.join(query_result['documents_retrieved'])}\n")

Query: good AND quality AND wise AND much AND expensive
Number of documents retrieved: 1
Names of the documents retrieved: file8.txt

Query: car OR bag  AND NOT canister
Number of documents retrieved: 31
Names of the documents retrieved: file956.txt, file886.txt, file3.txt, file573.txt, file174.txt, file459.txt, file73.txt, file698.txt, file930.txt, file542.txt, file118.txt, file699.txt, file682.txt, file746.txt, file864.txt, file466.txt, file665.txt, file313.txt, file686.txt, file892.txt, file264.txt, file942.txt, file738.txt, file166.txt, file797.txt, file981.txt, file863.txt, file404.txt, file363.txt, file860.txt, file780.txt



In [37]:
# Function to create unigram inverted index
def create_positional_index(folder_path):
    positional_index = {}
    file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.txt')]

    for file_path in file_paths:
        with open(file_path, 'r') as file:
            content = file.read()
            tokens = word_tokenize(content)

            for position, token in enumerate(tokens):
                file_name = os.path.basename(file_path).replace('_preprocessed.txt', '.txt')

                if token not in positional_index:
                    positional_index[token] = {file_name: [position]}
                else:
                    if file_name not in positional_index[token]:
                        positional_index[token][file_name] = [position]
                    else:
                        positional_index[token][file_name].append(position)

    return positional_index


In [38]:
# Function to save inverted index using pickle
def save_positional_index(positional_index, filename):
    with open(filename, 'wb') as file:
        pickle.dump(positional_index, file)


In [39]:
# Function to load inverted index using pickle
def load_positional_index(filename):
    with open(filename, 'rb') as file:
        positional_index = pickle.load(file)
    return positional_index

In [None]:
# Create unigram inverted index and save it
positional_index = create_positional_index(preprocessed_files_path)
save_positional_index(positional_index, '/content/drive/MyDrive/IR dataset/positional_index.pkl')

In [40]:

# Load the inverted index
loaded_positional_index = load_positional_index('/content/drive/MyDrive/IR dataset/positional_index.pkl')

In [41]:

# Print the first few entries of the loaded inverted index for verification
for term, postings in list(loaded_positional_index.items())[:5]:
    print(f"{term}: {postings}")

3rd: {'file890.txt': [0], 'file444.txt': [34], 'file507.txt': [66], 'file130.txt': [33]}
joyo: {'file890.txt': [1, 68], 'file369.txt': [6], 'file342.txt': [65], 'file947.txt': [18], 'file513.txt': [52], 'file836.txt': [6, 28], 'file222.txt': [37]}
pedal: {'file890.txt': [2, 14, 22, 43, 47], 'file470.txt': [19, 26], 'file410.txt': [21], 'file390.txt': [55, 66, 72, 77, 91, 100], 'file453.txt': [27, 41], 'file47.txt': [6], 'file369.txt': [1], 'file305.txt': [52, 56, 70], 'file665.txt': [75, 102], 'file342.txt': [4, 24, 33, 47, 57], 'file668.txt': [0, 16], 'file521.txt': [41], 'file865.txt': [1, 25], 'file333.txt': [4, 6, 8], 'file325.txt': [3, 57, 71], 'file19.txt': [39, 48], 'file429.txt': [16], 'file947.txt': [13, 34], 'file684.txt': [0, 17], 'file610.txt': [23], 'file79.txt': [8, 22, 27, 38, 67], 'file883.txt': [36, 46], 'file258.txt': [9, 20], 'file629.txt': [3, 24, 40], 'file423.txt': [2], 'file941.txt': [27], 'file614.txt': [17, 21], 'file12.txt': [22], 'file968.txt': [2, 29], 'file

In [90]:
def execute_positional_query(query, positional_index):
    # Split the query into terms
    query_terms = re.findall(r'\b\w+\b', query)

    # Initialize the result set with all documents containing the first term
    first_term = query_terms[0]
    result_set = set(positional_index.get(first_term, {}).keys())

    # Iterate through the rest of the terms
    for term in query_terms[1:]:
        term_positions = positional_index.get(term, {})
        #print(term_positions)

        # Ensure the documents in the result set have the correct positional relationship
        result_set_copy = result_set.copy()  # Create a copy before iterating
        for doc in result_set_copy:
            if doc not in term_positions:
                result_set.discard(doc)
            else:
                positions_current = term_positions[doc]
                positions_previous = positional_index.get(query_terms[query_terms.index(term) - 1], {}).get(doc, [])

                # Check if there is a position in current term that is greater than positions of the previous term
                if not any(pos_current - pos_previous == 1 for pos_current in positions_current for pos_previous in positions_previous):
                    result_set.discard(doc)

    # Store the results for this query
    result = {
        'query_text': query,
        'num_documents_retrieved': len(result_set),
        'documents_retrieved': list(result_set) if result_set is not None else []
    }

    return result


In [98]:
# Take input from user
num_queries = int(input("Enter the number of queries: "))
queries = []

Enter the number of queries: 2


In [99]:
for _ in range(num_queries):
    query = input("Enter a phrase query: ")
    queries.append(query)

Enter a phrase query: its more on toy side than on instrument side, and made in Indonesia.
Enter a phrase query: They are as good quality wise as much more expensive


In [100]:
# Preprocess the queries
preprocessed_queries = [preprocess_query(q) for q in queries]
print(preprocessed_queries)

['toy side instrument side made indonesia', 'good quality wise much expensive']


In [91]:
# # Sample queries
# queries = [
#     'its more on toy side than on instrument side, and made in Indonesia.',
#'They are as good quality wise as much more expensive'
#     #'great price good quality'
#]

In [101]:
# Execute queries
for i, query in enumerate(preprocessed_queries):
    query_result = execute_positional_query(query, loaded_positional_index)
    print(f"Number of documents retrieved for query {i + 1} using positional index: {query_result['num_documents_retrieved']}")
    print(f"Names of documents retrieved for query {i + 1} using positional index: {', '.join(query_result['documents_retrieved'])}")

Number of documents retrieved for query 1 using positional index: 1
Names of documents retrieved for query 1 using positional index: file6.txt
Number of documents retrieved for query 2 using positional index: 1
Names of documents retrieved for query 2 using positional index: file8.txt
