Importing Libraries


In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

import os
import string
import logging
import re
from collections import defaultdict, Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Define stopwords and lemmatizer
STOPWORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()

print("Libraries imported and resources downloaded successfully.")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Libraries imported and resources downloaded successfully.


Loading Datasets

In [3]:
# Load the dataset (documents) from the text files
documents = []

# Generate file paths and open each document using a for loop
for i in range(1, 6):
    file_path = f'/content/review {i}.txt'
    with open(file_path, 'r') as file:
        documents.append(file.read())

# Print each document to verify it is loaded correctly
for i, doc in enumerate(documents, 1):
    print(f"Document {i}: {doc[:200]}...\n")  # Displaying only first 200 characters


Document 1: Queen Anne was the first ruler of the newly united Great Britain in the early 1700s; the last Stuart monarch before the Hanoverian dynasty that’s still with us (God save the Queen!). So, there’s your ...

Document 2: Ethan Hunt and his slightly creaky IMF crew are on the trail of some plutonium before it finds its way into the hands of terrorists called The Apostles. They’re an offshoot of a group formed by Ethan’...

Document 3: Since being hit by a mystic meteorite in ancient times, the African kingdom of Wakanda has developed a high-tech civilisation while maintaining traditional ways (spears, dancing, colourful clothing, s...

Document 4: 
In a galaxy far, far away, for the past 40 years, the Rebel Alliance has kept destroying the evil Empire’s First Order’s big toys. But the Supreme Leader is still convinced a crushing victory is just...

Document 5: The Perron family move into a large and run-down house – a deceased estate – in 1971 and discover that the deceased owne

Text Cleaning

In [5]:
# Function to clean and preprocess text (lowercase, tokenization, stopwords removal, and lemmatization)
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove non-alphabetic characters using regular expression
    text = re.sub(r'\W+', ' ', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize tokens
    tokens = [LEMMATIZER.lemmatize(token) for token in tokens if token not in STOPWORDS]

    return tokens

# Apply the cleaning function to each document in the dataset
cleaned_documents = [clean_text(doc) for doc in documents]

# Print the cleaned documents to verify
for i, doc in enumerate(cleaned_documents, 1):
    print(f"Cleaned Document {i}: {doc[:20]}...\n")  # Displaying only the first 20 tokens


Cleaned Document 1: ['queen', 'anne', 'first', 'ruler', 'newly', 'united', 'great', 'britain', 'early', '1700s', 'last', 'stuart', 'monarch', 'hanoverian', 'dynasty', 'still', 'u', 'god', 'save', 'queen']...

Cleaned Document 2: ['ethan', 'hunt', 'slightly', 'creaky', 'imf', 'crew', 'trail', 'plutonium', 'find', 'way', 'hand', 'terrorist', 'called', 'apostle', 'offshoot', 'group', 'formed', 'ethan', 'old', 'adversary']...

Cleaned Document 3: ['since', 'hit', 'mystic', 'meteorite', 'ancient', 'time', 'african', 'kingdom', 'wakanda', 'developed', 'high', 'tech', 'civilisation', 'maintaining', 'traditional', 'way', 'spear', 'dancing', 'colourful', 'clothing']...

Cleaned Document 4: ['galaxy', 'far', 'far', 'away', 'past', '40', 'year', 'rebel', 'alliance', 'kept', 'destroying', 'evil', 'empire', 'first', 'order', 'big', 'toy', 'supreme', 'leader', 'still']...

Cleaned Document 5: ['perron', 'family', 'move', 'large', 'run', 'house', 'deceased', 'estate', '1971', 'discover', 'deceased', 

NLTK Setup and Preprocessing

In [6]:
# Function to preprocess each document
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text into words
    words = word_tokenize(text)

    # Remove stopwords and lemmatize the words
    words = [LEMMATIZER.lemmatize(word) for word in words if word not in STOPWORDS and word.isalpha()]

    return words

# Apply preprocessing to each document in the collection
preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Display preprocessed documents to check results
for i, doc in enumerate(preprocessed_documents, 1):
    print(f"Preprocessed Document {i}: {doc}\n")


Preprocessed Document 1: ['queen', 'anne', 'first', 'ruler', 'newly', 'united', 'great', 'britain', 'early', 'last', 'stuart', 'monarch', 'hanoverian', 'dynasty', 'still', 'u', 'god', 'save', 'queen', 'history', 'widowed', 'childless', 'semiinvalid', 'anne', 'lonely', 'eccentric', 'figure', 'stuck', 'gilded', 'cage', 'palace', 'leaf', 'managing', 'national', 'affair', 'childhood', 'friend', 'formidable', 'sarah', 'churchill', 'sarah', 'wife', 'john', 'churchill', 'head', 'armed', 'force', 'another', 'dynasty', 'still', 'u', 'duke', 'marlborough', 'john', 'also', 'known', 'fighting', 'french', 'comme', 'de', 'tradition', 'one', 'sarah', 'job', 'bully', 'parliament', 'keeping', 'war', 'funded', 'sarah', 'pretty', 'much', 'charge', 'arrival', 'impoverished', 'cousin', 'abigail', 'seek', 'work', 'servant', 'abigail', 'set', 'working', 'way', 'hierarchy', 'way', 'queen', 'bedchamber', 'witty', 'bawdy', 'occasionally', 'absurdist', 'story', 'court', 'politics', 'great', 'fun', 'despite', 'gr

Inverted Index Construction

In [7]:
# Function to construct the inverted index
def build_inverted_index(docs):
    inverted_index = defaultdict(list)

    # Iterate over each document
    for doc_id, doc in enumerate(docs):
        # Count word frequency in the document
        word_freq = Counter(doc)

        # Add each word to the inverted index with its document ID
        for word in word_freq:
            inverted_index[word].append(doc_id)

    return inverted_index

# Build the inverted index from the preprocessed documents
inverted_index = build_inverted_index(preprocessed_documents)

# Display the inverted index
print("Inverted Index:")
for word, doc_ids in inverted_index.items():
    print(f"{word}: {doc_ids}")


Inverted Index:
queen: [0]
anne: [0]
first: [0, 2, 3]
ruler: [0]
newly: [0]
united: [0]
great: [0]
britain: [0]
early: [0, 1]
last: [0, 1, 3]
stuart: [0]
monarch: [0]
hanoverian: [0]
dynasty: [0]
still: [0, 3]
u: [0]
god: [0]
save: [0]
history: [0, 1]
widowed: [0]
childless: [0]
semiinvalid: [0]
lonely: [0]
eccentric: [0]
figure: [0]
stuck: [0]
gilded: [0]
cage: [0]
palace: [0]
leaf: [0]
managing: [0]
national: [0]
affair: [0]
childhood: [0]
friend: [0]
formidable: [0]
sarah: [0]
churchill: [0]
wife: [0]
john: [0]
head: [0]
armed: [0]
force: [0, 3]
another: [0]
duke: [0]
marlborough: [0]
also: [0, 2]
known: [0]
fighting: [0]
french: [0]
comme: [0]
de: [0]
tradition: [0]
one: [0, 1]
job: [0]
bully: [0]
parliament: [0]
keeping: [0, 3]
war: [0, 3]
funded: [0]
pretty: [0]
much: [0]
charge: [0]
arrival: [0]
impoverished: [0, 2]
cousin: [0]
abigail: [0]
seek: [0]
work: [0]
servant: [0]
set: [0]
working: [0]
way: [0, 1, 2]
hierarchy: [0]
bedchamber: [0]
witty: [0, 3]
bawdy: [0]
occasionally: 

Boolean Query Operation (AND Operation)

In [8]:
# Assuming 'documents' is a dictionary with filenames as keys and preprocessed text as values
# Initialize all_documents with the set of all document IDs
all_documents = set(range(len(documents)))  # If your documents are indexed as 0, 1, 2, ...

# Function for 'AND' query (finds common documents for all terms)
def and_query(terms, inverted_index):
    # Get the initial set of documents that contain the first term
    result = set(inverted_index.get(terms[0], set()))

    # Perform intersection with the document sets for the remaining terms
    for term in terms[1:]:
        result &= set(inverted_index.get(term, set()))  # Intersection with the next term's documents

    return result

# Example usage:
query = "king war"  # Example query
query_terms = preprocess_text(query)  # Preprocess query to get terms

# Perform AND query using the terms
result_docs = and_query(query_terms, inverted_index)

# Display the results
print(f"Documents matching '{query}' using AND operation: {result_docs}")


Documents matching 'king war' using AND operation: set()


Convert doc_ids to File Names

In [14]:
# Mapping document IDs to their respective file names
doc_id_to_filename = {
    0: 'review1.txt',
    1: 'review2.txt',
    2: 'review3.txt',
    3: 'review4.txt',
    4: 'review5.txt'
}

# Function to convert doc_ids to file names
def convert_doc_ids_to_filenames(doc_ids):
    return [doc_id_to_filename[doc_id] for doc_id in doc_ids]

# Example: Convert doc_ids from an AND query to file names
query_terms = ['king', 'war']  # Example terms for AND query
result_docs = and_query(query_terms, inverted_index)
result_filenames = convert_doc_ids_to_filenames(result_docs)

# Display the result
print(f"Documents matching AND query {query_terms}: {result_filenames}")


Documents matching AND query ['king', 'war']: []


Main Function

In [15]:
# Main function to run the entire process
def main():

    # Step 2: Preprocess the documents
    cleaned_documents = [preprocess_text(doc) for doc in documents]

    # Step 3: Build the inverted index
    inverted_index = build_inverted_index(cleaned_documents)

    # Step 4: Perform AND query processing
    query_terms = ['king', 'war']  # Example terms for AND query
    result_docs = and_query(query_terms, inverted_index)
    result_filenames = convert_doc_ids_to_filenames(result_docs)
    print(f"Documents matching AND query {query_terms}: {result_filenames}")

# Call the main function to run everything
if __name__ == "__main__":
    main()


Documents matching AND query ['king', 'war']: []


In [21]:
# Function to write query results to a file, including the author name
def write_results_to_file(results, output_file):
    with open(output_file, 'w') as file:
        # Write the name at the top of the file
        file.write("Reviews by Lasta\n\n")

        # Write the query results
        for result in results:
            file.write(result + '\n')

# Example: Process the query and write results to a file
def process_query(query, inverted_index, documents):
    # Split the query into terms
    query_terms = query.split()

    # Perform the Boolean AND query
    query_results_docs = boolean_and_query(query_terms, inverted_index)

    # Convert document IDs to filenames
    query_results_filenames = convert_doc_ids_to_filenames(query_results_docs)

    # Write the results to a file
    write_results_to_file(query_results_filenames, 'query_results.txt')

    return query_results_filenames


Reverse Dictionary

In [16]:
import os
from collections import defaultdict

# Specify the directory containing the text files
directory = '/content/'

# Step 1: Load the dataset (documents) from the text files in the directory
documents = []
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        with open(os.path.join(directory, filename), 'r') as file:
            documents.append(file.read())

# Step 2: Preprocess the text (tokenization and cleaning)
def preprocess_text(text):
    # Convert to lowercase, remove special characters, tokenize, etc.
    tokens = text.lower().split()
    return tokens

# Step 3: Build the inverted index and reverse dictionary
inverted_index = defaultdict(set)
reverse_dictionary = defaultdict(set)

for doc_id, doc in enumerate(documents):
    tokens = preprocess_text(doc)
    for token in tokens:
        inverted_index[token].add(doc_id)
        reverse_dictionary[doc_id].add(token)

# Step 4: Print the inverted index (term to document mapping)
print("Inverted Index:")
for term, doc_ids in inverted_index.items():
    print(f"{term}: {doc_ids}")

# Step 5: Print the reverse dictionary (document to term mapping)
print("\nReverse Dictionary:")
for doc_id, terms in reverse_dictionary.items():
    print(f"Document {doc_id}: {terms}")


Inverted Index:
the: {0, 1, 2, 3, 4}
perron: {0}
family: {0}
move: {0}
into: {0, 1, 2, 3}
a: {0, 1, 2, 3, 4}
large: {0}
and: {0, 1, 2, 3, 4}
run-down: {0}
house: {0}
–: {0, 2}
deceased: {0}
estate: {0}
in: {0, 1, 2, 3, 4}
1971: {0}
discover: {0}
that: {0, 1, 2, 4}
owners: {0}
seem: {0}
not: {0, 2, 4}
to: {0, 1, 2, 3, 4}
have: {0}
moved: {0}
out: {0, 3}
yet.: {0}
clocks: {0}
stop,: {0}
doors: {0}
creak,: {0}
things: {0, 4}
go: {0}
bump: {0}
night,: {0}
various: {0}
perrons: {0}
get: {0}
thumped: {0}
etc.: {0}
tired: {0}
of: {0, 1, 2, 3, 4}
being: {0, 4}
demonised,: {0}
they: {0}
call: {0}
demonologists: {0}
ed: {0}
lorraine: {0}
warren.: {0}
usual: {0, 3}
haunted-house: {0}
shenanigans: {0}
ensue.: {0}
this: {0, 1, 2, 3, 4}
is: {0, 1, 2, 3, 4}
supposedly: {0}
based: {0}
on: {0, 2, 3}
true: {0}
story,: {0, 4}
which: {0}
possibly: {0}
explains: {0}
why: {0, 2}
don’t: {0}
too: {0, 4}
extreme.: {0}
fact: {0}
it’s: {0, 2, 3, 4}
bit: {0}
slow: {0}
at: {0, 3}
times,: {0, 4}
script: {0}
somewha