Improting Libraries


In [2]:
import os
import math
import numpy as np
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

Loading Documents

In [3]:
# Load documents from the provided review files
def load_documents():
    documents = []
    for i in range(1, 11):  # Looping through review1.txt to review10.txt
        file_path = f'/content/review {i}.txt'  # Dynamically generate file paths
        with open(file_path, 'r') as file:
            documents.append(file.read().strip())  # Read and clean the file content
    return documents

Tokenization

In [4]:
# Tokenization function to preprocess documents and queries: lowercase and split into words
def tokenize(text):
    return text.lower().split()


In [5]:
# Load the documents and tokenize them
docs = load_documents()
print("Documents Loaded:", docs)  # Debugging print to check loaded documents
tokenized_docs = [tokenize(doc) for doc in docs]
print("Tokenized Documents:", tokenized_docs)  # Debugging print to check tokenized documents

Documents Loaded: ['Queen Anne was the first ruler of the newly united Great Britain in the early 1700s; the last Stuart monarch before the Hanoverian dynasty that’s still with us (God save the Queen!). So, there’s your history.\nHere, the widowed, childless, semi-invalid Anne is a lonely, eccentric figure, stuck in the gilded cage of her palace, who leaves the managing of national affairs to her childhood friend, the formidable Sarah Churchill.\nSarah is the wife of John Churchill, head of the armed forces (and of another dynasty that’s still with us). The Duke of Marlborough, as John is also known, is off fighting the French, comme de tradition, and one of Sarah’s jobs is to bully parliament into keeping his war funded.\nSarah, then, is pretty much in charge until the arrival of a impoverished cousin, Abigail, who seeks work as a servant. Abigail then sets about working her way up the hierarchy, all the way to the Queen’s bedchamber.\nThis witty, bawdy and occasionally absurdist stor

Text Preprocessing and Queries

In [6]:
# Download NLTK resources (only need to run once)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Tokenization and Lemmatization Function
def tokenize_and_lemmatize(text):
    tokens = text.lower().split()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

# Queries
queries = ["queen war", "black panther", "superhero child"]
tokenized_queries = [tokenize(query) for query in queries]
print("Tokenized Queries:", tokenized_queries)  # Debugging print to check tokenized queries

# Preprocess documents and queries: lowercase, tokenize, and lemmatize
tokenized_docs = [tokenize_and_lemmatize(doc) for doc in docs]
tokenized_queries = [tokenize_and_lemmatize(query) for query in queries]



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Tokenized Queries: [['queen', 'war'], ['black', 'panther'], ['superhero', 'child']]


Vocabulary Creation

In [8]:
# Build vocabulary (unique words across all documents)
vocab = sorted(set([word for doc in tokenized_docs for word in doc]))
print("Vocabulary:", vocab)  # Debugging print to check vocabulary

Vocabulary: ['(aka', '(and', '(at', '(bradley', '(despite', '(god', '(he', '(i', '(in', '(juno).', '(ok,', '(spears,', '(we', '(yes,', '(you’ll', '1700s;', '1971', '2013', '40', '6/10', '7/10', '8/10', '9/11.', 'a', 'abigail', 'abigail,', 'able', 'about', 'above', 'absurdist', 'abuse', 'accurate', 'action', 'actually,', 'adaptation', 'adapting', 'added', 'adversary', 'affair', 'affinity', 'african', 'after', 'agrees', 'all', 'all.', 'alliance', 'already', 'also', 'amazon', 'american', 'an', 'ancient', 'and', 'anne', 'annoying', 'another', 'antagonist', 'any', 'anywhere.)', 'apostles.', 'apparently', 'are', 'ares,', 'armed', 'armistice.', 'around', 'arrival', 'arrives', 'arse', 'as,', 'asks', 'at', 'attention', 'attention.', 'average', 'away,', 'baby', 'back', 'backlog', 'bad', 'based', 'batman', 'bawdy', 'be', 'beautiful', 'bedchamber.', 'before', 'began.', 'being', 'benefit', 'benefits)', 'best', 'between', 'big', 'biography.', 'bit', 'black', 'blame,”', 'blame.”', 'blast', 'blood?', 

Term Frequency

In [9]:
# Term Frequency (TF) function
def term_frequency(term, document):
    return document.count(term) / len(document)

In [10]:
# Inverse Document Frequency (IDF) function
def inverse_document_frequency(term, all_documents):
    num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
    return math.log(len(all_documents) / (1 + num_docs_containing_term))

Compute TF-IDF

In [11]:
# Compute TF-IDF for a document
def compute_tfidf(document, all_documents, vocab):
    tfidf_vector = []
    for term in vocab:
        tf = term_frequency(term, document)
        idf = inverse_document_frequency(term, all_documents)
        tfidf_vector.append(tf * idf)
    return np.array(tfidf_vector)

Cosine Similarity

In [12]:
# Cosine similarity function
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

TF-IDF Vectors for Documents and Queries

In [13]:
# Calculate TF-IDF vectors for documents and queries
doc_tfidf_vectors = [compute_tfidf(doc, tokenized_docs, vocab) for doc in tokenized_docs]
print("Document TF-IDF Vectors:", doc_tfidf_vectors)  # Debugging print to check TF-IDF vectors for documents

query_tfidf_vectors = [compute_tfidf(query, tokenized_docs, vocab) for query in tokenized_queries]
print("Query TF-IDF Vectors:", query_tfidf_vectors)  # Debugging print to check TF-IDF vectors for queries



Document TF-IDF Vectors: [array([ 0.        ,  0.0035546 ,  0.00825353,  0.        ,  0.00825353,
        0.00825353,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.00825353,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        , -0.00244385,  0.00825353,
        0.00825353,  0.        ,  0.0018291 ,  0.        ,  0.00825353,
        0.        ,  0.00825353,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.00825353,  0.        ,
        0.        ,  0.        ,  0.        ,  0.00469893,  0.        ,
        0.        ,  0.        ,  0.00261962,  0.        ,  0.        ,
        0.        ,  0.        , -0.00195508,  0.01650706,  0.00825353,
        0.00825353,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.00825353,  0.        ,
        0.        ,  0.00825353,  0.  

Cosine Similarities

In [14]:
# Calculate cosine similarities between queries and documents
cosine_similarities = []
for query_vector in query_tfidf_vectors:
    similarities = [cosine_similarity(query_vector, doc_vector) for doc_vector in doc_tfidf_vectors]
    cosine_similarities.append(similarities)
print("Cosine Similarities:", cosine_similarities)  # Debugging print to check cosine similarities

Cosine Similarities: [[0.10317165868935685, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03125103789690823, 0.03125103789690823, 0.0], [0.0, 0.0, 0.08229576901790173, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.049715119392604534, 0.049715119392604534, 0.08320633037072016]]


In [21]:
# Save the results to a text file in descending order as result_lasta.txt
with open('result_lasta.txt', 'w') as result_file:
    for i, query in enumerate(queries):
        result_file.write(f"\nCosine similarities for query '{query}':\n")

        # Create a list of (document_index, similarity_score) tuples
        doc_similarity_pairs = [(j + 1, cosine_similarities[i][j]) for j in range(len(docs))]

        # Sort the list by similarity scores in descending order
        doc_similarity_pairs.sort(key=lambda x: x[1], reverse=True)

        # Write the sorted results to the file
        for doc_index, similarity_score in doc_similarity_pairs:
            result_file.write(f"Document {doc_index}: {similarity_score:.4f}\n")



In [20]:
!cat result_lasta.txt


Cosine similarities for query 'queen war':
Document 1: 0.1032
Document 8: 0.0313
Document 9: 0.0313
Document 2: 0.0000
Document 3: 0.0000
Document 4: 0.0000
Document 5: 0.0000
Document 6: 0.0000
Document 7: 0.0000
Document 10: 0.0000

Cosine similarities for query 'black panther':
Document 3: 0.0823
Document 1: 0.0000
Document 2: 0.0000
Document 4: 0.0000
Document 5: 0.0000
Document 6: 0.0000
Document 7: 0.0000
Document 8: 0.0000
Document 9: 0.0000
Document 10: 0.0000

Cosine similarities for query 'superhero child':
Document 10: 0.0832
Document 8: 0.0497
Document 9: 0.0497
Document 1: 0.0000
Document 2: 0.0000
Document 3: 0.0000
Document 4: 0.0000
Document 5: 0.0000
Document 6: 0.0000
Document 7: 0.0000
