In [15]:
import zipfile
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from math import log, sqrt
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download NLTK resources if not already available
nltk.download('punkt')
nltk.download('stopwords')

# Preprocess function with stopword removal
def preprocess(text):
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]  # Remove punctuation and stopwords
    return tokens

def read_documents(zip_path):
    documents = {}
    with zipfile.ZipFile(zip_path, 'r') as z:
        for filename in z.namelist():
            with z.open(filename) as f:
                documents[filename] = preprocess(f.read().decode('utf-8'))
    return documents

def compute_tf_idf(query, documents):
    N = len(documents)
    df = defaultdict(int)

    # Calculate df for each term in documents
    for doc in documents.values():
        unique_terms = set(doc)
        for term in unique_terms:
            df[term] += 1

    # Compute TF-IDF for the query
    query_weights = {}
    for term in query:
        tf = 1 + log(query.count(term)) if query.count(term) > 0 else 0
        idf = log(N / df[term]) if df[term] > 0 else 0
        query_weights[term] = tf * idf

    # Normalize the query vector
    query_magnitude = sqrt(sum(weight ** 2 for weight in query_weights.values()))
    query_weights_normalized = {term: weight / query_magnitude for term, weight in query_weights.items() if weight > 0}

    # Calculate TF for each document and cosine similarity
    scores = []
    for doc_name, doc in documents.items():
        doc_tf = defaultdict(float)
        for term in doc:
            tf = 1 + log(doc.count(term)) if doc.count(term) > 0 else 0
            doc_tf[term] += tf

        # Normalize document
        doc_magnitude = sqrt(sum(weight ** 2 for weight in doc_tf.values()))
        if doc_magnitude > 0:
            normalized_doc_tf = {term: weight / doc_magnitude for term, weight in doc_tf.items()}
        else:
            normalized_doc_tf = {}

        # Compute cosine similarity
        similarity = sum(query_weights_normalized.get(term, 0) * normalized_doc_tf.get(term, 0) for term in query_weights_normalized)
        scores.append((doc_name, similarity))

    # Rank documents based on scores
    ranked_documents = sorted(scores, key=lambda x: x[1], reverse=True)
    return ranked_documents[:10]  # Top 10 documents


zip_path = 'Corpus.zip'
documents = read_documents(zip_path)

# User query input
user_query = input("Enter your query: ")
query_tokens = preprocess(user_query)

# Compute scores and get top documents
top_documents = compute_tf_idf(query_tokens, documents)

# Print results
print("Top 10 documents:")
for doc, score in top_documents:
    print(f"{doc}: {score:.4f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter your query: ‘Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation
Top 10 documents:
zomato.txt: 0.5903
swiggy.txt: 0.1695
instagram.txt: 0.1083
flipkart.txt: 0.0516
messenger.txt: 0.0460
Amazon.txt: 0.0330
reddit.txt: 0.0328
paypal.txt: 0.0266
Discord.txt: 0.0240
nike.txt: 0.0239
