<a href="https://colab.research.google.com/github/MohitKhetan10/Vector-Space-Model-with-TF-IDF-for-Document-Retrieval/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Welcome to Colab!

In [1]:
# ============================
import numpy as np
import math
import nltk
import os
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (only needed once)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
from google.colab import files

# Upload multiple files (e.g., your dataset text files)
uploaded = files.upload()

Saving document_9.txt to document_9.txt
Saving document_8.txt to document_8.txt
Saving document_7.txt to document_7.txt
Saving document_6.txt to document_6.txt
Saving document_5.txt to document_5.txt
Saving document_4.txt to document_4.txt
Saving document_3.txt to document_3.txt
Saving document_2.txt to document_2.txt
Saving document_10.txt to document_10.txt
Saving document_1.txt to document_1.txt


In [3]:
docs = []
for filename in uploaded.keys():
    with open(filename, 'r', encoding='utf-8') as file:
        docs.append(file.read())

print(f"Loaded {len(docs)} documents.")

Loaded 10 documents.


In [6]:
queries = [
    "Deep Learning",
    "Data mining",
    "Machine learning",
    "Computer vision",
    "Artificial Intelligence",
    "Human"
]

# ============================
# 4. Text Preprocessing
# ============================
def tokenize(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # remove special chars
    text = re.sub(r"\d+", "", text)             # remove digits
    return text.split()

tokenized_docs = [tokenize(doc) for doc in docs]
tokenized_queries = [tokenize(query) for query in queries]

# Build vocabulary
vocab = list(set(term for doc in tokenized_docs for term in doc))

print("Vocabulary size:", len(vocab))

Vocabulary size: 231


In [8]:
def term_frequency(term, document):
    return document.count(term) / len(document) if len(document) > 0 else 0

# ============================
# 6. Inverse Document Frequency (IDF)
# ============================
def inverse_document_frequency(term, all_documents):
    num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
    return math.log(len(all_documents) / (1 + num_docs_containing_term))

# ============================
# 7. Compute TF-IDF
# ============================
def compute_tfidf(document, all_documents, vocab):
    tfidf_vector = []
    for term in vocab:
        tf = term_frequency(term, document)
        idf = inverse_document_frequency(term, all_documents)
        tfidf_vector.append(tf * idf)
    return np.array(tfidf_vector)

# ============================
# 8. Cosine Similarity
# ============================
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2) if norm_vec1 and norm_vec2 else 0

# ============================
# 9. Calculate TF-IDF Vectors
# ============================
doc_tfidf_vectors = [compute_tfidf(doc, tokenized_docs, vocab) for doc in tokenized_docs]
query_tfidf_vectors = [compute_tfidf(query, tokenized_docs, vocab) for query in tokenized_queries]

# ============================
# 10. Results & Ranking
# ============================
# Save results into a text file
with open("result_Week3.txt", "w") as result_file:
    for i, query_vector in enumerate(query_tfidf_vectors):
        similarities = [cosine_similarity(query_vector, doc_vector) for doc_vector in doc_tfidf_vectors]
        ranked_docs = sorted(enumerate(similarities), key=lambda x: x[1], reverse=True)[:3]

        result_str = f"\nTop 3 results for query '{queries[i]}':\n"
        for rank, (doc_index, score) in enumerate(ranked_docs, 1):
            result_str += f"Rank {rank}: Document {doc_index + 1} with score {score:.4f}\n"

        print(result_str)
        result_file.write(result_str)

# ============================
# 11. Download Results
# ============================
files.download("result_Week3.txt")


Top 3 results for query 'Deep Learning':
Rank 1: Document 9 with score 0.2588
Rank 2: Document 8 with score 0.0956
Rank 3: Document 7 with score 0.0852


Top 3 results for query 'Data mining':
Rank 1: Document 1 with score 0.6618
Rank 2: Document 4 with score 0.0366
Rank 3: Document 2 with score 0.0327


Top 3 results for query 'Machine learning':
Rank 1: Document 10 with score 0.1791
Rank 2: Document 4 with score 0.1321
Rank 3: Document 9 with score 0.1158


Top 3 results for query 'Computer vision':
Rank 1: Document 6 with score 0.4380
Rank 2: Document 7 with score 0.0652
Rank 3: Document 1 with score 0.0000


Top 3 results for query 'Artificial Intelligence':
Rank 1: Document 8 with score 0.5767
Rank 2: Document 1 with score 0.0000
Rank 3: Document 2 with score 0.0000


Top 3 results for query 'Human':
Rank 1: Document 8 with score 0.1220
Rank 2: Document 7 with score 0.1088
Rank 3: Document 1 with score 0.0000



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>