In [26]:
import os
import math
import re
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords



In [27]:
# Fetch the NLTK data packages this notebook depends on (presumably backing
# word_tokenize, stopwords.words and WordNetLemmatizer used below).
# The calls are safe to repeat: the log reports already-installed packages as up-to-date.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [28]:
# Module-level resources shared by clean_text, built once rather than per call.
# STOPWORDS is a set so the `word not in STOPWORDS` filter is a hash lookup.
STOPWORDS = set(stopwords.words('english'))
# Single lemmatizer instance reused for every token.
LEMMATIZER = WordNetLemmatizer()

In [29]:
def clean_text(text):
    """Turn raw text into a list of lemmatized tokens with stop words removed.

    The text is lower-cased, then every character that is not a letter,
    digit or whitespace is deleted outright (it is not replaced by a space,
    so "united's" becomes "uniteds"). The remainder is split with
    ``word_tokenize``; tokens present in ``STOPWORDS`` are discarded and the
    rest are passed through ``LEMMATIZER.lemmatize``.

    Args:
        text: The raw string to normalize.

    Returns:
        A list of lemmatized token strings, in their original order.
    """
    stripped = re.sub(r"[^a-zA-Z0-9\s]", "", text.lower())
    lemmas = []
    for token in word_tokenize(stripped):
        # The stop-word test is applied to the raw token, before lemmatization.
        if token in STOPWORDS:
            continue
        lemmas.append(LEMMATIZER.lemmatize(token))
    return lemmas

def term_frequency(term, document):
    """Return the relative frequency of ``term`` in a tokenized document.

    Args:
        term: The token to count.
        document: A sequence of tokens (anything supporting ``count`` and
            ``len``, e.g. a list of strings).

    Returns:
        ``document.count(term) / len(document)``, or ``0.0`` when the
        document is empty (an empty document contains no occurrences, and
        dividing by its length would raise ZeroDivisionError).
    """
    if not document:
        return 0.0
    return document.count(term) / len(document)

def inverse_document_frequency(term, all_documents):
    """Return the natural-log inverse document frequency of ``term``.

    Args:
        term: The token whose rarity is measured.
        all_documents: A sized collection of tokenized documents; each
            document must support the ``in`` operator.

    Returns:
        ``log(N / df)`` where ``N`` is the number of documents and ``df`` is
        the number of documents containing ``term``. When no document
        contains the term the result is ``0``.
    """
    containing = 0
    for tokens in all_documents:
        if term in tokens:
            containing += 1

    if containing > 0:
        return math.log(len(all_documents) / containing)
    # Unseen terms carry no weight instead of triggering a division by zero.
    return 0

In [30]:
def compute_tfidf(document, all_documents, vocab):
    """Build the TF-IDF vector of one tokenized document over ``vocab``.

    Args:
        document: The token list to vectorize (a corpus document or a
            cleaned query).
        all_documents: The tokenized corpus used to compute IDF.
        vocab: Iterable of terms; the output has one component per term, in
            iteration order.

    Returns:
        A 1-D ``numpy`` array where component ``i`` is
        ``term_frequency(vocab[i], document) * inverse_document_frequency(vocab[i], all_documents)``.
        An empty ``document`` yields an all-zero vector.
    """
    if not document:
        # An empty token list (an empty file, or a query made only of stop
        # words) has no term frequencies; computing them would divide by
        # zero, so return the zero vector directly.
        return np.array([0.0 for _ in vocab])

    weights = [
        term_frequency(term, document) * inverse_document_frequency(term, all_documents)
        for term in vocab
    ]
    return np.array(weights)

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: First vector (array-like accepted by ``np.dot``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)`` when both Euclidean norms are
        strictly positive; otherwise ``0``.
    """
    numerator = np.dot(vec1, vec2)
    magnitude_a, magnitude_b = np.linalg.norm(vec1), np.linalg.norm(vec2)

    if magnitude_a > 0 and magnitude_b > 0:
        return numerator / (magnitude_a * magnitude_b)
    # A zero-length vector has no direction; treat it as dissimilar to everything.
    return 0

In [31]:
def load_txt_files(folder_path):
    """Load and clean every ``.txt`` file directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A ``(documents, filenames)`` pair of parallel lists:
        ``documents[i]`` is the ``clean_text`` token list of the file named
        ``filenames[i]``. Files are returned in sorted filename order.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    documents = []
    filenames = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document order (and the order of tied similarity scores downstream)
    # the same across runs and platforms.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            content = file.read()
        documents.append(clean_text(content))
        filenames.append(filename)
    return documents, filenames

def calculate_precision_at_k(relevant_docs, ranked_docs, k):
    """Compute Precision@k for a ranked result list.

    Args:
        relevant_docs: Container of identifiers judged relevant (membership
            is tested with ``in``).
        ranked_docs: Ranked results, best first; each item is a sequence
            whose first element is the document identifier, e.g.
            ``(filename, score)``.
        k: Cut-off rank.

    Returns:
        The number of relevant documents among the first ``k`` results
        divided by ``k``. Returns ``0.0`` when ``ranked_docs`` is empty or
        ``k`` is not positive (a non-positive cut-off selects no results and
        would otherwise divide by zero or yield a negative value).
    """
    if k <= 0 or not ranked_docs:
        return 0.0
    hits = sum(1 for doc in ranked_docs[:k] if doc[0] in relevant_docs)
    # Dividing by k (not by the number retrieved) penalizes short result lists.
    return hits / k

In [32]:
# Free-text queries evaluated by main(). Each one is passed through clean_text
# before being vectorized, so case and punctuation (e.g. the apostrophes
# below) do not survive into the query terms.
queries = [
    "United Legacy",
    "Red Devils' Glory",
    "Theatre of Dreams",
    "Champions United",
    "Glory Days",
    "Manchester Triumph",
    "United Legends",
    "Old Trafford Heroes",
    "United's Rise",
    "Legacy of United"
]


In [33]:
def _shorten_filename(filename, max_length=48, keep=25):
    """Abbreviate a filename longer than ``max_length`` to ``head...tail``."""
    if len(filename) > max_length:
        return f"{filename[:keep]}...{filename[-keep:]}"
    return filename


def main(folder_path=r'C:\Users\Asus\Desktop\JN\VSM', result_path="Rohan_Result.txt", k=5, relevance=None):
    """Rank the corpus against every entry in ``queries`` and report the top hits.

    For each query the documents are ranked by cosine similarity between
    TF-IDF vectors. The top ``k`` results and Precision@k are written to
    ``result_path`` and echoed to stdout, with long filenames abbreviated on
    the console only.

    Args:
        folder_path: Directory containing the ``.txt`` corpus. The default is
            a machine-specific absolute path; pass your own location when
            running elsewhere.
        result_path: Report file. It is overwritten on every call so that
            re-running does not accumulate stale results.
        k: Number of top-ranked documents to report; must be positive.
        relevance: Optional mapping ``query -> iterable of relevant
            filenames``. When omitted, every loaded file is treated as
            relevant, which makes Precision@k equal 1.0 whenever at least
            ``k`` documents exist; supply real judgments for a meaningful
            metric.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    documents, filenames = load_txt_files(folder_path)

    # Terms that occur only in a query get an IDF of 0 (no corpus document
    # contains them), so they add nothing to any dot product or norm. The
    # vocabulary and the document vectors therefore depend on the corpus
    # alone and are built once instead of once per query.
    vocab = sorted({word for doc in documents for word in doc})
    doc_vectors = [compute_tfidf(doc, documents, vocab) for doc in documents]

    file_lines = []
    console_lines = []
    for query in queries:
        query_vector = compute_tfidf(clean_text(query), documents, vocab)
        similarities = sorted(
            (
                (name, cosine_similarity(query_vector, doc_vector))
                for name, doc_vector in zip(filenames, doc_vectors)
            ),
            key=lambda pair: pair[1],
            reverse=True,
        )

        if relevance is None:
            relevant_docs = set(filenames)
        else:
            relevant_docs = set(relevance.get(query, ()))
        precision_at_k = calculate_precision_at_k(relevant_docs, similarities, k)

        header = [f"Results for query: '{query}'", f"Precision@{k}: {precision_at_k:.4f}"]
        file_lines.extend(header)
        console_lines.extend(header)
        for title, similarity in similarities[:k]:
            file_lines.append(f"Document: {title}, Similarity: {similarity:.4f}")
            # Only the console copy is abbreviated; the file keeps full names.
            console_lines.append(
                f"Document: {_shorten_filename(title)}, Similarity: {similarity:.4f}"
            )
        # Blank separator line between queries.
        file_lines.append("")
        console_lines.append("")

    # Single write in "w" mode with an explicit encoding: filenames may
    # contain characters outside the platform's default code page.
    with open(result_path, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in file_lines)

    print(f"Results written to '{result_path}'.")
    for line in console_lines:
        print(line)

# Run the evaluation when this cell (or the exported script) is executed
# directly; importing the module elsewhere does not trigger it.
if __name__ == "__main__":
    main()

Results written to 'Rohan_Result.txt'.
Results for query: 'United Legacy'
Precision@5: 1.0000
Document: Foundations of Manchester United (1878-1902).txt, Similarity: 0.0242
Document: Dominance in the 2000s.txt, Similarity: 0.0101
Document: The Early Years.txt, Similarity: 0.0097
Document: The 1970 Struggles and Transition.txt, Similarity: 0.0096
Document: Post-Ferguson Era (2013-2020).txt, Similarity: 0.0095

Results for query: 'Red Devils' Glory'
Precision@5: 1.0000
Document: The Rebuilding Phase (2020-2022).txt, Similarity: 0.1953
Document: Dominance in the 2000s.txt, Similarity: 0.0000
Document: Foundations of Manchester United (1878-1902).txt, Similarity: 0.0000
Document: Post-Ferguson Era (2013-2020).txt, Similarity: 0.0000
Rebuilding and the First ...n Triumph (1958-1968).txt, Similarity: 0.0000

Results for query: 'Theatre of Dreams'
Precision@5: 1.0000
Document: Dominance in the 2000s.txt, Similarity: 0.0000
Document: Foundations of Manchester United (1878-1902).txt, Similarity