In [1]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to C:\Users\Kritika
[nltk_data]     Panta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Kritika
[nltk_data]     Panta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Kritika
[nltk_data]     Panta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Kritika
[nltk_data]     Panta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import os, re, string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

STOPWORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()

In [3]:
# Load text files 
def load_text_files(folder_path):
    data = {}
    doc_id_to_filename = {}
    doc_id = 0
    print(f"Scanning folder: {folder_path}")
    for filename in os.listdir(folder_path):
        if filename.lower() == "queries.txt":
            continue
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
                data[doc_id] = content
                doc_id_to_filename[doc_id] = filename
                print(f"Loaded doc_id {doc_id} -> {filename}")
                doc_id += 1

    print(f"Total documents loaded: {len(data)}")
    return data, doc_id_to_filename

In [4]:
# Text cleaning
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = word_tokenize(text)

    cleaned_tokens = [
        LEMMATIZER.lemmatize(word)
        for word in tokens
        if word not in STOPWORDS and len(word) > 1
    ]
    return " ".join(cleaned_tokens)

In [5]:
# Generate queries file
def generate_queries_file(output_file):

    queries = [
        "social crimes in Nepal",
        "wildlife attacks on highways",
        "rise in suicide cases",
        "elephant attacks and highway safety",
        "drug abuse and narcotics cases",
        "Apple M5 AI performance",
        "police awareness programs",
        "increase in criminal cases in Kathmandu",
        "ray tracing and graphics in M5",
        "rise in bank fraud incidents",
        "attempt to murder cases Nepal",
        "rape and sexual violence statistics",
        "polygamy crime cases in Nepal",
        "human wildlife conflict in Nepal",
        "forensic investigation Nepal",
        "road accident deaths in Nepal"
    ]

    with open(output_file, "w", encoding="utf-8") as f:
        for q in queries:
            f.write(q + "\n")

    print(f"\nGenerated queries.txt at: {output_file}")
    return queries

In [6]:
# Load Queries From File
def load_queries(query_file):
    queries = []
    print(f"\nLoading queries from: {query_file}")

    with open(query_file, "r", encoding="utf-8") as f:
        for line in f:
            q = line.strip()
            if q:
                queries.append(q)
                print(f"Loaded query: {q}")

    return queries

In [7]:
# Compute TF-IDF + Cosine Similarity
def compute_similarity(doc_texts, queries, doc_id_to_filename, output_file="similarity_results.txt"):
    print("\nComputing TF-IDF vectors...")

    vectorizer = TfidfVectorizer()

    doc_vectors = vectorizer.fit_transform(doc_texts)

    with open(output_file, "w", encoding="utf-8") as result_file:

        for query in queries:
            cleaned_query = clean_text(query)
            query_vector = vectorizer.transform([cleaned_query])
            scores = cosine_similarity(query_vector, doc_vectors)[0]
            ranking = sorted(
                list(enumerate(scores)),
                key=lambda x: x[1],
                reverse=True
            )
            print(f"\nQuery: {query}")
            result_file.write(f"\nQuery: {query}\n")
            result_file.write("="*60 + "\n")

            for rank, (doc_id, score) in enumerate(ranking, start=1):
                filename = doc_id_to_filename[doc_id]
                output_line = f"Rank {rank}: {filename}  |  Similarity = {score:.4f}"
                print(output_line)
                result_file.write(output_line + "\n")

            result_file.write("\n")

    print(f"\nSimilarity results saved to: {output_file}")

In [8]:
# Main Program
def main():

    folder_path = r"C:\Users\Kritika Panta\Desktop\Week three assignment"
    query_file = os.path.join(folder_path, "queries.txt")

    generate_queries_file(query_file)

    data, doc_id_to_filename = load_text_files(folder_path)

    cleaned_docs = []
    for doc_id, content in data.items():
        cleaned = clean_text(content)
        cleaned_docs.append(cleaned)
        print(f"Cleaned Doc {doc_id}: {cleaned[:80]}...")

    queries = load_queries(query_file)

    compute_similarity(cleaned_docs, queries, doc_id_to_filename)

if __name__ == "__main__":
    main()


Generated queries.txt at: C:\Users\Kritika Panta\Desktop\Week three assignment\queries.txt
Scanning folder: C:\Users\Kritika Panta\Desktop\Week three assignment
Loaded doc_id 0 -> 336 Cases of Violence against Women and Girls Committed During Lockdown.txt
Loaded doc_id 1 -> Apple unleashes M5, the next big leap in AI performance for Apple silicon.txt
Loaded doc_id 2 -> Incidents of social crime rise to 162 a day in Nepal.txt
Loaded doc_id 3 -> Social crimes are rising across the country, says police.txt
Loaded doc_id 4 -> Social crimes on the rise, but hard to intercept, police say.txt
Loaded doc_id 5 -> Wildlife risks grow along forested Rautahat highways.txt
Total documents loaded: 6
Cleaned Doc 0: case violence woman girl committed lockdown may may kathmandu lockdown worec doc...
Cleaned Doc 1: october apple unleashes next big leap ai performance apple silicon delivers peak...
Cleaned Doc 2: incident social crime rise day nepal rising nepal thu december staff reporterkat...
Cleaned