In [None]:
import pymongo 
# import fitz  # PyMuPDF
# import re
# from sklearn.feature_extraction.text import TfidfVectorizer # import sklearn


Connect to the MongoDB database


In [None]:
# Read the connection string from the MONGODB_URI environment variable so that
# credentials never have to be pasted into (and saved with) the notebook.
# The fallback is the original literal, so behaviour is unchanged when the
# variable is not set.
import os

client = pymongo.MongoClient(os.environ.get('MONGODB_URI', ''))
# Database and collection that hold the documents to index.
db = client.sample_mflix
collection = db.news
print(collection)

In [None]:
# !pip install pymupdf
# !pip install pymongo

In [None]:
import os
import re
# from nltk.corpus import stopwords
import nltk

# Download the NLTK resources this notebook relies on: the 'punkt_tab'
# tokenizer data, the 'wordnet' corpus and the 'stopwords' corpus.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
# from pdfminer.high_level import extract_text

In [None]:
from collections import defaultdict
import joblib

In [None]:
# Initialize the shared NLP components used by the functions below.
# English stop-word set; preprocess_text_nltk drops every token found in it.
stop_words = set(stopwords.words('english'))
# Lemmatizer applied by preprocess_text_nltk to every token it keeps.
lemmatizer = WordNetLemmatizer()
# Module-level TF-IDF vectorizer capped at 5000 features. It is fitted inside
# extract_keywords_from_folder and later handed to extract_keywords_from_query.
vectorizer = TfidfVectorizer(max_features=5000)

# Function to preprocess text using NLTK
def preprocess_text_nltk(text):
    """Return `text` lowercased, tokenized, filtered and lemmatized.

    Tokens that are not purely alphanumeric, or that appear in the
    module-level `stop_words` set, are discarded. Each remaining token is
    passed through the module-level `lemmatizer`, and the results are joined
    with single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation / mixed tokens first, then stop words
        if not token.isalnum():
            continue
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Function to extract keywords with TF-IDF and rank them
def extract_keywords_from_folder(documents_db, num_keywords=10, verbose=True):
    """Build a keyword -> documents inverted index from TF-IDF scores.

    Despite its name, this function does not read a folder: it consumes an
    iterable of records (for example the result of ``collection.find()``).

    Parameters:
        documents_db (iterable): Records that provide 'plain_text' (the raw
            text to index) and '_id' (the document identifier).
        num_keywords (int): Maximum number of top-scoring terms kept per
            document. Fewer are kept when the document has fewer terms with
            a non-zero TF-IDF score.
        verbose (bool): When True (the default) every keyword is printed
            together with its document IDs and scores.

    Returns:
        defaultdict(list): Maps each keyword to a list of
            {'doc_id': <_id>, 'score': <tf-idf score as float>} entries.

    Side effects:
        Fits the module-level `vectorizer` on the preprocessed texts.
    """
    documents = []
    object_ids = []
    inverted_index = defaultdict(list)

    # Preprocess every record and remember which _id each text belongs to
    for doc in documents_db:
        documents.append(preprocess_text_nltk(doc['plain_text']))
        object_ids.append(doc['_id'])

    # Apply TF-IDF to the whole collection of documents
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_array = vectorizer.get_feature_names_out()

    # Keep the best-scoring terms of each document
    for i, doc_id in enumerate(object_ids):
        tfidf_scores = tfidf_matrix[i].toarray()[0]
        top_indices = tfidf_scores.argsort()[::-1][:num_keywords]
        for index in top_indices:
            score = tfidf_scores[index]
            # Indices are in descending score order and TF-IDF scores are
            # non-negative, so the first zero means the rest are zero too.
            # Without this check, terms that do not occur in the document
            # would be indexed with a score of 0.
            if score <= 0:
                break
            # Store a plain Python float rather than a NumPy scalar
            inverted_index[feature_array[index]].append(
                {'doc_id': doc_id, 'score': float(score)}
            )

    if verbose:
        for keyword, doc_list in inverted_index.items():
            print(f"\nKeyword: {keyword}")
            for entry in doc_list:
                print(f"Document ID: {entry['doc_id']}, Score: {entry['score']:.4f}")

    return inverted_index









In [None]:
# Fetch all documents in the 'news' collection
documents_db = collection.find()

# Build the keyword -> documents inverted index, keeping up to 10 top TF-IDF
# terms per document. The call also fits the shared `vectorizer`.

inverted_idx_unsorted = extract_keywords_from_folder(documents_db, num_keywords=10)


In [None]:
# Persist the vectorizer (fitted while building the index) to 'vectorizer.joblib'.
joblib.dump(vectorizer, 'vectorizer.joblib')

In [None]:
import json

In [None]:
def sort_inverted_index_by_score(inverted_index, file_path='inverted_index_output_sorted.json'):
    """
    Sort the inverted index based on the document scores for each keyword.

    Document IDs are converted to strings and scores to plain floats so the
    result is JSON-serializable. The input index is not modified.

    Parameters:
        inverted_index (dict): The inverted index containing keywords, document IDs, and scores.
        file_path (str | None): Where the sorted index is written as UTF-8 JSON.
            Defaults to 'inverted_index_output_sorted.json'; pass None to skip
            writing a file.

    Returns:
        dict: A new inverted index with sorted document entries for each keyword.
    """
    sorted_index = {}
    for keyword, doc_list in inverted_index.items():
        # Copy each entry with a string ID and a built-in float score
        entries = [
            {'doc_id': str(entry['doc_id']), 'score': float(entry['score'])}
            for entry in doc_list
        ]
        # Highest score first; the sort is stable, so ties keep their input order
        entries.sort(key=lambda entry: entry['score'], reverse=True)
        sorted_index[keyword] = entries

    if file_path is not None:
        with open(file_path, 'w', encoding='utf-8') as json_file:
            json.dump(sorted_index, json_file, indent=4, ensure_ascii=False)

    return sorted_index


In [None]:
# Stringify document IDs, sort each keyword's entries by descending score and
# write the result to 'inverted_index_output_sorted.json'.
sorted_index = sort_inverted_index_by_score(inverted_idx_unsorted)

Query processing and searching the inverted index

In [None]:
def preprocess_query(query):
    """Preprocess the query to match the terms used in the inverted index.

    The index vocabulary is built from text that went through
    `preprocess_text_nltk` (stop words removed, tokens lemmatized). The query
    is therefore run through the same function; otherwise inflected query
    words would miss the lemmatized index terms.

    Parameters:
        query (str): The raw user query.

    Returns:
        list[str]: The normalized query tokens.
    """
    # Lowercase and replace runs of special characters with a single space
    cleaned = re.sub(r'\W+', ' ', query.lower())
    # Reuse the document pipeline so query and index share one normalization
    return preprocess_text_nltk(cleaned).split()

def extract_keywords_from_query(query, vectorizer, inverted_index, num_results=10):
    """
    Rank documents for a free-text query using the inverted index.

    The query is tokenized with `preprocess_query`, re-joined and transformed
    by `vectorizer`. Every vocabulary term with a positive query weight that
    is present in `inverted_index` adds ``query weight * document score`` to
    each document listed under that term.

    Parameters:
        query (str): The user query.
        vectorizer (TfidfVectorizer): The vectorizer used to build the inverted index.
        inverted_index (dict): Maps keywords to lists of entries holding
            'doc_id' and 'score'.
        num_results (int): Number of top results to return.
    Returns:
        list: ``(doc_id, relevance)`` tuples ordered by descending relevance,
        with `doc_id` converted to a string.
    """
    query_text = ' '.join(preprocess_query(query))

    # Dense weight per vocabulary term, aligned with the feature names
    weights = vectorizer.transform([query_text]).toarray()[0]
    terms = vectorizer.get_feature_names_out()

    doc_scores = {}
    for term, weight in zip(terms, weights):
        # Only terms that actually occur in the query contribute
        if not weight > 0:
            continue
        if term not in inverted_index:
            continue
        for posting in inverted_index[term]:
            key = str(posting['doc_id'])
            # Accumulate the combined query/document score per document
            doc_scores[key] = doc_scores.get(key, 0.0) + weight * posting['score']

    # Most relevant first, truncated to the requested number of results
    ranked = list(doc_scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:num_results]


In [None]:
# Example query run against the sorted index, keeping the 5 best matches.
query = "artificial intelligence and deep learning"
results = extract_keywords_from_query(query, vectorizer, sorted_index, num_results=5)

# Display each matching document ID with its relevance score (4 decimal places)
print("Query Results:")
for doc_id, score in results:
    print(f"Document ID: {doc_id}, Relevance Score: {score:.4f}")

In [None]:
# Print the vectorizer's repr.
print(vectorizer)