In [30]:
# Part 1: Dataset Creation (Inside the Notebook)
# Create a dataset of 10–15 short documents directly inside your Jupyter Notebook.

# Documents should be:
# One to three sentences long
# Written in natural language
# Focused on a single topic (e.g., company policies, student rules, product FAQs)
# Store the documents in a Python data structure (e.g., list of dictionaries).
# Assign each document a unique ID.

# %pip install python-docx
import os

# Short single-sentence variant of the dataset; IDs are assigned sequentially
# starting at 1, in the order the texts are listed.
documents1 = [
    {"id": doc_id, "text": sentence}
    for doc_id, sentence in enumerate(
        (
            "Company policies are designed to ensure a safe and productive work environment.",
            "Employees must adhere to the code of conduct at all times.",
            "Student rules include attending classes regularly and submitting assignments on time.",
        ),
        start=1,
    )
]

# Dataset used for retrieval: each entry pairs a unique integer ID with a
# three-sentence text.
# The key is the *string* "id". A bare `id` would silently use Python's builtin
# `id` function object as the dictionary key, so doc["id"] would raise KeyError.
# The text literals are kept exactly as written (including the embedded line
# breaks and indentation) so the computed embeddings do not change.
documents = [
    {
        "id": 1,
        "text": """Company policies are designed to ensure a safe and productive work environment. 
    All staff members must receive adequate training on these safety standards to protect themselves and others. 
    Management is ultimately responsible for maintaining equipment and infrastructure in good repair."""
    },
    {
        "id": 2,
        "text": """Employees must adhere to the code of conduct at all times. 
    This includes maintaining professional behavior and avoiding any conflicts of interest that could harm the company's reputation. 
    Failure to comply with these ethical standards may result in formal disciplinary action."""
    },
    {
        "id": 3,
        "text": """Student rules include attending classes regularly and submitting assignments on time.
    Persistent unexcused absences can lead to a significant reduction in a student's final grade. 
    It is essential to follow all formatting and submission guidelines to ensure work is accepted for evaluation."""
    },
]


In [31]:
# Part 2: Create an embedding representation for each document.

from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model; the same `model` object is used again
# later to encode user queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Collect the raw text of every document, preserving dataset order so that
# position i in `file_contents` lines up with position i in `embeddings`.
file_contents = [doc['text'] for doc in documents]

# Generate embeddings for all documents in one encode() call.
embeddings = model.encode(file_contents)

# Iterating `embeddings` yields one vector per document; show each vector's
# shape and its first 5 components as a sanity check.
# (The f-string is already fully formatted, so no extra .format() call is
# needed - it would raise if the rendered text ever contained braces.)
for i, embedding in enumerate(embeddings):
    print(f"Document {i+1} Vector Shape: {embedding.shape}", f"Vector Sample: {embedding[:5]}")


Document 1 Vector Shape: (384,) Vector Sample: [-0.03321188  0.03104994  0.00899956 -0.06051451  0.04233856]
Document 2 Vector Shape: (384,) Vector Sample: [-0.03446136  0.03237465 -0.04117622 -0.0477738  -0.0441071 ]
Document 3 Vector Shape: (384,) Vector Sample: [-0.02546375  0.09573723  0.02739025  0.02178335  0.06149345]


In [40]:
# Implement a basic retrieval mechanism that:
# Accepts a user query
# Computes similarity between the query and document embeddings
# Returns the most relevant document(s)
# Verify retrieval by printing:
# Total number of documents
# Sample retrieved documents for a test query


from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def retrieve_document(query, doc_texts, doc_embs, top_k=3):
    """Return the documents most similar to ``query``, best match first.

    The query is encoded with the notebook-level ``model`` and compared to
    every document embedding using cosine similarity.

    Parameters
    ----------
    query : str
        The user's question or search text.
    doc_texts : sequence of str
        Document texts; ``doc_texts[i]`` must correspond to ``doc_embs[i]``.
    doc_embs : array-like
        One embedding per document, in the same order as ``doc_texts``.
    top_k : int, default 3
        Maximum number of documents to return. If it exceeds the number of
        documents, all documents are returned.

    Returns
    -------
    list of (str, float-like)
        ``(text, similarity)`` pairs sorted by descending similarity. Empty
        when ``top_k <= 0`` or when there are no documents.
    """
    # Guard before doing any work: the slice [-0:] would select *every*
    # document, a negative top_k would select an arbitrary tail, and an empty
    # embedding matrix cannot be scored.
    if top_k <= 0 or len(doc_texts) == 0:
        return []

    # Convert query to embedding (a one-row batch).
    query_emb = model.encode([query])

    # One similarity score per document.
    similarities = cosine_similarity(query_emb, doc_embs).flatten()

    # argsort is ascending, so take the last top_k indices and reverse them.
    top_indices = similarities.argsort()[-top_k:][::-1]

    return [(doc_texts[i], similarities[i]) for i in top_indices]

# --- Verification ---
# Smoke-test the retriever with a single ad-hoc query.
user_query = "What is the main topic of the file?"  # Test query
results = retrieve_document(user_query, file_contents, embeddings)

# Report the corpus size, then each hit with its score and a text preview
# capped at the first 500 characters.
print(
    f"Total number of documents: {len(file_contents)}",
    "\n--- Sample Retrieved Document ---",
    sep="\n",
)
for doc, score in results:
    print(f"Similarity Score: {score:.4f}", f"Content: {doc[:500]}...", sep="\n")


Total number of documents: 3

--- Sample Retrieved Document ---
Similarity Score: 0.0779
Content: Student rules include attending classes regularly and submitting assignments on time.
    Persistent unexcused absences can lead to a significant reduction in a student's final grade. 
    It is essential to follow all formatting and submission guidelines to ensure work is accepted for evaluation....
Similarity Score: 0.0570
Content: Company policies are designed to ensure a safe and productive work environment. 
    All staff members must receive adequate training on these safety standards to protect themselves and others. 
    Management is ultimately responsible for maintaining equipment and infrastructure in good repair....
Similarity Score: 0.0184
Content: Employees must adhere to the code of conduct at all times. 
    This includes maintaining professional behavior and avoiding any conflicts of interest that could harm the company's reputation. 
    Failure to comply with these ethic

In [50]:
# Part 3: RAG Question Answering
# Define at least 5 user questions inside the notebook.

# The five test queries, bound in one unpacking assignment.
(
    user_question1,
    user_question2,
    user_question3,
    user_question4,
    user_question5,
) = (
    "Does this document contain the latest corporate HR policy",
    "Is this document related to a user manual ",
    "Does this document relate to the latest news",
    "Does this document relate to Student assignments",
    "Is there any PII in this documents",
)

# For each question:
# Retrieve relevant document context
# Use the retrieved context to generate an answer
def generate_answer(context_similarity):
    """Map a retrieval similarity score to a canned answer string.

    Parameters
    ----------
    context_similarity : float
        Similarity between the question and the retrieved context.

    Returns
    -------
    str
        One of four fixed messages, chosen by these bands:
        ``> 0.4`` relevant, ``> 0.2`` partially related,
        ``> 0.1`` no relevant information, otherwise not related.
    """
    if context_similarity > 0.4:
        return "I found relevant information in the document to answer your question."
    if context_similarity > 0.2:
        return "The document contains some related information, but it may not fully answer your question."
    if context_similarity > 0.1:
        return "The document does not contain relevant information to answer your question."
    # Everything else lands here: scores <= 0.1, and NaN (which fails every
    # comparison above). Making this the unconditional fallback guarantees a
    # string is always returned without reaching for names outside the
    # function.
    return "The document is not related to your question."

# Store results in a structured format (e.g., list or DataFrame) containing:
# question
# retrieved_context
# ground_truth
# generated_answer
# Run every test question through retrieval + answer generation and collect
# one record per question.
results = []
for question in [user_question1, user_question2, user_question3, user_question4, user_question5]:
    retrieved_docs = retrieve_document(question, file_contents, embeddings, top_k=3)

    # Only the best match (first entry) is used as context; fall back to a
    # placeholder text and a zero score when nothing was retrieved.
    if retrieved_docs:
        retrieved_context, retreieved_context_similarity = retrieved_docs[0]
    else:
        retrieved_context, retreieved_context_similarity = "No relevant document found.", 0
    generated_answer = generate_answer(retreieved_context_similarity)

    print(f"Retrieved Context: {retrieved_context}")
    print(f"Question: {question}")
    print(f"Retrieved Context Similarity: {retreieved_context_similarity}")
    print(f"generated_answer: {generated_answer}")
    # Plain literal separator: reusing the enclosing quote character inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    print("generated_answer: ----------------")
    results.append({
        "question": question,
        "retrieved_context": retrieved_context,
        "ground_truth": "N/A",  # Placeholder for ground truth
        "generated_answer": generated_answer
    })

print(f"Results: {results}")


Retrieved Context: Company policies are designed to ensure a safe and productive work environment. 
    All staff members must receive adequate training on these safety standards to protect themselves and others. 
    Management is ultimately responsible for maintaining equipment and infrastructure in good repair.
Question: Does this document contain the latest corporate HR policy
Retrieved Context Similarity: 0.40257376432418823
generated_answer: I found relevant information in the document to answer your question.
generated_answer: ----------------
Retrieved Context: Company policies are designed to ensure a safe and productive work environment. 
    All staff members must receive adequate training on these safety standards to protect themselves and others. 
    Management is ultimately responsible for maintaining equipment and infrastructure in good repair.
Question: Is this document related to a user manual 
Retrieved Context Similarity: 0.21046273410320282
generated_answer: The do

In [None]:

#def read_and_print_files(filenames):
#    """Reads the contents of multiple files and prints them."""
#    for filename in filenames:
#        try:
#            # Open the file in read mode ('r')
#            with open(filename, 'r', encoding='utf-8') as file:
#                content = file.read()
#                print(f"--- Content of {filename} ---")
#                print(content)
#                print("-" * 30 + "\n")
#        except UnicodeDecodeError:
#            print(f"UnicodeDecodeError: The file '{filename}' was not open.")
#        except FileNotFoundError:
#            print(f"Error: The file '{filename}' was not found.")
#        except Exception as e:
#            print(f"An error occurred while reading '{filename}': {e}")

# List of the three files to read
#file_list = ['./documents/Customer_Journey_Map.docx', './documents/Privacy_Policy.docx', './documents/Terms_ And_Conditions.docx']

# Call the function to read the files
#read_and_print_files(file_list)




