In [None]:
# Import necessary libraries
import os
import json
from langchain.document_loaders import PyMuPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferWindowMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv

In [None]:
# Load environment variables from .env file.
# This has to run before any os.getenv() lookup in later cells
# (the Groq API key is read from the environment that way).
load_dotenv()

## Task 1: Source Discovery

In [None]:
# 1) Find all relevant sources about yourself

# Personal documents to index (paths are relative to the working directory).
personal_docs = [
    "../data/linkedin_profile.pdf",
    "../data/personal_bio.txt",
    "../data/AIT_SIS_personal_info.txt",
]

# Load every document that has a supported extension. Loading is best-effort:
# a file that cannot be read is reported and skipped so the remaining sources
# are still indexed.
documents = []
for doc_path in personal_docs:
    # Match the extension case-insensitively so "CV.PDF" is handled like "cv.pdf".
    lowered_path = doc_path.lower()
    try:
        if lowered_path.endswith(".pdf"):
            loader = PyMuPDFLoader(doc_path)
        elif lowered_path.endswith(".txt"):
            # Pin the encoding: without it the text file is decoded with the
            # platform default, which is not UTF-8 everywhere (e.g. Windows).
            loader = TextLoader(doc_path, encoding="utf-8")
        else:
            print(f"Unsupported file type: {doc_path}")
            continue
        docs = loader.load()
        # Replace the loader's source metadata with the bare file name so
        # retrieved chunks can be traced back to a document without exposing
        # the directory layout.
        for doc in docs:
            doc.metadata["source"] = os.path.basename(doc_path)
        documents.extend(docs)
        print(f"Loaded: {doc_path}")
    except Exception as e:
        print(f"Error loading {doc_path}: {e}")

print(f"Total documents loaded: {len(documents)}")

# Split documents into overlapping chunks; the overlap keeps sentences that
# straddle a chunk boundary retrievable from either side.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
doc_chunks = text_splitter.split_documents(documents)
print(f"Created {len(doc_chunks)} document chunks")

In [None]:
# An index cannot be built from zero chunks. Fail here with an actionable
# message instead of letting index construction die with a low-level error
# (this happens when every source document failed to load).
if not doc_chunks:
    raise ValueError(
        "No document chunks to index; check that the source documents loaded correctly."
    )

# Set up embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create vector store: embed every chunk and index the vectors with FAISS
vectorstore = FAISS.from_documents(doc_chunks, embedding_model)

# Save vectorstore locally (written to the "vectorstore" directory)
vectorstore.save_local("vectorstore")
print("Vector store saved successfully")

In [None]:
# Prompt used for the answer-generation step of the RAG chain.
# Placeholders:
#   {context}      - text of the retrieved document chunks
#   {chat_history} - earlier turns of the conversation
#   {question}     - the user's current question
# The sentence break after the name matters: without it the name runs straight
# into the instruction ("Mir Ali Use the following ...").
template = """
You are a helpful assistant that answers questions about Mir Ali. Use the following context and chat history to provide a gentle and informative response. If the context doesn't provide the answer, politely say you don't have enough information.

Context: {context}

Chat History: {chat_history}

Question: {question}

Answer:
"""
PROMPT = PromptTemplate(
    input_variables=["context", "chat_history", "question"],
    template=template
)

In [None]:
# Read the Groq API key from the environment.
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    # Raise rather than call exit(): exit() is the interactive-shell helper
    # installed by the site module. Called without a code it ends a script
    # with a *success* status, and in a notebook it takes the kernel down.
    # An exception stops execution here and reports the failure as an error.
    raise RuntimeError("GROQ_API_KEY environment variable not set.")

# Generator model: a Groq-hosted Llama model accessed through the chat interface.
groq_llm = ChatGroq(
    api_key=groq_api_key,
    model_name="llama-3.3-70b-versatile"
)

In [None]:
# Conversation memory: a sliding window limited to the k=5 most recent exchanges.
memory = ConversationBufferWindowMemory(
    memory_key="chat_history",  # variable name the history is exposed under
    output_key="answer",        # chain output stored as the assistant's turn
    return_messages=True,       # keep history as message objects, not one string
    k=5,
)

In [None]:
# Build the conversational RAG pipeline: the FAISS retriever supplies the top
# three chunks per query, the Groq LLM writes the answer from them using the
# custom prompt, and the shared memory carries the chat history between calls.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=groq_llm,
    memory=memory,
    retriever=vectorstore.as_retriever(search_kwargs=dict(k=3)),
    combine_docs_chain_kwargs=dict(prompt=PROMPT),
    return_source_documents=True,
)

In [None]:
# List retriever and generator models
print("Retriever Model: FAISS with HuggingFace embeddings (sentence-transformers/all-MiniLM-L6-v2)")
print("Generator Model: Groq's llama-3.3-70b-versatile")

# 5) Analyze potential issues
# The generator is named exactly as in the model listing printed above, so the
# printed report refers to one model throughout.
print("""
Analysis of Issues:
- Retriever: FAISS may retrieve irrelevant chunks if embeddings fail to capture semantic meaning accurately. This could happen with ambiguous questions or insufficient document detail.
- Generator: Groq's llama-3.3-70b-versatile might generate plausible but incorrect answers (hallucination) if retrieved context is incomplete.
- Mitigation: Ensure documents are comprehensive, adjust chunk size/overlap, or refine the prompt to prioritize context adherence.
""")

# Task 3: Chatbot Development

# Define the 10 required questions
# They are put to the chatbot in this order, and their 1-based position
# becomes the "question_number" field of the saved results.
questions = [
    # Background: age, education and work experience
    "How old are you?",
    "What is your highest level of education?",
    "What major or field of study did you pursue during your education?",
    "How many years of work experience do you have?",
    "What type of work or industry have you been involved in?",
    "Can you describe your current role or job responsibilities?",
    # Views on technology, society and culture
    "What are your core beliefs regarding the role of technology in shaping society?",
    "How do you think cultural values should influence technological advancements?",
    # Master's studies: challenges and goals
    "As a master's student, what is the most challenging aspect of your studies so far?",
    "What specific research interests or academic goals do you hope to achieve during your time as a master's student?"
]

# Generate answers and store in JSON format
# NOTE: the chain was built with a conversation memory, so earlier
# question/answer turns are available to the chain when it handles later ones.
results = []
for i, question in enumerate(questions, start=1):
    # invoke() is the supported way to run a chain; calling the chain object
    # directly is deprecated in current LangChain releases.
    response = qa_chain.invoke({"question": question})
    answer = response["answer"]
    results.append({
        "question_number": i,
        "question": question,
        "answer": answer
    })
    # Print the question number, question, and answer
    print(f"Question {i}: {question}")
    print(f"Answer: {answer}\n")

# Save the results to a JSON file.
# Write UTF-8 explicitly and keep non-ASCII characters readable instead of
# escaping them as \uXXXX sequences; the file stays valid JSON either way.
with open("answers.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
print("Answers saved to 'answers.json'")

## Task 2: Analysis and Problem Solving

### 1) List of Retriever and Generator Models
- **Retriever Model:** FAISS with HuggingFace embeddings (sentence-transformers/all-MiniLM-L6-v2)
- **Generator Model:** Groq's llama-3.3-70b-versatile

### 2) Analysis of Issues

**Retriever Model (FAISS with all-MiniLM-L6-v2 embeddings):**
- Embedding Quality: The all-MiniLM-L6-v2 model is efficient but has limitations in capturing nuanced semantics compared to larger models
- Chunk Size Impact: Our 1000-character chunks with 200-character overlap may split contextual information across chunks
- Document Specificity: Personal documents may contain technical jargon or abbreviated information that embedding models might not accurately represent
- Retrieval K-value: Using k=3 might miss relevant information if semantically similar but irrelevant chunks score higher

**Generator Model (Groq's llama-3.3-70b-versatile):**
- Hallucination Risk: The model may generate plausible-sounding but incorrect information when context is incomplete
- Context Window Limitations: If the combined retrieved chunks exceed the model's context window, information may be truncated
- Prompt Sensitivity: The generator's responses can vary significantly based on prompt wording and structure
- Personality Alignment: The model may default to generic responses for personal questions if retrieved context lacks specific details

**Mitigation Strategies:**
- Experiment with different chunk sizes (500-1500 characters) and overlaps (100-300 characters) to find optimal settings
- Include more diverse personal documents to ensure comprehensive coverage of potential questions
- Consider tuning the retriever's parameters, such as applying a similarity-score threshold or increasing k to retrieve more context
- Implement answer validation by cross-referencing responses with known facts about yourself
- Enhance the prompt to explicitly instruct the model to acknowledge uncertainty rather than generating potentially incorrect details