In [2]:
import os
import pickle
from dotenv import load_dotenv

from llama_index.core import Document

# --- Configuration ---
# ✅ Step 1: Load environment variables from .env file
# The 'main.env' file is expected to define:
#   PINECONE_API_KEY="YOUR_PINECONE_API_KEY"
#   PINECONE_ENV="YOUR_PINECONE_ENVIRONMENT"  # e.g., "gcp-starter"
#   PINECONE_INDEX="YOUR_PINECONE_INDEX_NAME"
#   GOOGLE_API_KEY="YOUR_GOOGLE_API_KEY"

load_dotenv("main.env")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")
PINECONE_INDEX = os.getenv("PINECONE_INDEX")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Validate that every required variable is present and non-empty, and report
# exactly which ones are not so the user does not have to guess.
_required_settings = {
    "PINECONE_API_KEY": PINECONE_API_KEY,
    "PINECONE_ENV": PINECONE_ENV,
    "PINECONE_INDEX": PINECONE_INDEX,
    "GOOGLE_API_KEY": GOOGLE_API_KEY,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise ValueError(
        "One or more environment variables (PINECONE_API_KEY, PINECONE_ENV, "
        "PINECONE_INDEX, GOOGLE_API_KEY) are not set. "
        "Please check your 'main.env' file. "
        f"Missing or empty: {', '.join(_missing_settings)}."
    )
print("Environment variables loaded and validated.")

# --- Data Loading ---
# ✅ Step 2: Load documents from pickle file
# This assumes 'scraped_data.pkl' contains a list of dictionaries,
# where each dictionary has 'content', 'url', and 'title' keys.
#
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load 'scraped_data.pkl' if it was produced by a trusted source.
try:
    with open("scraped_data.pkl", "rb") as f:
        raw_data = pickle.load(f)
    print(f"Loaded {len(raw_data)} raw data entries from 'scraped_data.pkl'.")
except FileNotFoundError as exc:
    # Re-raise with a friendlier message, keeping the original as the cause.
    raise FileNotFoundError(
        "scraped_data.pkl not found. Please ensure the scraped data "
        "is in the correct path."
    ) from exc
except Exception as exc:
    # Any other failure (corrupt pickle, permission error, ...) is surfaced as
    # an IOError with the underlying exception chained for debugging.
    raise IOError(f"Error loading scraped_data.pkl: {exc}") from exc

# Wrap every scraped entry that has non-empty 'content' in a LlamaIndex
# Document. 'url' and 'title' travel along as metadata and fall back to "N/A"
# when the key is absent; entries without content are skipped.
documents = [
    Document(
        text=entry["content"],
        metadata={
            "url": entry.get("url", "N/A"),
            "title": entry.get("title", "N/A"),
        },
    )
    for entry in raw_data
    if entry.get("content")
]
print(f"Converted {len(documents)} entries into LlamaIndex Documents.")


Environment variables loaded and validated.
Loaded 472 raw data entries from 'scraped_data.pkl'.
Converted 472 entries into LlamaIndex Documents.


In [3]:
import pinecone
from pinecone import Pinecone as PineconeClient # Import Pinecone client for explicit initialization

from llama_index.core import StorageContext
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Assuming PINECONE_API_KEY, PINECONE_ENV, PINECONE_INDEX are already loaded from previous step

# --- Pinecone Setup ---
# ✅ Step 3: Connect to Pinecone
# Creates an explicit client instance and opens a handle to the index named by
# PINECONE_INDEX. The index is expected to exist already (created manually);
# nothing here creates it.
try:
    pc = PineconeClient(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    pc_index = pc.Index(PINECONE_INDEX)
    print(f"Successfully connected to existing Pinecone index: {PINECONE_INDEX}")

except Exception as exc:
    # Chain the original error so the underlying SDK failure stays visible in
    # the traceback as the direct cause.
    raise ConnectionError(f"Failed to connect to Pinecone: {exc}") from exc

# --- Embedding Model and Vector Store ---
# ✅ Step 4: Setup vector store and embedding model
# NOTE(review): the Pinecone index dimension must match the embedding size of
# this model — confirm against the index configuration.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

# The vector store wraps the Pinecone index handle; the storage context is the
# object later handed to the index builder.
vector_store = PineconeVectorStore(pinecone_index=pc_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

print(f"Using embedding model: {embed_model.model_name}")
print("Vector store and storage context configured.")


  from .autonotebook import tqdm as notebook_tqdm


Successfully connected to existing Pinecone index: boston-chatbot
Using embedding model: sentence-transformers/all-MiniLM-L6-v2
Vector store and storage context configured.


In [5]:
from llama_index.core import VectorStoreIndex

# Assuming 'documents', 'storage_context', and 'embed_model' are available from previous steps

# --- Index Creation/Loading ---
# ✅ Step 5: Build or load the index
#
# REBUILD_INDEX selects how `index` is obtained:
#
#   True  -> Option A: embed `documents` and upsert them into Pinecone.
#            Use this the FIRST time you index your 'scraped_data.pkl'.
#            This can take a significant amount of time depending on the
#            size of your data.
#            NOTE(review): running this again probably upserts a second copy
#            of every chunk into the same Pinecone index — confirm, and switch
#            to False after the first successful run.
#
#   False -> Option B: attach to the vectors already stored in Pinecone from a
#            previous run. Much faster because nothing is re-embedded.
REBUILD_INDEX = True

if REBUILD_INDEX:
    print("Building index from documents and storing in Pinecone...")
    index = VectorStoreIndex.from_documents(
        documents,  # 'documents' from Step 2
        storage_context=storage_context,  # 'storage_context' from Step 4
        embed_model=embed_model,  # 'embed_model' from Step 4
        show_progress=True,  # Show progress during embedding and upserting
    )
    print("Index built and stored in Pinecone.")
else:
    print("Loading existing index from Pinecone...")
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,  # 'vector_store' from Step 4
        embed_model=embed_model,  # 'embed_model' from Step 4
    )
    print("Index loaded from Pinecone.")



Building index from documents and storing in Pinecone...


Parsing nodes: 100%|█████████████████████████| 472/472 [00:00<00:00, 645.41it/s]
Generating embeddings: 100%|██████████████████| 589/589 [00:20<00:00, 28.43it/s]
Upserted vectors: 100%|███████████████████████| 589/589 [00:15<00:00, 37.77it/s]

Index built and stored in Pinecone.





In [12]:
# Corrected import for GoogleGenAI
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.core.settings import Settings
from llama_index.core import VectorStoreIndex # Import VectorStoreIndex if not already imported in this cell

# Assuming 'index', 'GOOGLE_API_KEY', and 'embed_model' are available from previous steps

# --- LLM Configuration ---
# ✅ Step 6: Set up Google Generative AI LLM
llm = GoogleGenAI(
    # 'gemini-pro' was not found for this API key, so a model that was listed
    # as available is used instead.
    model="models/gemini-1.5-pro-latest",
    api_key=GOOGLE_API_KEY,
    temperature=0.3,  # Adjust for creativity vs. factualness
    # Limit response length. The GoogleGenAI constructor names this cap
    # `max_tokens`; an unrecognised keyword such as `max_output_tokens` would
    # not apply the limit.
    # NOTE(review): confirm the parameter name against the installed
    # llama-index-llms-google-genai version.
    max_tokens=512,
)
print(f"Using LLM: {llm.model}")

# ✅ Step 7: Apply LLM and Embedding Model to LlamaIndex global settings
Settings.llm = llm
Settings.embed_model = embed_model  # Ensure embedding model is also set globally
print("LlamaIndex settings configured (LLM and Embedding Model).\n")

# --- Query Engine ---
# ✅ Step 8: Create the query engine
# Built from `index` using the "tree_summarize" response mode. Other options
# (for example a retriever mode) can be passed here as extra keyword arguments
# if needed.
query_engine = index.as_query_engine(response_mode="tree_summarize")
print("Query engine created.")

# --- Interactive Chatbot ---
# ✅ Step 9: Start the interactive chatbot
# Reads questions from the user until 'exit' or 'quit' is entered, prints the
# engine's answer, then lists the source chunks attached to the response.
print("\n💬 Website Chatbot is ready! (type 'exit' or 'quit' to end)")
print("---")

while True:
    try:
        query = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the session cleanly
        # instead of surfacing a traceback.
        print("\n👋 Goodbye!")
        break

    if query.lower() in {"exit", "quit"}:
        print("👋 Goodbye!")
        break
    if not query:
        # Nothing to ask; do not spend an LLM call on blank input.
        continue

    try:
        response = query_engine.query(query)
        print(f"Bot: {response}\n")

        # --- Print Source Chunks ---
        if response.source_nodes:
            print("--- Retrieved Source Chunks ---")
            for i, node in enumerate(response.source_nodes, start=1):
                # A node may carry no similarity score; formatting None with
                # ':.2f' would raise, so fall back to a placeholder.
                score = "N/A" if node.score is None else f"{node.score:.2f}"
                chunk_text = node.text
                # Only mark the preview as truncated when it really is.
                preview = chunk_text if len(chunk_text) <= 200 else f"{chunk_text[:200]}..."
                print(f"Chunk {i} (Score: {score}):")
                print(f"URL: {node.metadata.get('url', 'N/A')}")
                print(f"Title: {node.metadata.get('title', 'N/A')}")
                print(f"Text: {preview}\n")

    except Exception as e:
        # Best-effort: report the failure and keep the chat session alive.
        print(f"⚠️ Error during query: {e}\n")


Using LLM: models/gemini-1.5-pro-latest
LlamaIndex settings configured (LLM and Embedding Model).

Query engine created.

💬 Website Chatbot is ready! (type 'exit' or 'quit' to end)
---


You:  services provided


Bot: The Office of Research provides workshops and training on research how-tos, and resources for communicating and disseminating research findings.  They offer accommodations such as ASL interpreters and CART upon request.  Researchers can also propose topics for future events.  For assistance with sharing research, various resources and offices at BU are available to help faculty broaden their reach.


--- Retrieved Source Chunks ---
Chunk 1 (Score: 0.35):
URL: https://www.bu.edu/research/events-updates-trainings/research-how-tos/
Title: Research How-to Workshops & Trainings | Office of Research
Text: For questions about accessibility or to request an accommodation (e.g., ASL interpreters, Communication Access Realtime Translation CART), please email research@bu.edu.   View past events, or learn mo...

Chunk 2 (Score: 0.31):
URL: https://www.bu.edu/research/communication-dissemination/
Title: Communication & Dissemination | Office of Research
Text: Ready to share your research with 

You:  what is the summary of this websit?


Bot: Boston University's financial aid website emphasizes its role as the main source of information for students seeking financial assistance.  The Human Resources site highlights employee benefits, flexible work options, and access to newsletters.


--- Retrieved Source Chunks ---
Chunk 1 (Score: 0.38):
URL: https://www.bu.edu/finaid/how-aid-works/
Title: How Aid Works | Financial Assistance
Text: We encourage you to treat our website as your primary source of information. If you come across any stumbling blocks, we’re here with answers and guidance....

Chunk 2 (Score: 0.35):
URL: https://www.bu.edu/hr/
Title: Human Resources
Text: loading slideshow... BU Total Rewards & myFiTage Flexible Work at Boston University BUHR Newsletter Archive...



You:  exit


👋 Goodbye!
