In [1]:
# IMPORTANT:
# Run the install cell, then RESTART runtime before running the rest.
# Install necessary libraries
!pip install -U langchain langchain-community langchain-core \
langchain-huggingface langchain-text-splitters \
transformers accelerate bitsandbytes \
docx2txt sentence-transformers faiss-cpu



In [3]:
import os
import glob
import json
import torch
from langchain_community.document_loaders import TextLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_community.vectorstores import FAISS
from langchain_classic.memory import ConversationBufferMemory
from langchain_classic.chains import ConversationalRetrievalChain
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

print("⚙️ Initializing Model & Embeddings...")

# 1. Load Embeddings (used to vectorize documents and queries for retrieval)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 2. Load LLM (The Brain)
# Using GPT-2 for speed.
# TIP: If you have a GPU, change model_id to "TinyLlama/TinyLlama-1.1B-Chat-v1.0" for 10x smarter answers.
model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Some tokenizers (GPT-2 among them) define no pad token; fall back to EOS so
# generation has an explicit pad id instead of guessing one on every call.
pad_token_id = (
    tokenizer.pad_token_id
    if tokenizer.pad_token_id is not None
    else tokenizer.eos_token_id
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=150,
    # Generation settings are passed directly to the pipeline. `model_kwargs`
    # is meant for model loading and does not affect generation when an
    # already-instantiated model is supplied, so the temperature previously
    # had no effect. Temperature also only applies when sampling is enabled.
    do_sample=True,
    temperature=0.7,
    pad_token_id=pad_token_id,
    # Return only the newly generated text rather than prompt + completion,
    # so the answer is not prefixed with the whole retrieval prompt.
    return_full_text=False,
)

llm = HuggingFacePipeline(pipeline=pipe)
print("✅ Model Loaded Successfully!")

⚙️ Initializing Model & Embeddings...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cuda:0


✅ Model Loaded Successfully!


In [4]:
def _is_blank(value):
    """Return True for None or an empty string/bytes/list/tuple/dict/set."""
    if value is None:
        return True
    return isinstance(value, (str, bytes, list, tuple, dict, set)) and len(value) == 0


def load_all_data():
    """Collect every available knowledge source from the working directory.

    Three kinds of input are read, each only if present:

    * ``*.json`` files: each file must contain a list of objects. Every object
      becomes one ``Document`` whose text is built from its non-blank fields
      and whose metadata is the object itself plus ``source`` and ``category``.
    * ``compiled_data.txt``: split on the ``"\\n--- "`` separator.
    * ``160 question of chatbot.docx``: split into overlapping 500-character
      chunks.

    A JSON file that cannot be read or parsed is reported and skipped as a
    whole; it never aborts the scan and never contributes partial results.

    Returns:
        list: the collected ``Document`` objects (empty if nothing was found).
    """
    documents = []
    print("📂 Scanning for files...")

    # --- A. JSON Files (Structured Data) ---
    # Sorted so the document order does not depend on the filesystem.
    for filepath in sorted(glob.glob("*.json")):
        try:
            filename = os.path.basename(filepath)
            category = filename.replace("cleaned_", "").replace(".json", "")

            # Explicit encoding: the platform default is not always UTF-8.
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)

            if not isinstance(data, list):
                raise ValueError(
                    f"expected a JSON list of objects, got {type(data).__name__}"
                )

            file_documents = []
            skipped = 0
            for item in data:
                if not isinstance(item, dict):
                    # Only objects can be turned into "Key: value" text.
                    skipped += 1
                    continue

                # Construct readable text from JSON keys. Only truly blank
                # values are dropped; 0 and False are real data and are kept.
                content_parts = [f"Component Category: {category}"]
                for key, value in item.items():
                    if not _is_blank(value):
                        content_parts.append(f"{key.replace('_', ' ').title()}: {value}")

                page_content = ". ".join(content_parts)

                # Add metadata
                metadata = item.copy()
                metadata["source"] = filename
                metadata["category"] = category

                file_documents.append(Document(page_content=page_content, metadata=metadata))

            # Commit the file's documents only once the whole file succeeded.
            documents.extend(file_documents)
            print(f"   - Loaded {len(file_documents)} items from {filename}")
            if skipped:
                print(f"   ! Skipped {skipped} non-object entries in {filename}")
        except Exception as e:
            # Best effort: one bad file must not prevent loading the others.
            print(f"   ! Error loading {filepath}: {e}")

    # --- B. Text Files (Legacy) ---
    if os.path.exists("compiled_data.txt"):
        loader = TextLoader("compiled_data.txt", encoding='utf-8')
        # Split by "--- " to keep products intact
        splitter = CharacterTextSplitter(separator="\n--- ", chunk_size=1000, chunk_overlap=0)
        docs = loader.load()
        documents.extend(splitter.split_documents(docs))
        print("   - Loaded compiled_data.txt")

    # --- C. Word Docs (Q&A) ---
    if os.path.exists("160 question of chatbot.docx"):
        loader = Docx2txtLoader("160 question of chatbot.docx")
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        documents.extend(splitter.split_documents(loader.load()))
        print("   - Loaded 160 question of chatbot.docx")

    return documents

In [5]:
print("🧠 Building Vector Database...")

# 1. Run the loader function defined in the previous cell
raw_documents = load_all_data()

if not raw_documents:
    # Fail fast: without documents there is nothing to index. Continuing would
    # leave `vectorstore` / `retriever` undefined (or stale from an earlier
    # run), and the chain cell below would then fail with an unrelated error
    # or silently answer from an outdated index.
    raise FileNotFoundError("⚠️ WARNING: No data found! Please upload files.")

# 2. Create Vector Store
vectorstore = FAISS.from_documents(raw_documents, embeddings)

# 3. Create Retriever (MMR mode for diversity):
#    20 candidates are fetched and 4 are returned.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={'k': 4, 'fetch_k': 20}
)
print(f"✅ Database Ready! Indexed {len(raw_documents)} chunks.")



🧠 Building Vector Database...
📂 Scanning for files...
   - Loaded compiled_data.txt
✅ Database Ready! Indexed 14441 chunks.


In [6]:
# Conversation memory: earlier turns are exposed to the chain under the
# "chat_history" key, and the chain's "answer" output is what gets stored
# as the bot's side of each turn.
memory = ConversationBufferMemory(
    return_messages=True,
    output_key="answer",
    memory_key="chat_history",
)

# Prompt for the PC Builder persona. The three placeholders are filled with
# the retrieved context, the chat history and the user's question.
custom_template = """You are a helpful PC Building Assistant.
Use the following pieces of context to answer the question at the end.
If the answer isn't in the context, say "I don't have that info."

Context:
{context}

Chat History:
{chat_history}

Question: {question}

Helpful Answer:"""

prompt = PromptTemplate(
    template=custom_template,
    input_variables=["context", "chat_history", "question"],
)

# Wire everything together: the LLM, the retriever, the memory, and the custom
# prompt for the document-combining step. Source documents are returned
# alongside the answer.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    combine_docs_chain_kwargs=dict(prompt=prompt),
    memory=memory,
)
print("✅ Bot is configured and ready.")

✅ Bot is configured and ready.


  memory = ConversationBufferMemory(


In [None]:
print("\n💬 SYSTEM READY! (Type 'exit' to quit)")
print("---------------------------------------")

# Interactive loop: read a question, run it through the chain, print the answer.
while True:
    try:
        # Strip so that inputs such as "exit " or " q" are still recognised.
        query = input("\nUser: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or an interrupted cell: leave the loop cleanly
        # instead of surfacing a traceback.
        print("\nBot: Happy building! Goodbye.")
        break

    if not query:
        # Nothing was typed; do not spend an LLM call on an empty question.
        continue

    if query.lower() in ["exit", "quit", "q"]:
        print("Bot: Happy building! Goodbye.")
        break

    try:
        # Ask the chain
        result = qa_chain.invoke({"question": query})

        # Print answer
        print(f"Bot: {result['answer']}")

        # Optional: Un-comment below to see WHICH file it read
        # source = result['source_documents'][0].metadata.get('source', 'Unknown')
        # print(f"[Debug: Info retrieved from {source}]")

    except Exception as e:
        # Keep the session alive on a failed turn; report it and prompt again.
        print(f"Error: {e}")


💬 SYSTEM READY! (Type 'exit' to quit)
---------------------------------------
