In [None]:
# Install necessary libraries
!pip install -U langchain langchain-community faiss-cpu sentence-transformers langchain-groq
!pip install requests beautifulsoup4

# Import libraries
from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup
from google.colab import userdata
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_groq import ChatGroq

# Step 1: Download and parse the Python tutorial content
url = "https://docs.python.org/3/tutorial/index.html"
response = requests.get(url, timeout=30)  # without a timeout a stalled request blocks forever
response.raise_for_status()               # fail loudly instead of parsing an error page
soup = BeautifulSoup(response.text, 'html.parser')

# Collect tutorial section links.
# Every href is resolved against the index URL, so relative links ("appetite.html"),
# parent-relative links ("../bugs.html") and absolute links are all handled uniformly.
# Only pages that live under the tutorial directory are kept; "#fragment" variants of
# the same page are collapsed and duplicates are dropped (first occurrence wins).
# NOTE: `tutorial_links` therefore holds absolute page URLs, not raw href values.
tutorial_base = urljoin(url, ".")  # directory of the index page, with trailing "/"
tutorial_links = []
for a in soup.select("a[href]"):
    candidate_url, _fragment = urldefrag(urljoin(url, a['href']))
    if (
        candidate_url.startswith(tutorial_base)
        and candidate_url.endswith(".html")
        and candidate_url != url               # the index itself is only a table of contents
        and candidate_url not in tutorial_links
    ):
        tutorial_links.append(candidate_url)

# Download each section of the tutorial for processing
tutorial_texts = []
for page_url in tutorial_links[:10]:  # Limiting to the first 10 pages for testing
    page_response = requests.get(page_url, timeout=30)
    if not page_response.ok:
        # Skip pages that fail to load rather than indexing the error page's text
        print(f"Skipping {page_url}: HTTP {page_response.status_code}")
        continue
    page_soup = BeautifulSoup(page_response.text, 'html.parser')
    tutorial_texts.append(page_soup.get_text())

if not tutorial_texts:
    # Stop here with a clear message; an empty corpus would otherwise only fail
    # later, when the vector store is built from zero documents.
    raise RuntimeError(f"No tutorial pages could be downloaded from {url}")

# Convert the collected texts into Document objects
documents = [Document(page_content=text) for text in tutorial_texts]

# Step 2: Break every page into overlapping chunks for retrieval
# (chunk_size and chunk_overlap are passed to the splitter unchanged).
splitter_options = {"chunk_size": 800, "chunk_overlap": 150}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
split_docs = text_splitter.split_documents(documents)

# Step 3: Embed the chunks and index them in a FAISS vector store
embedding_model = "sentence-transformers/all-MiniLM-l6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

# Build the index from the chunks, then expose it as a retriever that
# returns the 2 best-matching chunks per query.
vector_db = FAISS.from_documents(documents=split_docs, embedding=embeddings)
retriever = vector_db.as_retriever(search_kwargs=dict(k=2))

# Step 4: Create the Groq-hosted chat model used to generate answers
# NOTE(review): hosted model IDs get retired over time - confirm that
# "mixtral-8x7b-32768" is still offered by Groq before relying on this cell.
llm = ChatGroq(
    model_name="mixtral-8x7b-32768",
    groq_api_key=userdata.get('groq'),  # read from the Colab secret named 'groq'
    temperature=0,
)

# Step 5: Conversation memory
# The history is stored under 'chat_history' (the placeholder name used in the
# prompt template) and only the chain's 'answer' output is recorded.
memory = ConversationBufferMemory(
    output_key='answer',
    memory_key='chat_history',
    return_messages=True,
)

# Step 6: Prompt used to answer a question from the retrieved context
# The three placeholders below must match the `input_variables` list.
template = """You are a nice chatbot having a conversation with a human. Answer the question based only on the following context and previous conversation. Keep your answers short and succinct.

Previous conversation:
{chat_history}

Context to answer question:
{context}

New human question: {question}
Response:"""
prompt = PromptTemplate(
    input_variables=["context", "question", "chat_history"],
    template=template,
)

# Step 7: Assemble the conversational retrieval chain (`chain_2`)
# The custom prompt is handed to the document-combining step of the chain.
combine_docs_options = {"prompt": prompt}
chain_2 = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    combine_docs_chain_kwargs=combine_docs_options,
    return_source_documents=False,  # only the answer is returned, no source documents
)


Collecting langchain-community
  Downloading langchain_community-0.3.7-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.3.0-py3-none-any.whl.metadata (10 kB)
Collecting langchain-groq
  Downloading langchain_groq-0.2.1-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 k

  embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True, output_key='answer')


In [None]:

# Start the conversation loop
# Reads questions from stdin until the user types 'end' (any letter case,
# surrounding whitespace ignored), stdin is closed, or the cell is interrupted.
print("Python Tutorial Chatbot. Type 'end' to end the conversation.\n")

while True:
    try:
        # Strip so that inputs such as "end " or " end" still match the exit command
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted cell: leave the loop without a traceback
        print("\nEnding the conversation. Goodbye!")
        break

    # Check for exit condition
    if user_input.lower() == 'end':
        print("Ending the conversation. Goodbye!")
        break

    # Ignore blank lines instead of spending an API call on an empty question
    if not user_input:
        continue

    # Get the response from the conversation chain
    response = chain_2.invoke({"question": user_input})

    # Print the chatbot's response
    print("Chatbot:", response["answer"])