In [None]:
# Part 1: Environment Setup and Dependencies
%%capture
!pip install openai==1.55.3 httpx==0.27.2
!pip install pinecone-client==3.0.0
!pip install langchain==0.3.13
!pip install tiktoken==0.5.1
!pip install flask_restful
!pip install langchain-community
!pip install html2text flask requests flask-ngrok pyngrok flask-cors

import os
import openai
from pinecone import Pinecone, ServerlessSpec
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
import io

from flask import Flask, request, jsonify
from flask_restful import Api, Resource
from bs4 import BeautifulSoup
import requests
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer
from langchain_community.document_transformers import BeautifulSoupTransformer
from flask_ngrok import run_with_ngrok
from pyngrok import ngrok
from flask_cors import CORS

# Get API keys
# Secrets are read from the environment so they never have to be saved inside
# the notebook. The placeholder strings remain as fallbacks: replace them only
# for a throwaway local run and never commit real keys.
openai_api_key = os.environ.get("OPENAI_API_KEY", "openAI-key") #Enter the key
pinecone_api_key = os.environ.get("PINECONE_API_KEY", "pinecone-key") #Enter the key
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN", "ngrok-key") #Enter the key



In [None]:
# Part 2: Configuration and API Setup
def setup_apis(openai_api_key, pinecone_api_key):
    """
    Configure the OpenAI module and build a Pinecone client.

    Stores `openai_api_key` on the module-level `openai.api_key` attribute and
    constructs a `Pinecone` client from `pinecone_api_key`.
    Returns the Pinecone client.
    """
    # The key lives on the imported module, so the openai.* calls made
    # elsewhere in this notebook do not pass it explicitly.
    openai.api_key = openai_api_key
    return Pinecone(api_key=pinecone_api_key)

In [None]:
# Part 3: URL Data Extraction and Splitting into Chunks
def extract_webpage_content(url):
    """Load one web page with AsyncHtmlLoader and return its content as chunks."""
    # Fetch the raw HTML; the loader takes a list, so the single URL is wrapped
    raw_docs = AsyncHtmlLoader([url]).load()

    # Keep only what BeautifulSoupTransformer extracts from the "div" tags
    div_docs = BeautifulSoupTransformer().transform_documents(
        raw_docs, tags_to_extract=["div"]
    )

    # Chunk with split_documents' default size and overlap
    return split_documents(div_docs)

def split_documents(text, chunk_size=1000, chunk_overlap=200):
    """
    Chunk documents with a RecursiveCharacterTextSplitter.

    `text` is passed straight to the splitter's `split_documents`, so despite
    its name it is the collection of documents to split. `chunk_size` and
    `chunk_overlap` are forwarded to the splitter unchanged.
    Prints how many chunks were produced and returns the list of chunks.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    chunks = RecursiveCharacterTextSplitter(**splitter_options).split_documents(text)
    chunk_count = len(chunks)
    print(f"URL Document split into {chunk_count} chunks")
    return chunks

In [None]:
# Part 4: Vector Creation and Storage
def create_embeddings(text_chunks, batch_size=32):
    """
    Generate embeddings for text chunks using OpenAI's embedding model.

    Args:
        text_chunks: iterable of objects exposing a `page_content` string.
        batch_size: how many chunk texts to send per embeddings request.
            Must be >= 1. Lower it if individual chunks are very large.

    Returns:
        A list of embedding vectors, one per chunk, in the same order as
        `text_chunks`. An empty input yields an empty list without any
        API call.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Extract the text content from each Document object up front
    texts = [chunk.page_content for chunk in text_chunks]

    embeddings = []
    # One request per batch instead of one per chunk: same vectors, far fewer
    # network round-trips.
    for start in range(0, len(texts), batch_size):
        response = openai.embeddings.create(
            input=texts[start:start + batch_size],
            model="text-embedding-3-small"
        )
        # Each result carries the position of its input within the request;
        # sort on it so the output order never depends on response ordering.
        ordered = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered)
    return embeddings

def setup_pinecone_index(pc, index_name, dimension=1536):
    """
    Return a handle to the Pinecone index `index_name`, creating it if absent.

    When the name is not among `pc.list_indexes().names()`, the index is
    created with the given `dimension`, the cosine metric and a serverless
    spec on AWS us-east-1. In either case `pc.Index(index_name)` is returned.
    """
    existing_names = pc.list_indexes().names()
    already_exists = index_name in existing_names
    if not already_exists:
        serverless = ServerlessSpec(cloud='aws', region='us-east-1')
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric='cosine',
            spec=serverless,
        )
    return pc.Index(index_name)

def upload_to_pinecone(index, text_chunks, embeddings, batch_size=100):
    """
    Upload text chunks and their embeddings to Pinecone.

    Args:
        index: object exposing `upsert(vectors=[...])`.
        text_chunks: iterable of objects exposing a `page_content` string.
        embeddings: iterable of vectors, paired positionally with
            `text_chunks` (pairing stops at the shorter of the two).
        batch_size: number of vectors sent per upsert request. Must be >= 1.

    Each vector is stored under the id ``chunk_<position>`` with the chunk
    text in its ``text`` metadata field. Nothing is sent for empty input.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    vectors = [
        {
            'id': f'chunk_{i}',
            'values': embedding,
            'metadata': {'text': chunk.page_content}
        }
        for i, (chunk, embedding) in enumerate(zip(text_chunks, embeddings))
    ]

    # Send the vectors in groups rather than one request per vector.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])

In [None]:
# Part 5: Query Processing
def process_query(query, index):
    """
    Embed the user's query and fetch the best-matching stored texts.

    The query is embedded with "text-embedding-3-small", the three nearest
    vectors are requested from `index` together with their metadata, and the
    `text` metadata field of each match is returned as a list.
    """
    # Embed the query text
    embedding_response = openai.embeddings.create(
        model="text-embedding-3-small",
        input=query,
    )
    query_vector = embedding_response.data[0].embedding

    # Nearest-neighbour lookup in Pinecone
    search_results = index.query(
        vector=query_vector, top_k=3, include_metadata=True
    )

    # Collect the stored chunk text of every match
    contexts = []
    for hit in search_results.matches:
        contexts.append(hit.metadata['text'])
    return contexts

In [None]:
# Part 6: Response Generation
def generate_response(query, contexts):
    """
    Ask the chat model to answer `query` using the retrieved `contexts`.

    The context strings are joined with single spaces and embedded in a
    prompt together with the question; the text of the first returned choice
    is handed back to the caller.
    """
    joined_contexts = ' '.join(contexts)
    prompt = (
        "Based on the following contexts, answer the question.\n\n"
        f"Contexts:\n{joined_contexts}\n\n"
        f"Question: {query}\n\n"
        "Answer:"
    )

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that answers questions based on the provided context.",
    }
    user_message = {"role": "user", "content": prompt}

    completion = openai.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[system_message, user_message],
        temperature=0.7,
        max_tokens=500,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Part 7: Enhanced Interactive Interface

# Module-level handle to the Pinecone index; populated by run_chatbot_system().
pc_index = None

def run_chatbot_system(url):
    """
    Ingest the web page at `url` into the Pinecone index "my-rag-test".

    Steps, each announced on stdout:
      1. initialise the OpenAI / Pinecone clients,
      2. download the page and split it into chunks (every chunk is printed),
      3. embed the chunks,
      4. create or connect to the index,
      5. upload the chunk vectors.

    If loading or splitting the page raises, the error is printed and the
    function returns early without touching the index. Errors from the later
    steps propagate to the caller.

    Side effect: the module-level `pc_index` is set to the index handle.
    Returns None.
    """
    # `global` only has an effect inside the function doing the assignment;
    # without it `pc_index` below would be a local that vanishes on return.
    global pc_index

    print("Welcome to the Smart Question-Answering System!")
    print("--------------------------------------------")

    # Setup APIs
    print("\nInitializing APIs...")
    pc = setup_apis(openai_api_key, pinecone_api_key)

    # Process URL Data with Langchain
    # NOTE(review): the "PDF" wording in this banner looks like a leftover;
    # the data source here is a URL. Text kept as-is to preserve the output.
    print("\nPDF Loading URL Data and Processing")
    print("------------------------")
    try:
        chunks = extract_webpage_content(url)
        for chunk in chunks:
            print(chunk.page_content)
    except Exception as e:
        print(f"Error processing URL data: {str(e)}")
        return

    # Create embeddings
    print("\nCreating document embeddings...")
    embeddings = create_embeddings(chunks)
    print(f"Created {len(embeddings)} embeddings")

    # Setup Pinecone
    print("\nSetting up vector database...")
    pc_index = setup_pinecone_index(pc, "my-rag-test")
    print("Vector database ready!")

    # Upload to Pinecone
    print("\nUploading to vector database...")
    upload_to_pinecone(pc_index, chunks, embeddings)
    print("Uploaded to Vector database!")

In [None]:
# Part 8: Flask based Chatbot Setup

# Configure ngrok
# Replace with your authtoken from ngrok dashboard
# The token is registered with pyngrok before any tunnel is opened below.
ngrok.set_auth_token(ngrok_auth_token)

# Flask API setup
app = Flask(__name__)
# CORS is enabled for the /chat route only: any origin may send POST requests
# (plus the OPTIONS preflight) carrying Content-Type / Authorization headers.
CORS(app, resources={
    r"/chat": {
        "origins": "*",
        "methods": ["POST", "OPTIONS"],
        "allow_headers": ["Content-Type", "Authorization"]
    }
})
api = Api(app)
# NOTE(review): flask-ngrok's run_with_ngrok presumably hooks app.run() to open
# a tunnel of its own, while a pyngrok tunnel is also opened explicitly below.
# The two look redundant; confirm which one is actually intended.
run_with_ngrok(app)

# Initialize ngrok
# This runs when the cell executes, i.e. before app.run() starts the server.
try:
    # Kill any existing ngrok processes
    ngrok.kill()

    # Start a new ngrok tunnel
    # The address is hardcoded to 127.0.0.1:5000, which is where the recorded
    # output shows the Flask server listening.
    ngrok_tunnel = ngrok.connect(addr=f"127.0.0.1:5000", proto="http")
    print('Public URL:', ngrok_tunnel.public_url)
except Exception as e:
    # Report the failure, then re-raise so the cell stops instead of
    # continuing without a tunnel.
    print(f"Error setting up ngrok: {str(e)}")
    raise

class ChatbotAPI(Resource):
    """
    REST resource that answers questions from the "my-rag-test" Pinecone index.

    POST expects a JSON object with a non-empty string under "query".
    Responses:
      * 400 ``{"error": ...}`` when the body is not a JSON object or the
        query is missing / not a string,
      * 200 ``{"status": "connected", ...}`` for the literal query "test"
        (connectivity probe, no model call),
      * 200 ``{"success": True, "response": <answer>, "error": None}`` on
        success,
      * 500 ``{"error": <message>}`` when retrieval or generation raises.
    """

    # Index handle shared by every instance. Flask builds a new view instance
    # per request by default, so doing the API / index setup unconditionally
    # in __init__ would repeat it (including the list_indexes() lookup) on
    # every call. A failed setup is not cached: the next request retries.
    _shared_index = None

    def __init__(self):
        if ChatbotAPI._shared_index is None:
            pc = setup_apis(openai_api_key, pinecone_api_key)
            ChatbotAPI._shared_index = setup_pinecone_index(pc, "my-rag-test")
        self.pinecone_index = ChatbotAPI._shared_index

    def post(self):
        """Handle one chat request; returns a ``(payload, status)`` tuple."""
        # silent=True returns None for a missing or malformed JSON body instead
        # of raising, so such requests reach the explicit 400 branch below.
        data = request.get_json(silent=True)
        if not data or not isinstance(data, dict):
            return {"error": "No JSON data received"}, 400

        query = data.get('query')
        if not query or not isinstance(query, str):
            return {"error": "No query provided"}, 400

        if query == "test":
            # Respond immediately to test requests
            return {"status": "connected", "response": "Test successful"}, 200

        try:
            print("\nSearching for relevant information...")
            contexts = process_query(query, self.pinecone_index)
            print("Generating response...")
            response = generate_response(query, contexts)
            print(f"\nAnswer: {response}")

            # Return properly structured response
            return {
                "success": True,
                "response": response,
                "error": None
            }, 200

        except Exception as e:
            # NOTE(review): the raw exception text is returned to the client;
            # consider a generic message if this is exposed publicly.
            print(f"Error processing request: {str(e)}")
            return {"error": str(e)}, 500

# Register the resource so that requests to /chat are dispatched to ChatbotAPI.
api.add_resource(ChatbotAPI, '/chat')

if __name__ == "__main__":
    # First-time setup
    # Ingest the page below into the vector index before the server starts.
    url = "https://brainlox.com/courses/category/technical"
    run_chatbot_system(url)

    # Run Flask app
    # Called with no arguments, so Flask's default host and port apply.
    app.run()

Public URL: https://ff6e-34-68-130-135.ngrok-free.app
Welcome to the Smart Question-Answering System!
--------------------------------------------

Initializing APIs...

PDF Loading URL Data and Processing
------------------------


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.27it/s]


URL Document split into 26 chunks
Courses (/courses) Technical (/courses/category/technical) Academic (/courses/category/academic) Language (/courses/category/language) Music (/courses/category/music) Lifestyle (/courses/category/lifestyle) Book a Free Demo Now (/book-free-demo) Sign In (/signin) FAQ (/faq) Contact Us (/contact) Practice Python (https://codenow.brainlox.com/) Learn Now (https://learn.brainlox.com/) Home (/) Courses Courses We found great courses available for you $  20 per session LEARN SCRATCH PROGRAMING (/courses/4f629d96-5ed9-4302-ae0e-3479c543a49e) Scratch Course is the foundation of coding and is a building block of a coding journey. If you want   16  Lessons View Details (/courses/4f629d96-5ed9-4302-ae0e-3479c543a49e) $  30 per session LEARN CLOUD COMPUTING BASICS-AWS (/courses/872d1cb6-8469-4797-b267-8c41837b10e2) In this course we are going to cover the basics and the most important services on AWS, At the end   20  Lessons View Details (/courses/872d1cb6-8469-

 * Running on http://127.0.0.1:5000
INFO:werkzeug:Press CTRL+C to quit


 * Running on http://ff6e-34-68-130-135.ngrok-free.app
 * Traffic stats available on http://127.0.0.1:4040
