In [3]:
pip install fastapi uvicorn beautifulsoup4 requests pdfplumber sentence-transformers

Collecting uvicorn
  Downloading uvicorn-0.30.6-py3-none-any.whl.metadata (6.6 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ------------------ ------------------- 20.5/42.0 kB 640.0 kB/s eta 0:00:01
     -------------------------------------- 42.0/42.0 kB 675.6 kB/s eta 0:00:00
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-win_amd64.whl.metadata (48 kB)
     ---------------------------------------- 0.0/48.5 kB ? eta -:--:--
     ---------------------------------------- 48.5/48.5 kB 2.4 MB/s eta 0:00:00
Collecting transformers<5.0.0,>=4.34.0 (from sentence-transformers)
  Downloading transformers-

In [7]:
pip install python-multipart

Collecting python-multipart
  Using cached python_multipart-0.0.9-py3-none-any.whl.metadata (2.5 kB)
Using cached python_multipart-0.0.9-py3-none-any.whl (22 kB)
Installing collected packages: python-multipart
Successfully installed python-multipart-0.0.9
Note: you may need to restart the kernel to use updated packages.


In [3]:
import nest_asyncio
import uvicorn

# Patch asyncio to allow nested event loops, so uvicorn.run() can start inside the notebook's already-running loop
nest_asyncio.apply()

# FastAPI application: imports, storage helpers, and endpoint definitions
from fastapi import FastAPI, UploadFile, File, HTTPException
from pydantic import BaseModel
import requests
from bs4 import BeautifulSoup
import pdfplumber
import sqlite3
from sentence_transformers import SentenceTransformer, util
import uuid

# Application object; the @app.post decorators in this cell register endpoints on it.
app = FastAPI()

# Initialize the sentence transformer model for embeddings.
# Created once at module level so the /chat handler reuses this instance
# instead of constructing a model on every request.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Database setup: Create the content table
def create_db(db_path: str = 'app.db'):
    """Create the ``content`` table if it does not already exist.

    The table maps a ``chat_id`` (TEXT primary key) to the extracted
    ``content`` (TEXT, NOT NULL). Calling this function repeatedly is safe
    because the statement uses ``CREATE TABLE IF NOT EXISTS``.

    Args:
        db_path: Path of the SQLite database file. Defaults to ``'app.db'``,
            the file this function has always used.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS content (
                        chat_id TEXT PRIMARY KEY,
                        content TEXT NOT NULL
                      )''')
        conn.commit()
    finally:
        # Close even when the statement raises, so a failure does not leave
        # an open handle on the database file.
        conn.close()

create_db()

# Utility functions for storing and retrieving content
def store_content(chat_id: str, content: str, db_path: str = 'app.db'):
    """Insert one ``(chat_id, content)`` row into the ``content`` table.

    Args:
        chat_id: Key to store the text under.
        content: Text to store.
        db_path: Path of the SQLite database file. Defaults to ``'app.db'``.

    Raises:
        sqlite3.Error: If the insert fails, e.g. ``sqlite3.IntegrityError``
            when ``chat_id`` already exists in a table where it is the
            primary key. The connection is closed before the error
            propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Parameterized query: values are bound, never interpolated into SQL.
        conn.execute(
            "INSERT INTO content (chat_id, content) VALUES (?, ?)",
            (chat_id, content),
        )
        conn.commit()
    finally:
        # Close even when the insert raises, so failures do not leak handles.
        conn.close()

def retrieve_content(chat_id: str, db_path: str = 'app.db'):
    """Look up the stored text for ``chat_id``.

    Args:
        chat_id: Key the text was stored under.
        db_path: Path of the SQLite database file. Defaults to ``'app.db'``.

    Returns:
        The stored content string, or ``None`` when no row matches.
    """
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT content FROM content WHERE chat_id = ?",
            (chat_id,),
        ).fetchone()
    finally:
        # Close even when the query raises, so failures do not leak handles.
        conn.close()
    return row[0] if row else None

# Request model for processing URL
class URLProcessRequest(BaseModel):
    """JSON body accepted by ``POST /process_url``."""
    # Address of the page to fetch. Declared as a plain ``str``, so this model
    # does not check that the value is a well-formed URL.
    url: str

# Request model for chat functionality
class ChatRequest(BaseModel):
    """JSON body accepted by ``POST /chat``."""
    # Identifier returned by /process_url or /process_pdf; used to look up the
    # stored content.
    chat_id: str
    # Question text that is compared against the stored content.
    question: str

# 1. API to process web content from a URL
@app.post("/process_url")
async def process_url(request: URLProcessRequest):
    """Fetch a web page, extract its visible text, and store it under a new chat ID.

    Args:
        request: Body carrying the ``url`` to fetch.

    Returns:
        A dict with the generated ``chat_id`` and a confirmation ``message``.

    Raises:
        HTTPException: Status 500 when the page cannot be fetched (network
            error, timeout, or an HTTP error status) or parsed.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive host,
    # which would leave this request hanging.
    fetch_timeout_seconds = 10

    # NOTE(review): the URL is caller-supplied and fetched from this machine;
    # restrict allowed hosts/schemes before exposing this endpoint publicly.
    try:
        response = requests.get(request.url, timeout=fetch_timeout_seconds)
        # Treat 4xx/5xx replies as failures rather than storing an error page
        # as if it were the requested content.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to scrape the URL: {str(e)}")

    # Generate unique chat ID
    chat_id = str(uuid.uuid4())

    # Store the scraped content in the database
    store_content(chat_id, text)

    return {"chat_id": chat_id, "message": "URL content processed and stored successfully."}

# 2. API to process and extract text from a PDF
@app.post("/process_pdf")
async def process_pdf(file: UploadFile = File(...)):
    """Extract the text of an uploaded PDF and store it under a new chat ID.

    Args:
        file: The uploaded PDF (multipart form field ``file``).

    Returns:
        A dict with the generated ``chat_id`` and a confirmation ``message``.

    Raises:
        HTTPException: Status 500 when the file cannot be opened or parsed
            as a PDF.
    """
    try:
        with pdfplumber.open(file.file) as pdf:
            # Extract each page exactly once: text extraction is the expensive
            # step, and calling it both in the filter and in the join doubled
            # the work. Pages that yield no text are skipped.
            page_texts = (page.extract_text() for page in pdf.pages)
            text = ' '.join(page_text for page_text in page_texts if page_text)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to process the PDF: {str(e)}")

    chat_id = str(uuid.uuid4())
    store_content(chat_id, text)

    return {"chat_id": chat_id, "message": "PDF content processed and stored successfully."}

# 3. API for chat functionality using embeddings
@app.post("/chat")
async def chat(request: ChatRequest):
    """Answer a question against the content stored for ``chat_id``.

    The stored text is split into word-based chunks, each chunk is embedded,
    and the question is compared with every chunk by cosine similarity. When
    the best-matching chunk exceeds the threshold, the full stored content is
    returned; otherwise a "no relevant information" message is returned.

    Args:
        request: Body carrying ``chat_id`` and ``question``.

    Returns:
        A dict with a single ``response`` string.

    Raises:
        HTTPException: Status 404 when nothing is stored for ``chat_id``.
    """
    # Minimum cosine similarity for the content to count as relevant.
    similarity_threshold = 0.3
    # The embedding model truncates long inputs, so encoding a whole document
    # as one string only "sees" its beginning. Chunks of roughly this many
    # words keep each input short enough to be embedded in full.
    words_per_chunk = 150

    # Retrieve the stored content using the chat_id
    content = retrieve_content(request.chat_id)
    if not content:
        raise HTTPException(status_code=404, detail="No content found for the provided chat_id.")

    words = content.split()
    if len(words) <= words_per_chunk:
        # Short documents are embedded unchanged, exactly as before.
        chunks = [content]
    else:
        chunks = [
            ' '.join(words[start:start + words_per_chunk])
            for start in range(0, len(words), words_per_chunk)
        ]

    # Generate embeddings for every chunk and for the question
    chunk_embeddings = model.encode(chunks, convert_to_tensor=True)
    question_embedding = model.encode(request.question, convert_to_tensor=True)

    # One similarity score per chunk; the best one decides relevance.
    similarities = util.cos_sim(question_embedding, chunk_embeddings)
    best_similarity = similarities.max().item()

    if best_similarity > similarity_threshold:
        response = content
    else:
        response = "No relevant information found."

    return {"response": response}

# Run the FastAPI app within the notebook.
# This call keeps the cell busy until the server is stopped. Host "0.0.0.0"
# makes the server listen on every network interface, not only localhost.
uvicorn.run(app, host="0.0.0.0", port=8000)


INFO:     Started server process [29348]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:59178 - "POST /process_url HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [29348]
