<a href="https://colab.research.google.com/github/JamesDRodgers/Course-Projects-Gen-AI/blob/main/WebScrapeToVectorEmbeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install -q langchain-openai beautifulsoup4 faiss-cpu tqdm

import requests
from bs4 import BeautifulSoup
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
import numpy as np
import json
from tqdm.notebook import tqdm
import time
from typing import Tuple, List, Optional, Dict
import logging
from google.colab import files

# Module-scoped logger used by every function below; the root logger is
# configured at INFO so progress messages show up in the notebook output.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

class WebsiteEmbeddingCreator:
    """Scrape a web page, embed its text with OpenAI, and index it with FAISS.

    The pipeline is: ``scrape_website`` -> ``create_embeddings`` ->
    ``process_and_save`` (which writes a JSON file, offers it for download in
    Colab, and returns a FAISS index over the chunk embeddings).
    """

    # Per-request timeout and number of attempts used by ``scrape_website``.
    REQUEST_TIMEOUT_SECONDS = 10
    MAX_FETCH_ATTEMPTS = 3
    # Pause after each successful embedding call, as crude rate limiting.
    EMBEDDING_DELAY_SECONDS = 0.1

    def __init__(self, api_key: str, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Initialize the embedding creator with OpenAI API key.

        Args:
            api_key: OpenAI API key passed to ``OpenAIEmbeddings``.
            chunk_size: Maximum length (in characters) of each text chunk.
            chunk_overlap: Number of characters shared by adjacent chunks.
        """
        self.api_key = api_key
        self.embeddings = OpenAIEmbeddings(openai_api_key=api_key)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

    def _fetch_with_retries(self, url: str, headers: Dict[str, str]) -> "requests.Response":
        """GET *url*, retrying failed requests with exponential backoff."""
        final_attempt = self.MAX_FETCH_ATTEMPTS - 1
        for attempt in range(self.MAX_FETCH_ATTEMPTS):
            try:
                response = requests.get(
                    url, headers=headers, timeout=self.REQUEST_TIMEOUT_SECONDS
                )
                response.raise_for_status()
                return response
            except requests.RequestException:
                if attempt == final_attempt:
                    raise  # Out of attempts: surface the original error.
                time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, ...
        raise RuntimeError("MAX_FETCH_ATTEMPTS must be at least 1")

    def scrape_website(self, url: str) -> Optional[str]:
        """Scrape and clean text from a website.

        Args:
            url: Address of the page to download.

        Returns:
            The page's visible text joined by single spaces (script, style,
            nav, footer and header elements are removed first), or ``None``
            if the page could not be fetched or parsed. The error is logged.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        try:
            response = self._fetch_with_retries(url, headers)

            # Without a charset in the Content-Type header, requests decodes
            # text/* bodies as ISO-8859-1, which garbles UTF-8 pages. Fall
            # back to the encoding detected from the body in that case.
            content_type = response.headers.get('Content-Type', '')
            if 'charset' not in content_type.lower():
                response.encoding = response.apparent_encoding

            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove elements that carry no article content.
            for element in soup(['script', 'style', 'nav', 'footer', 'header']):
                element.decompose()

            return ' '.join(soup.stripped_strings)

        except Exception as e:
            logger.error("Error scraping website %s: %s", url, e)
            return None

    def create_embeddings(self, text: str) -> Tuple[List[str], List[List[float]]]:
        """Create embeddings from text with progress tracking.

        The text is split into chunks and each chunk is embedded on its own.
        A chunk whose embedding call fails is logged and skipped, so the two
        returned lists always line up index-for-index and contain only real
        vectors (no fixed-size placeholder that might not match the model's
        embedding dimension).

        Args:
            text: The full text to split and embed.

        Returns:
            ``(chunks, vectors)`` for the successfully embedded chunks, or
            ``([], [])`` if splitting or embedding failed as a whole.
        """
        try:
            chunks = self.text_splitter.split_text(text)
            logger.info("Split text into %d chunks", len(chunks))

            embedded_chunks: List[str] = []
            vectors: List[List[float]] = []
            for chunk in tqdm(chunks, desc="Creating embeddings", leave=False):
                try:
                    vector = self.embeddings.embed_query(chunk)
                except Exception as e:
                    logger.error("Error creating embedding for chunk: %s", e)
                    continue
                embedded_chunks.append(chunk)
                vectors.append(vector)
                time.sleep(self.EMBEDDING_DELAY_SECONDS)  # Rate limiting

            skipped = len(chunks) - len(embedded_chunks)
            if skipped:
                logger.warning(
                    "Skipped %d of %d chunks that could not be embedded",
                    skipped, len(chunks),
                )

            return embedded_chunks, vectors

        except Exception as e:
            logger.error("Error in embedding creation: %s", e)
            return [], []

    def process_and_save(
        self, url: str, output_filename: str = "website_embeddings.json"
    ) -> Tuple["faiss.IndexFlatL2", List[str]]:
        """Scrape *url*, embed its text, save the result and build an index.

        Args:
            url: Page to scrape.
            output_filename: Path of the JSON file that receives the chunk
                texts, their embeddings and some metadata.

        Returns:
            ``(index, texts)``: a FAISS ``IndexFlatL2`` over the embeddings
            and the chunk texts in the same order as the index rows.

        Raises:
            ValueError: If no text was scraped or no embeddings were created.
            Exception: Any other failure is logged and re-raised, except a
                failed Colab download, which is only logged as a warning.
        """
        try:
            logger.info("Scraping website: %s", url)
            page_text = self.scrape_website(url)
            if not page_text:
                raise ValueError("No text retrieved from website")

            texts, embeddings = self.create_embeddings(page_text)
            if not texts or not embeddings:
                raise ValueError("Failed to create embeddings")

            # Build the index before any file/download step so that a later
            # I/O problem cannot discard embeddings that were already paid for.
            embeddings_array = np.array(embeddings, dtype='float32')
            index = faiss.IndexFlatL2(embeddings_array.shape[1])
            index.add(embeddings_array)

            output_data = {
                "metadata": {
                    "url": url,
                    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                    "num_chunks": len(texts)
                },
                "embeddings": [
                    {
                        "text": chunk,
                        "embedding": vector
                    }
                    for chunk, vector in zip(texts, embeddings)
                ]
            }

            with open(output_filename, 'w', encoding='utf-8') as f:
                json.dump(output_data, f)

            logger.info(
                "Successfully processed website and saved embeddings to %s",
                output_filename,
            )

            # The browser download is a convenience; the file is already on
            # disk, so a failure here should not abort the whole run.
            try:
                files.download(output_filename)
            except Exception as e:
                logger.warning(
                    "Saved %s but could not start the Colab download: %s",
                    output_filename, e,
                )

            return index, texts  # Return index and texts for immediate use if needed

        except Exception as e:
            logger.error("Error in main processing: %s", e)
            raise

def main() -> Optional[Tuple["WebsiteEmbeddingCreator", "faiss.IndexFlatL2", List[str]]]:
    """Main execution function with user input handling.

    Prompts for a URL and an OpenAI API key, runs the scrape/embed/index
    pipeline and prints a short usage example.

    Returns:
        ``(processor, index, texts)`` on success so the caller can run
        similarity searches afterwards, or ``None`` if anything failed (the
        error is logged and printed rather than raised).
    """
    # Imported here because only this entry point needs it.
    from getpass import getpass

    try:
        url = input("Enter the URL to scrape: ").strip()
        # getpass keeps the secret out of the saved notebook output.
        api_key = getpass("Enter your OpenAI API Key: ").strip()
        if not url:
            raise ValueError("A URL is required")
        if not api_key:
            raise ValueError("An OpenAI API key is required")

        # Create processor and run
        processor = WebsiteEmbeddingCreator(api_key)
        index, texts = processor.process_and_save(url)

        print("\nProcessing complete! You can now use the index for similarity search.")
        print("Example usage:")
        print("query = 'your search query'")
        # embed_query returns a plain list, so it must be wrapped in a 2-D
        # float32 array before FAISS can search with it.
        print("query_vector = np.array([processor.embeddings.embed_query(query)], dtype='float32')")
        print("D, I = index.search(query_vector, k=5)")
        # FAISS pads missing neighbours with -1 when k exceeds the index size.
        print("similar_texts = [texts[i] for i in I[0] if i != -1]")

        return processor, index, texts

    except Exception as e:
        logger.error(f"Error in main execution: {e}")
        print(f"An error occurred: {e}")
        return None

if __name__ == "__main__":
    _result = main()
    if _result is not None:
        # Bind the results at module level so the printed example can be run
        # as-is in a following notebook cell.
        processor, index, texts = _result
