# Pinecone Vector Store Creation

In [None]:
!pip install langchain
!pip install langchain-core
!pip install langchain-community
!pip install langchain-pinecone
!pip install sentence-transformers

### Initialize the Pinecone vector store

In [None]:
from google.colab import userdata
# Look up the value stored under 'pinecone_api_key' through Colab's userdata
# helper, so the key itself never appears in the notebook source.
pinecone_api_key = userdata.get('pinecone_api_key')

In [None]:
from pinecone import Pinecone, ServerlessSpec
import time
# The dimension could instead be imported from a config module:
# from config import VECTOR_DIMENSION
VECTOR_DIMENSION = 384 # passed as the index dimension at creation; use 768 when using nomic embeddings
class PineconeManager:
    """Create (if needed) and connect to a serverless Pinecone index.

    Constructing the manager builds a Pinecone client from the API key and
    immediately calls :meth:`initialize_index`, so ``self.index`` is set by
    the time the constructor returns.
    """

    def __init__(self, api_key: str, index_name: str):
        """Connect to Pinecone and make sure ``index_name`` is ready.

        :param api_key: Pinecone API key used to build the client
        :param index_name: Name of the index to create or reuse
        :raises TimeoutError: If the index does not become ready within the
            default timeout of :meth:`initialize_index`
        """
        self.pc = Pinecone(api_key=api_key)
        self.index_name = index_name
        self.index = None
        self.initialize_index()

    def initialize_index(self, timeout: float = 300.0, poll_interval: float = 1.0):
        """Create the index if it is missing, wait until it is ready, and bind it.

        The method can be called repeatedly: when the index already exists it
        only re-checks readiness and refreshes ``self.index``.

        :param timeout: Maximum number of seconds to wait for the index to
            report ready before giving up
        :param poll_interval: Seconds to sleep between readiness checks
        :raises TimeoutError: If the index is still not ready after ``timeout``
            seconds
        """
        if self.index_name not in self.pc.list_indexes().names():
            print(f"Creating index: {self.index_name}")
            self.pc.create_index(
                name=self.index_name,
                dimension=VECTOR_DIMENSION,
                metric="cosine",
                spec=ServerlessSpec(
                    cloud="aws",
                    region="us-east-1"
                )
            )
        else:
            print(f"Index {self.index_name} already exists")

        # Bound the wait so an index that never becomes ready surfaces as an
        # error instead of hanging the notebook cell forever.
        deadline = time.monotonic() + timeout
        while not self.pc.describe_index(self.index_name).status['ready']:
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    f"Index {self.index_name} was not ready after {timeout} seconds"
                )
            time.sleep(poll_interval)

        self.index = self.pc.Index(self.index_name)


In [None]:
INDEX_NAME = 'dlprojectcheck'
# PineconeManager.__init__ already calls initialize_index(), so constructing the
# manager is enough; calling it again here would only repeat the Pinecone
# round-trips and print a misleading "already exists" message.
pinecone_manager = PineconeManager(pinecone_api_key, INDEX_NAME)

### Scraping and inserting into the vector store

In [None]:
import os
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from typing import List, Dict
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

class WebScraper:
    """Convert local HTML files into LangChain ``Document`` objects.

    Each page is split into sections at its h1/h2/h3 headers; every section
    becomes one ``Document`` whose metadata records the section title and the
    source file path.
    """

    @staticmethod
    def _inline_text(element) -> str:
        """Return the element's text with whitespace runs collapsed to one space.

        ``get_text(strip=True)`` would glue adjacent inline fragments together
        (``Use <code>Dense</code> layer`` -> ``UseDenselayer``), so the text is
        taken unstripped and normalised afterwards instead.
        """
        return " ".join(element.get_text().split())

    @staticmethod
    def _load_soup(file_path: str) -> BeautifulSoup:
        """Read ``file_path`` as UTF-8 and parse it with the built-in HTML parser."""
        with open(file_path, 'r', encoding='utf-8') as file:
            html = file.read()
        return BeautifulSoup(html, 'html.parser')

    @staticmethod
    async def extract_sections(soup: BeautifulSoup) -> List[Dict[str, str]]:
        """Extract sections from a parsed HTML page.

        Sections are defined as the contents between headers (h1, h2, h3) and are
        dictionaries with keys "title", "content", and "code". The value of "title"
        is the text of the header, "content" is the paragraph text of the section
        (one paragraph per line), and "code" is the text of the ``<pre>`` blocks in
        the section. Text found before the first header is collected under the
        title "Introduction". Sections with neither content nor code are dropped.

        :param soup: A BeautifulSoup object representing the HTML page
        :return: A list of section dictionaries
        """
        sections = []
        current_section = {"title": "Introduction", "content": "", "code": ""}

        for element in soup.find_all(['h1', 'h2', 'h3', 'p', 'pre']):
            if element.name in ('h1', 'h2', 'h3'):
                # A header closes the running section (if it collected anything)
                # and opens a new one.
                if current_section["content"] or current_section["code"]:
                    sections.append(current_section)
                current_section = {
                    "title": WebScraper._inline_text(element),
                    "content": "",
                    "code": "",
                }
            elif element.name == 'p':
                text = WebScraper._inline_text(element)
                if text:
                    current_section["content"] += text + "\n"
            elif element.name == 'pre':
                # Prefer the nested <code> tag, but keep bare <pre> blocks too.
                # Only the outer whitespace is trimmed so the code's internal
                # spacing and line breaks survive.
                code_tag = element.find('code') or element
                code_text = code_tag.get_text().strip()
                if code_text:
                    current_section["code"] += code_text + "\n\n"

        if current_section["content"] or current_section["code"]:
            sections.append(current_section)

        return sections

    def create_documents(self, sections: List[Dict[str, str]], file_path: str) -> List[Document]:
        """Create a list of Document objects from a list of sections and a file path.

        The content of each section is combined with its code block (if any) and
        used to create a Document object. The "Code:" part is only added when the
        section actually has code, so text-only sections are not padded with an
        empty trailer. The metadata of the Document includes the title of the
        section and the source file path.

        :param sections: A list of dictionaries with keys "title", "content", and "code"
        :param file_path: The path to the file containing the sections
        :return: A list of Document objects
        """
        documents = []
        for section in sections:
            parts = []
            content = section["content"].strip()
            if content:
                parts.append(content)
            code = section["code"].strip()
            if code:
                parts.append(f"Code:\n{code}")

            documents.append(
                Document(
                    page_content="\n\n".join(parts),
                    metadata={
                        "title": section["title"],
                        "source": file_path,
                    },
                )
            )

        return documents

    async def scrape_file(self, file_path: str) -> List[Document]:
        """Scrape a file and return a list of Document objects.

        The file is read and parsed with BeautifulSoup in the event loop's
        default executor, so the blocking read does not stall other scrapes.
        The sections are then extracted and combined into Document objects.
        The metadata of each Document includes the title of the section and
        the source file path.

        Scraping is best-effort: any error is printed and an empty list is
        returned, so one bad file does not abort a batch.

        :param file_path: The path to the file to scrape
        :return: A list of Document objects (empty if the file could not be scraped)
        """
        loop = asyncio.get_running_loop()
        try:
            soup = await loop.run_in_executor(None, self._load_soup, file_path)
            sections = await self.extract_sections(soup)
            return self.create_documents(sections, file_path)
        except Exception as e:
            print(f"Error scraping {file_path}: {str(e)}")
            return []

    async def scrape_files(self, file_paths: List[str]) -> List[Document]:
        """Scrape a list of files and return a list of Document objects.

        One scrape task is started per file and all of them are awaited
        together with ``asyncio.gather``. The documents from each file are
        collected and returned as a single list, in the order of ``file_paths``.

        :param file_paths: A list of file paths to scrape
        :return: A list of Document objects
        """
        tasks = [self.scrape_file(file_path) for file_path in file_paths]
        documents_list = await asyncio.gather(*tasks)
        return [doc for sublist in documents_list for doc in sublist]


In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive so files stored there can be read by
# path (the scraping cell walks /content/drive/MyDrive/api).
drive.mount('/content/drive')

In [None]:
scraper = WebScraper()
dir_to_work_with = '/content/drive/MyDrive/api'
html_files = []
for root, dirs, files in os.walk(dir_to_work_with):
    for file in files:
        if file.endswith(".html"):
            html_files.append(os.path.join(root, file))
print("the html files got from directory and embedding initialized\n")

print("document scrape started")
documents = await scraper.scrape_files(file_paths=html_files)
print("documents scrape done \n")

### Initializing embeddings and upserting the documents

In [None]:
from langchain_pinecone import PineconeVectorStore

# Embedding model used to encode the scraped documents.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Wrap the index held by the Pinecone manager in a LangChain vector store
# that embeds with the model above.
vector_store = PineconeVectorStore(
    index=pinecone_manager.index,
    embedding=embeddings,
)

# Embed and upsert all scraped documents; as the last expression of the cell
# its return value is displayed by the notebook.
vector_store.add_documents(documents)

### Testing the vector store

In [None]:
# Smoke-test the populated vector store with a sample question.
query = "How does keras handle Layers?"
# Left as a bare expression so the notebook displays the search result.
vector_store.similarity_search_with_relevance_scores(query)