<a href="https://colab.research.google.com/github/Kavin56/GENERATIVE-AI/blob/main/Vector_Database/Weaviate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install Dependencies

In [2]:
# Reinstall numpy first to ensure consistency
!pip uninstall -y numpy
!pip install numpy==1.26.0

Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Collecting numpy==1.26.0
  Downloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.0 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.26.0


In [3]:
!pip install langchain-google-genai

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
# Reinstall sentence-transformers and its dependencies in order
!pip uninstall -y transformers sentence-transformers
!pip install transformers==4.36.2
!pip install sentence-transformers==2.4.0

In [None]:
# Reinstall other dependencies
!pip install --force-reinstall weaviate-client langchain langchain-weaviate

In [None]:
!pip install --force-reinstall langchain-google-genai google-generativeai

In [None]:
!pip install --force-reinstall pdf2image

In [None]:
!pip install --force-reinstall pdfminer.six

In [None]:
!pip install --force-reinstall pymupdf

In [None]:
!pip install --force-reinstall langchain-community
!pip install PyPDF2 ipython
!pip install pypdf

# 2. Import necessary libraries

In [20]:
import os
import weaviate
from langchain_weaviate import WeaviateVectorStore
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from IPython.display import Markdown, display
import google.generativeai as genai
from langchain_community.embeddings import HuggingFaceEmbeddings

# 3. Vector Embeddings and Weaviate Connection Initialization

In [21]:
from google.colab import userdata  # For accessing Colab secrets

# Credentials are read from Colab's secret store instead of being hardcoded here.
# NOTE(review): each name below presumably has to exist as a secret with notebook
# access enabled — confirm how userdata.get behaves when one is missing.
GOOGLE_API_KEY = userdata.get("GOOGLE_API_KEY")  # used by genai.configure and ChatGoogleGenerativeAI
WEAVIATE_URL = userdata.get("WEAVIATE_URL")  # cluster URL passed to connect_to_weaviate_cloud
WEAVIATE_API_KEY = userdata.get("WEAVIATE_API_KEY")  # wrapped in weaviate.AuthApiKey when connecting

In [22]:
# Function to upload vectors to Weaviate
def upload_vectors(texts, embeddings, client, batch_size=5):
    """Index document chunks into the Weaviate ``Document`` collection.

    Creates the ``Document`` collection (with ``text`` and ``source`` text
    properties) when it does not exist yet, then adds every chunk through
    ``WeaviateVectorStore.add_texts`` using the supplied embedding model.

    Args:
        texts: Sliceable sequence of chunk objects exposing ``page_content``
            and a ``metadata`` mapping (``process_pdf`` passes the output of
            its text splitter).
        embeddings: Embedding model handed to ``WeaviateVectorStore``.
        client: Connected Weaviate client.
        batch_size: Number of chunks sent per ``add_texts`` call. A progress
            line is printed after every full batch. Defaults to 5, which keeps
            the progress output of the previous one-chunk-per-call version.

    Returns:
        The ``WeaviateVectorStore`` bound to the ``Document`` collection.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Create collection if it doesn't exist
    if not client.collections.exists("Document"):
        # Imported here because it is only needed on the creation path.
        from weaviate.classes.config import DataType, Property

        # The v4 client expects Property objects; the v3-style dicts
        # ({"name": ..., "dataType": ["text"]}) are not a valid input here.
        # vectorizer_config is left at its default (None), same as the value
        # that was previously passed explicitly.
        client.collections.create(
            name="Document",
            properties=[
                Property(name="text", data_type=DataType.TEXT),
                Property(name="source", data_type=DataType.TEXT),
            ],
        )
        print("Created Document collection in Weaviate")

    vector_store = WeaviateVectorStore(
        client=client,
        index_name="Document",
        text_key="text",
        embedding=embeddings
    )

    print("Indexing PDF content... (this may take a bit) 🦙")
    total = len(texts)
    for start in range(0, total, batch_size):
        batch = texts[start:start + batch_size]
        # One add_texts call per batch instead of one call per chunk.
        vector_store.add_texts(
            [chunk.page_content for chunk in batch],
            metadatas=[
                {"source": chunk.metadata.get('source', 'unknown')}
                for chunk in batch
            ],
        )
        processed = start + len(batch)
        if processed % batch_size == 0:  # Progress update after each full batch
            print(f"Processed {processed}/{total} chunks")

    print("Indexing complete!")
    return vector_store

In [23]:
# Configure Google Gemini API
# Registers the API key (read from Colab secrets above) with the google.generativeai SDK.
# NOTE(review): ask_question passes google_api_key to ChatGoogleGenerativeAI explicitly,
# so this global configuration may only matter for direct `genai` calls — confirm.
genai.configure(api_key=GOOGLE_API_KEY)

In [24]:
# Function to initialize Weaviate client
def initialize_weaviate():
    """Connect to the Weaviate Cloud cluster and return the client.

    The cluster URL and API key come from the module-level ``WEAVIATE_URL``
    and ``WEAVIATE_API_KEY`` values. A confirmation line is printed once the
    connect call has returned.
    """
    api_key_auth = weaviate.AuthApiKey(WEAVIATE_API_KEY)
    weaviate_client = weaviate.connect_to_weaviate_cloud(
        cluster_url=WEAVIATE_URL,
        auth_credentials=api_key_auth,
    )
    print("Connected to Weaviate successfully")
    return weaviate_client

In [25]:
# Function to load and process PDF
def process_pdf(pdf_path, client):
    """Load a PDF, split it into chunks and index the chunks in Weaviate.

    Args:
        pdf_path: Path of the PDF file handed to ``PyPDFLoader``.
        client: Weaviate client forwarded to ``upload_vectors``.

    Returns:
        Whatever ``upload_vectors`` returns for the chunks (the vector store).
    """
    print(f"Loading PDF: {pdf_path}")
    pages = PyPDFLoader(pdf_path).load()

    # Chunk the loaded pages (chunk_size=1000, chunk_overlap=100)
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = splitter.split_documents(pages)
    print(f"Split PDF into {len(texts)} chunks")

    # Embedding model: a small sentence-transformers checkpoint
    print("Loading embeddings...")
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embedder = HuggingFaceEmbeddings(model_name=model_name)
    print(f"Successfully loaded {model_name} embeddings")

    # Hand the chunks and the embedder over for indexing
    return upload_vectors(texts, embedder, client)

In [26]:
# Function to ask questions about the PDF
def ask_question(vector_store, question, model="gemini-1.5-pro", temperature=0.2, k=4):
    """Answer a question from the indexed PDF chunks using Gemini.

    Retrieves the chunks most similar to ``question`` from ``vector_store``,
    passes them to a "stuff" question-answering chain backed by
    ``ChatGoogleGenerativeAI``, renders the answer as Markdown and returns it.

    Args:
        vector_store: Object exposing ``similarity_search(query, k=...)``.
        question: The question text.
        model: Gemini model id. Defaults to the previously hard-coded
            "gemini-1.5-pro".
            NOTE(review): confirm this model id is still served by the API.
        temperature: Sampling temperature passed to the chat model.
        k: Number of chunks to retrieve; 4 matches LangChain's default for
            ``similarity_search``, so the default behaviour is unchanged.

    Returns:
        The answer text produced by the chain.
    """
    print(f"\nQuestion: {question}")

    # Perform similarity search
    docs = vector_store.similarity_search(question, k=k)

    # Load the question answering chain with Gemini
    llm = ChatGoogleGenerativeAI(
        model=model,
        google_api_key=GOOGLE_API_KEY,
        temperature=temperature
    )

    chain = load_qa_chain(llm, chain_type="stuff")

    # Chain.run is deprecated in LangChain; invoke takes the inputs as one dict
    # and returns a dict whose "output_text" entry holds the answer.
    result = chain.invoke({"input_documents": docs, "question": question})
    response = result["output_text"]

    print("\nAnswer:")
    display(Markdown(response))
    return response

# 4. Initialize Weaviate client

In [27]:
# Open the Weaviate connection once; the cells below reuse this client.
# NOTE(review): no cell visible in this notebook closes the connection — consider
# closing the client when you are done.
client = initialize_weaviate()

Connected to Weaviate successfully


# 5. Process your PDF


In [28]:
pdf_path = "/content/Herbally x AI - report.pdf"  # Make sure this matches your actual PDF file name
# Loads, chunks, embeds and indexes the PDF, returning the vector store used for questions.
# NOTE(review): upload_vectors adds every chunk on each call and never clears the
# Document collection, so re-running this cell presumably stores the same chunks
# again — confirm before re-indexing.
vector_store = process_pdf(pdf_path, client)

Loading PDF: /content/Herbally x AI - report.pdf
Split PDF into 19 chunks
Loading embeddings...




Successfully loaded sentence-transformers/all-MiniLM-L6-v2 embeddings
Indexing PDF content... (this may take a bit) 🦙
Processed 5/19 chunks
Processed 10/19 chunks
Processed 15/19 chunks
Indexing complete!


# 6. Now you can ask questions about it

In [29]:
question = "What is the main topic of this document?"  # Replace with your question
# Retrieves similar chunks from the vector store, asks Gemini to answer from them,
# renders the answer as Markdown and also returns it as a string.
answer = ask_question(vector_store, question)


Question: What is the main topic of this document?

Answer:


This document discusses creating a prompt management system, implementing content approval workflows, scheduling and analytics (including building a scheduling service, implementing analytics collection, and creating a dashboard for performance monitoring), and risk assessment and mitigation.  It's difficult to give one main topic, as it covers several related aspects of a system, possibly a content generation or management system.