In [45]:
import os
import requests
import json
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec, PineconeApiException
import google.generativeai as genai
# from llama_index.llms.gemini import Gemini
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.core import StorageContext, VectorStoreIndex, download_loader, Settings
from bs4 import BeautifulSoup

load_dotenv()

True

In [46]:
# Function to load data based on input type (URL or JSON file)
def load_data(source):
    # Check if the input source is a URL
    if source.startswith('http'):
        # Scrape data from the web
        response = requests.get(source)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Implement scraping logic here to extract relevant data
        # For simplicity, assuming we get a list of documents as plain text
        documents = [{'content': p.get_text()} for p in soup.find_all('p')]
    else:
        # Load data from a JSON file
        with open(source, 'r') as f:
            data = json.load(f)
        if isinstance(data, dict):
            documents = [{'content': review['review']} for review in data.get('reviews', [])]
        elif isinstance(data, list):
            documents = [{'content': item.get('review', '')} for item in data]
        else:
            raise ValueError("Invalid JSON format.")
    return documents

In [52]:
# Function to process data, create embeddings, and store them in Pinecone
def process_and_store_data(documents, pinecone_index, genai):
    # Process and store data
    processed_data = []
    embed_model = genai

    for doc in documents:
        response = embed_model.embed_content(
            model="models/text-embedding-004",
            content=doc['content']
        )
        embedding = response['embedding']
        processed_data.append({
            "values": embedding,
            "id": doc.get('id', 'default_id'),
            "metadata": {
                "content": doc['content']
            }
        })
    
    # Insert the embeddings into the Pinecone index
    upsert_response = pinecone_index.upsert(
        vectors=processed_data,
        namespace="ns1",
    )
    print(f"Upserted count: {upsert_response['upserted_count']}")
    
    return pinecone_index

In [59]:
# Function to query the index and get a response from Gemini
def query_index(index, query, genai):
    query_embedding = genai.embed_content(
        model="models/text-embedding-004",
        content=query
    )['embedding']
    
    results = index.query(
        top_k=5,  # Number of top results to return
        vector=query_embedding,
        namespace="ns1"
    )
    return results

In [49]:
# Initialize Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "rag"
dimension = 768

existing_indexes = pc.list_indexes()
index_exists = any(index['name'] == index_name for index in existing_indexes)
if not index_exists:
    try:
        # Create a Pinecone index
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
        print(f"Index '{index_name}' created.")
    except PineconeApiException as e:
        if e.status == 409:
            print(f"Index '{index_name}' already exists.")
        else:
            raise

pinecone_index = pc.Index(index_name)


In [50]:
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

In [57]:
input_source = "reviews.json"
documents = load_data(input_source)

# Process the data and store it in Pinecone
index = process_and_store_data(documents, pinecone_index, genai)

Upserted count: 20


In [61]:
# Query the index
query = "Who is the best metaphysics professor?"
response = query_index(index, query, genai)
print(f'response: {response}')

response: {'matches': [{'id': 'default_id', 'score': 0.479686022, 'values': []}],
 'namespace': 'ns1',
 'usage': {'read_units': 5}}
