In [7]:
import os
import time

import requests
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

load_dotenv()

In [29]:
# Initialize Pinecone and (re)create the index from scratch.
INDEX_NAME = "rag"
INDEX_DIMENSION = 512  # Must match the length of the vectors upserted later
INDEX_READY_TIMEOUT_SECONDS = 60

pinecone_api_key = os.getenv("PINECONE_API_KEY")
# Fail fast with a clear message, in the same style as the GOOGLE_API_KEY check
if not pinecone_api_key:
    raise ValueError("Pinecone API Key not found. Please check your .env file.")

pc = Pinecone(api_key=pinecone_api_key)

# Delete the existing index only if it exists: an unconditional delete_index()
# raises when the index is missing, so the cell would fail on its first run.
if INDEX_NAME in pc.list_indexes().names():
    pc.delete_index(INDEX_NAME)

# Create a new Pinecone index with dimension 512
pc.create_index(
    name=INDEX_NAME,
    dimension=INDEX_DIMENSION,  # Set the dimension to match your embeddings
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

# Block until the new index reports ready so that later cells can upsert
# into it; the wait is bounded so a stuck index cannot hang the notebook.
ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_SECONDS
while not pc.describe_index(INDEX_NAME).status["ready"]:
    if time.monotonic() >= ready_deadline:
        raise TimeoutError(
            f"Index '{INDEX_NAME}' was not ready after {INDEX_READY_TIMEOUT_SECONDS} seconds."
        )
    time.sleep(1)


In [3]:
import json

# Load the review data. The context manager closes the file handle as soon as
# parsing finishes, and the explicit encoding avoids depending on the
# platform's default text encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Last expression of the cell: display the list of reviews
data['reviews']

[{'professor': 'Dr. Emily Clark',
  'review': 'Engaging lectures and clear explanations. Highly recommended!',
  'subject': 'Biology',
  'stars': 5},
 {'professor': 'Prof. Michael Chen',
  'review': 'Challenging course, but fair grading. Office hours were helpful.',
  'subject': 'Physics',
  'stars': 4},
 {'professor': 'Dr. Sarah Johnson',
  'review': 'Interesting content, but lectures could be more organized.',
  'subject': 'Psychology',
  'stars': 3},
 {'professor': 'Prof. David Thompson',
  'review': "Tough grader, but you'll learn a lot if you put in the effort.",
  'subject': 'Mathematics',
  'stars': 4},
 {'professor': 'Dr. Lisa Patel',
  'review': 'Excellent at relating course material to real-world scenarios.',
  'subject': 'Business',
  'stars': 5},
 {'professor': 'Prof. Robert Wilson',
  'review': 'Knowledgeable, but not very approachable. Assignments were unclear.',
  'subject': 'Computer Science',
  'stars': 2},
 {'professor': 'Dr. Maria Rodriguez',
  'review': 'Passionate 

In [21]:
# Fetch the API key from the environment variables
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Check if the key was loaded correctly
if not GOOGLE_API_KEY:
    raise ValueError("Google API Key not found. Please check your .env file.")

# Define the API endpoint URL.
# NOTE(review): despite the variable name, the `generateContent` method returns
# generated text, not an embedding vector -- confirm this is the intended endpoint.
embedding_api_url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent?key={GOOGLE_API_KEY}"

# Upper bound (seconds) for each HTTP call; without it a stalled connection
# would block the cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 30


def _first_candidate_text(payload):
    """Return the text of the first part of the first candidate, or '' if absent."""
    candidates = payload.get('candidates', [])
    if not candidates:
        return ''
    content_parts = candidates[0].get('content', {}).get('parts', [])
    if not content_parts:
        return ''
    return content_parts[0].get('text', '')


# Initialize an empty list to hold the processed data
processed_data = []

# Create content for each review
for review in data["reviews"]:
    try:
        response = requests.post(
            embedding_api_url,
            json={
                "contents": [
                    {
                        "parts": [
                            {"text": review['review']}  # Pass the review text here
                        ]
                    }
                ]
            },
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException as exc:
        # Report only the exception type: the exception message can include
        # the request URL, and the URL carries the API key.
        print(f"Request failed for {review['professor']}: {type(exc).__name__}")
        continue

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        continue

    # Extract the generated content from the response
    generated_content = _first_candidate_text(response.json())
    if not generated_content:
        # Make dropped reviews visible instead of skipping them silently
        print(f"No generated content returned for {review['professor']}; skipping.")
        continue

    processed_data.append(
        {
            "values": generated_content,  # The generated content
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            }
        }
    )

# processed_data now holds one record per review that returned generated content.

In [22]:
# Inspect the first processed record to check what was stored under "values"
processed_data[0]

{'values': "Thank you for the kind words! I'm glad you found the lectures engaging and the explanations clear.  \n\nI'm always looking for ways to improve, so would you mind sharing a bit more about what made the lectures particularly engaging and clear for you? \n\nFor example:\n\n* What was the topic of the lectures?\n* What specific aspects of the lectures did you find engaging?\n* What made the explanations clear and easy to understand? \n\nYour feedback helps me understand what resonates with audiences and how I can continue to deliver high-quality learning experiences. 😊 \n",
 'id': 'Dr. Emily Clark',
 'metadata': {'review': 'Engaging lectures and clear explanations. Highly recommended!',
  'subject': 'Biology',
  'stars': 5}}

In [31]:
def convert_to_embedding(text, dimension=512):
    """Return a placeholder embedding vector for ``text``.

    This is a stub: ``text`` is currently ignored and every call returns the
    same constant vector. Replace the body with your actual embedding logic
    (for example, the output of an embedding model).

    Args:
        text: The content to embed (unused by the placeholder).
        dimension: Length of the returned vector. Defaults to 512.

    Returns:
        A new list of ``dimension`` floats, each equal to 0.1 (non-zero).

    Raises:
        ValueError: If ``dimension`` is less than 1.
    """
    if dimension < 1:
        raise ValueError("dimension must be a positive integer")
    return [0.1] * dimension  # Example of a non-zero embedding vector

# Process the data and prepare it for upserting to Pinecone.
# Each record is copied rather than overwriting item["values"] in
# processed_data, so re-running this cell always starts from the original
# generated text and processed_data stays inspectable.
valid_vectors = []
for item in processed_data:
    embedding = convert_to_embedding(item["values"])  # Convert the content to embedding

    # Ensure the embedding is not a zero vector
    if any(embedding):  # This checks if there's at least one non-zero value
        valid_vectors.append({**item, "values": embedding})

# Insert the valid embeddings into the Pinecone index
index = pc.Index("rag")
upsert_response = index.upsert(
    vectors=valid_vectors,
    namespace="ns1",
)
print(f"Upserted count: {upsert_response['upserted_count']}")

# Index statistics can lag behind a successful upsert (reading them right
# away reported 0 vectors), so poll briefly until the count catches up.
# The wait is bounded: after the deadline the latest stats are printed as-is.
STATS_TIMEOUT_SECONDS = 30
expected_count = len(valid_vectors)
stats_deadline = time.monotonic() + STATS_TIMEOUT_SECONDS
index_stats = index.describe_index_stats()
while index_stats["total_vector_count"] < expected_count and time.monotonic() < stats_deadline:
    time.sleep(1)
    index_stats = index.describe_index_stats()

# Print index statistics
print(index_stats)


Upserted count: 20
{'dimension': 512,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}
