In [15]:
from dotenv import load_dotenv
load_dotenv()
import os
import google.generativeai as genai
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

In [16]:
# Connect to Pinecone. Indexing os.environ (instead of os.getenv) fails fast with a
# KeyError naming the missing variable, the same way the Gemini key is read below.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# create_index() errors out when the index already exists, which would break
# re-running this cell (or Restart & Run All). Only create "rag" when it is absent.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=768,  # must equal the length of the embedding vectors stored in the index
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

In [19]:
import json

# Load the review dataset. The context manager closes the file handle as soon as
# parsing finishes (the bare open() call left it open), and the explicit UTF-8
# encoding keeps decoding independent of the platform's default locale.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records (professor, subject, stars, review).
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Biology',
  'stars': 4,
  'review': "Dr. Johnson's lectures are engaging and informative. She always makes time for student questions."},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': "Brilliant instructor! Prof. Chen's projects are challenging but incredibly rewarding."},
 {'professor': 'Dr. Sarah Williams',
  'subject': 'Psychology',
  'stars': 3,
  'review': 'Dr. Williams knows her subject well, but her assignments can be unclear at times.'},
 {'professor': 'Prof. David Martinez',
  'subject': 'History',
  'stars': 4,
  'review': 'Prof. Martinez brings history to life with his storytelling approach. Very enjoyable classes.'},
 {'professor': 'Dr. Rachel Thompson',
  'subject': 'Chemistry',
  'stars': 5,
  'review': "Dr. Thompson's lab sessions are extremely well-organized. She explains complex concepts clearly."},
 {'professor': 'Prof. James Wilson',
  'subject': 'Mathematics',
  'star

In [20]:
# Authenticate the Gemini SDK; raises KeyError if GEMINI_API_KEY is not set.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Build one upsert-ready record per review: the embedding of the review text as
# "values", the professor name as "id", and the remaining fields as "metadata".
# NOTE(review): because the professor name is the id, two reviews for the same
# professor would share an id — confirm the dataset has one review per professor.
processed_data = []
for review in data['reviews']:
    result = genai.embed_content(
        content=review['review'],
        model="models/text-embedding-004",  # Adjust model as needed
        title="Embedding of review",
        task_type="retrieval_document",
    )
    embedding = result['embedding']
    # Copy the fields that are stored alongside the vector, in a fixed order.
    metadata = {field: review[field] for field in ("review", "subject", "stars")}
    processed_data.append(
        dict(values=embedding, id=review['professor'], metadata=metadata)
    )

In [21]:
# Spot-check the first processed record: its embedding values, id, and metadata.
processed_data[0]

{'values': [0.023967255,
  -0.017527107,
  -0.01971013,
  -0.017732542,
  0.03269349,
  0.00818248,
  0.015264568,
  0.07518172,
  -0.040246945,
  0.033064548,
  0.032380037,
  0.040824562,
  0.04816548,
  0.0076627093,
  -0.043378446,
  -0.07608679,
  0.026006622,
  -0.0120255975,
  -0.09139815,
  0.044137195,
  -0.006055754,
  -0.063582055,
  0.04451715,
  -0.063695624,
  0.021133251,
  -0.05209923,
  0.03185734,
  -0.03351962,
  0.048769087,
  -0.043620285,
  0.055412307,
  0.02813075,
  0.029703928,
  -0.042008534,
  -0.038064405,
  0.05785566,
  0.00037700287,
  -0.0009388646,
  0.027597953,
  -0.0507534,
  -0.05059085,
  0.031100258,
  0.025353706,
  0.029887978,
  -0.08338094,
  -0.011002258,
  -0.011040136,
  0.113149546,
  0.014815578,
  0.083284356,
  0.05889687,
  0.04941362,
  -0.06851649,
  0.05227937,
  -0.017964613,
  -0.057612922,
  -0.017648775,
  -0.0390051,
  0.010511926,
  -0.008399615,
  0.012497429,
  -0.015961207,
  -0.022175422,
  -0.075219624,
  0.10532077,
  -

In [22]:
# Open a handle to the "rag" index, then write every processed record into the
# "ns1" namespace. The upsert call stays last so its response is displayed.
index = pc.Index('rag')
index.upsert(namespace="ns1", vectors=processed_data)

upserted_count: 20

In [23]:
# Show index statistics (dimension, per-namespace and total vector counts) to confirm the upsert.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}