In [1]:
# Load variables from a .env file into the environment first; the cells below read
# PINECONE_API_KEY and GEMINI_API_KEY through os.getenv.
from dotenv import load_dotenv
load_dotenv()
import os
# NOTE(review): OpenAI is imported but not used in the cells visible here.
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec


  from tqdm.autonotebook import tqdm


In [2]:
# Pinecone client, authenticated with the key read from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only if it is not there yet. An unconditional create_index
# call fails once the index exists, which breaks re-running this cell and
# Restart & Run All.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        # Must equal the length of the embedding vectors upserted into this index.
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [3]:
import json

# Read the reviews file inside a context manager so the file handle is closed as
# soon as parsing finishes, instead of being left open until garbage collection.
with open('reviews.json') as reviews_file:
    data = json.load(reviews_file)

# Last expression of the cell: display the list stored under the 'reviews' key.
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Dr. Johnson, who teaches Mathematics, explains complex concepts clearly. Her classes are engaging and well-structured. She is an expert in algebraic topology.'},
 {'professor': 'Prof. Robert Smith',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Prof. Smith, a Physics professor, is very knowledgeable, especially in quantum mechanics. His lectures can be fast-paced but are rich in content.'},
 {'professor': 'Dr. Alice Turner',
  'subject': 'Chemistry',
  'stars': 3,
  'review': 'Dr. Turner teaches Chemistry and specializes in organic chemistry. While she knows her material well, her lectures can be a bit dry. However, her labs are well-organized.'},
 {'professor': 'Prof. Michael Brown',
  'subject': 'History',
  'stars': 2,
  'review': 'Prof. Brown, who teaches History, focuses on medieval European history. Unfortunately, his lectures are mostly reading from slides, which makes them less engag

In [4]:

import google.generativeai as genai
import os

# Configure the Gemini client once, up front. The API key is the same for every
# review, so reconfiguring inside the loop was redundant work.
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# Build one record per review in the shape the upsert cell passes to Pinecone:
# an id, the embedding values, and the original fields as metadata.
processed_data = []
for review in data['reviews']:
    # Embed the review text only; subject and stars travel as metadata below.
    result = genai.embed_content(
        model="models/text-embedding-004",
        content=review['review'],
        task_type="retrieval_document",
        title="Embedding of single string")
    embeddings = result['embedding']

    processed_data.append({
        "values": embeddings,
        # The professor name doubles as the vector id.
        # NOTE(review): assumes professor names are unique in reviews.json - confirm.
        "id": review['professor'],
        "metadata":{
            'review': review['review'],
            'subject': review['subject'],
            'stars': review['stars'],
        }
    })

In [5]:
# Sanity check: display the first record built by the embedding loop
# (keys: 'values', 'id', 'metadata').
processed_data[0]

{'values': [0.014987577,
  0.010248563,
  -0.054795437,
  -0.011506912,
  0.021747686,
  0.024354875,
  0.031893298,
  0.046343286,
  -0.017709557,
  0.04489177,
  0.040833518,
  0.028754443,
  0.066278115,
  -0.0032015943,
  -0.01199413,
  -0.07370988,
  0.037822057,
  -0.016940836,
  -0.1180251,
  0.045933224,
  -0.00884572,
  -0.037325613,
  0.056964338,
  -0.044190813,
  0.0032162368,
  -0.0052279965,
  0.025560265,
  -0.038012803,
  0.002645363,
  -0.053312503,
  0.054908514,
  0.046713185,
  0.009113188,
  -0.050317783,
  -0.035191696,
  0.030627575,
  -0.008410363,
  -0.014292914,
  0.053475797,
  -0.03729077,
  -0.049446348,
  0.0126630515,
  -0.006573284,
  -0.0049832487,
  -0.07145308,
  -0.04139497,
  -0.011174144,
  0.054158233,
  -0.017003654,
  0.07690859,
  0.0674349,
  0.06097359,
  -0.050201606,
  0.03655783,
  -0.005595378,
  -0.045912836,
  -0.05403575,
  -0.031915445,
  0.022505796,
  -0.035772562,
  -0.0018231908,
  -0.022057164,
  -0.033374563,
  -0.068558834,
  0

In [6]:
# Get a handle on the "rag" index, then write every embedded review into
# namespace "ns1". The upsert call stays the last expression so that its
# response is shown as the cell output.
index = pc.Index('rag')
index.upsert(vectors=processed_data, namespace='ns1')

{'upserted_count': 30}

In [7]:
# Display the index statistics to confirm the upsert landed in the expected namespace.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 30}},
 'total_vector_count': 30}