In [12]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [9]:
# Build the Pinecone client from the PINECONE_API_KEY environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index whose name is already taken raises an error, which made
# this cell fail on every re-run. Only create the index when it is missing so
# the notebook survives Restart Kernel -> Run All.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # length of every vector stored in this index
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [10]:
import json

# Load the review records. The context manager closes the file handle as soon
# as parsing finishes (the bare open() call left it open), and the explicit
# encoding avoids depending on the platform's default text encoding.
with open('reviews.json', encoding='utf-8') as reviews_file:
    data = json.load(reviews_file)

# Show the list stored under the 'reviews' key.
data['reviews']

[{'professor': 'Dr. John Smith',
  'subject': 'Computer Science',
  'stars': 4,
  'review': 'Great lecturer, very knowledgeable, but the assignments were tough.'},
 {'professor': 'Dr. Emily Carter',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Excellent professor! Made complex topics easy to understand.'},
 {'professor': 'Dr. Rachel Green',
  'subject': 'Psychology',
  'stars': 3,
  'review': 'Engaging but lectures could be more organized.'},
 {'professor': 'Dr. Michael Lee',
  'subject': 'Physics',
  'stars': 2,
  'review': 'Very difficult exams and unclear grading criteria.'},
 {'professor': 'Dr. Susan Harris',
  'subject': 'History',
  'stars': 5,
  'review': 'Passionate and knowledgeable about the subject. Highly recommend!'},
 {'professor': 'Dr. William Brown',
  'subject': 'Economics',
  'stars': 4,
  'review': 'Interesting lectures but too fast-paced at times.'},
 {'professor': 'Dr. Sophia Miller',
  'subject': 'Biology',
  'stars': 3,
  'review': 'Good content, but so

In [16]:
# Build one Pinecone-ready record per review: the embedding of the review text
# as 'values', the professor name as 'id', and the remaining fields as metadata.
# NOTE(review): ids are professor names, so two reviews for the same professor
# would share an id -- confirm that names are unique in reviews.json.
EMBED_BATCH_SIZE = 100  # reviews sent per embeddings request

processed_data = []
client = OpenAI()
reviews = data['reviews']

# The embeddings endpoint accepts a list of inputs, so send the reviews in
# batches rather than issuing one network round trip per review.
for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review['review'] for review in batch],
        model="text-embedding-3-small",
    )
    # Each returned item carries the position of its input; order by it so
    # every embedding is paired with the review it was computed from.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, ordered_items):
        processed_data.append({
            'values': item.embedding,
            'id': review['professor'],
            'metadata': {
                'review': review['review'],
                'subject': review['subject'],
                'stars': review['stars'],
            },
        })

In [17]:
# Display the first prepared record (keys: 'values', 'id', 'metadata') as a
# quick sanity check before uploading.
processed_data[0]

{'values': [-0.024824029,
  -0.019898213,
  -0.025603842,
  -0.026643591,
  0.01326331,
  -0.0056861322,
  -0.015375302,
  -0.013022868,
  0.01442653,
  0.021626798,
  -0.013425771,
  0.012041604,
  -0.040394284,
  -0.041226085,
  0.020756008,
  0.008590934,
  0.0127239395,
  -0.04273372,
  -0.022835508,
  0.04814042,
  0.0046658777,
  -0.014257571,
  0.019391336,
  0.0016189232,
  -0.042473786,
  -0.052533366,
  0.047880486,
  -0.01625909,
  0.02794328,
  0.031374454,
  0.05437892,
  -0.011164315,
  0.011970121,
  0.019937204,
  -0.048790265,
  0.06680393,
  0.016324073,
  0.05562662,
  0.02614971,
  0.022133676,
  0.014959402,
  0.012145579,
  -0.04980402,
  0.0054684347,
  0.011554221,
  -0.03210228,
  -0.026045736,
  -0.027293436,
  0.047906477,
  0.03607932,
  -0.019170389,
  0.053339172,
  0.06747977,
  0.046840735,
  -0.026097722,
  -0.02486302,
  -0.028931042,
  0.02711148,
  -0.0023589327,
  -0.015661234,
  0.02403122,
  -0.0043344577,
  0.03334998,
  -0.019482315,
  -0.036495

In [19]:
# Open a handle to the 'rag' index, then write every prepared record into the
# 'ns1' namespace. The upsert response is the cell's displayed value.
index = pc.Index('rag')
index.upsert(namespace='ns1', vectors=processed_data)

{'upserted_count': 20}

In [20]:
# Request the index statistics to check what the index contains after the upsert.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}