In [None]:
from pymongo import MongoClient
from openai import OpenAI
import pandas as pd
import os
from dotenv import load_dotenv
from pinecone import Pinecone
import uuid

In [None]:
# Load environment variables from ../config.env before any API client is built.
load_dotenv("../config.env",verbose=True)
# No API key is passed explicitly here; presumably the Pinecone client reads it
# from the environment populated above (e.g. PINECONE_API_KEY) -- TODO confirm.
pc = Pinecone()

# Name of the Pinecone index this notebook writes to.
index_name = "lore"

# Handle to that index; handed to ingest_data_to_pinecone in the final cell.
index = pc.Index(index_name)

In [None]:
def get_embeddings(text, client=None, model="text-embedding-3-small"):
    """Return the embedding vector for a single piece of text.

    Args:
        text: The string to embed. It is sent as a one-element batch.
        client: Optional object exposing ``embeddings.create(input=..., model=...)``.
            When omitted, an ``OpenAI`` client is built once from the
            ``OPENAI_KEY`` environment variable and reused on later calls, so
            embedding many rows does not construct a new client per row.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.

    Raises:
        KeyError: If no ``client`` is given and ``OPENAI_KEY`` is not set.
    """
    if client is None:
        # Cache the default client on the function object. Re-running the
        # defining cell creates a fresh function and therefore a fresh client.
        client = getattr(get_embeddings, "_client", None)
        if client is None:
            client = OpenAI(api_key=os.environ['OPENAI_KEY'])
            get_embeddings._client = client

    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

In [None]:
# Read the source rows; the file uses ">" as its field delimiter.
csv = pd.read_csv("data.csv", sep=">")

# Dump every row for a visual check: one field per line, blank line after each row.
for title, url, chunk_id, text in zip(csv["title"], csv["url"], csv["chunk_id"], csv["text"]):
    print(title, url, chunk_id, text, "", sep="\n")

In [None]:
def ingest_data_to_pinecone(dataframe, pinecone_index, batch_size=100, embed_fn=None):
    """Embed every row of ``dataframe`` and upsert the vectors in batches.

    Each row must provide ``title``, ``url``, ``text`` and ``chunk_id``. The
    ``text`` value is embedded, and all four fields are stored as the vector's
    metadata.

    Args:
        dataframe: pandas DataFrame with the four columns listed above.
        pinecone_index: Object exposing ``upsert(vectors=[...])``.
        batch_size: Maximum number of vectors sent per ``upsert`` call.
        embed_fn: Optional callable mapping a text to its embedding. Defaults
            to the notebook's ``get_embeddings``.

    Returns:
        The total number of vectors uploaded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A non-positive size would silently degrade to one request per row.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if embed_fn is None:
        embed_fn = get_embeddings

    def _flush(batch):
        # Send one batch and report how many vectors it actually held.
        pinecone_index.upsert(vectors=batch)
        print(f"Uploaded batch of {len(batch)} vectors to Pinecone.")
        return len(batch)

    uploaded = 0
    vectors = []
    for _, row in dataframe.iterrows():
        text = row['text']
        metadata = {
            'title': row['title'],
            'url': row['url'],
            'text': text,
            'chunk_id': row['chunk_id'],
        }
        # NOTE: ids are random, so running the ingestion again generates new ids
        # for the same rows rather than reusing the earlier ones.
        vectors.append((str(uuid.uuid4()), embed_fn(text), metadata))

        if len(vectors) >= batch_size:
            uploaded += _flush(vectors)
            vectors = []

    # Upload whatever is left over after the last full batch.
    if vectors:
        uploaded += _flush(vectors)

    return uploaded

In [None]:
# Empty frame with the ingestion columns plus "embedding". It is not referenced
# by any other visible cell -- presumably a leftover; confirm before removing.
vectorized_csv = pd.DataFrame(columns=["title", "url", "text", "chunk_id", "embedding"])

# Embed every row of the loaded CSV and upsert it into the Pinecone index.
ingest_data_to_pinecone(csv, index)