In [None]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so that
# the os.getenv(...) lookups in the following cells can find the API keys.
load_dotenv()

In [None]:
import pinecone

# Configure the Pinecone client. The API key is taken from the PINECONEKEY
# environment variable (None is passed through when it is not set).
pinecone.init(
    environment="us-east1-gcp",
    api_key=os.environ.get('PINECONEKEY'),
)

# Handle to the "chatrpi" index used by the upsert cell further down.
index = pinecone.Index("chatrpi")

In [None]:
import openai

# Authenticate the OpenAI client with the key stored in the OPENAIKEY
# environment variable (None when the variable is not set).
openai.api_key = os.environ.get('OPENAIKEY')

In [None]:
# Name of the OpenAI embedding model used throughout the notebook.
MODEL = "text-embedding-ada-002"

# Smoke test: embed two short strings in a single request and display the
# 'data' field of the response.
res = openai.Embedding.create(
    engine=MODEL,
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch",
    ],
)
res['data']

In [None]:
# Length of the embedding: number of entries in the first vector of the
# sample response held in `res`.
len(res['data'][0]['embedding'])

In [None]:
def _chunk_text(text, chunk_size=4, overlap=1, min_chars=20):
    """Split one document into the text entries that go into the dataset.

    The text is split on blank lines into paragraphs; newlines and tabs inside
    a paragraph become spaces, surrounding spaces are stripped, and paragraphs
    of `min_chars` characters or fewer are dropped.

    Args:
        text: full contents of one source file.
        chunk_size: number of new paragraphs that starts each chunk.
        overlap: number of paragraphs from the previous chunk that are
            repeated at the start of the next one.
        min_chars: paragraphs must be longer than this to be kept.

    Returns:
        A list of strings:
        * more than `chunk_size` paragraphs kept -> one space-joined chunk per
          `chunk_size` paragraphs, each (except the first) prefixed with the
          last `overlap` paragraph(s) of the previous chunk;
        * at least one paragraph kept -> the whole original `text`, unmodified;
        * nothing kept -> an empty list.
    """
    paragraphs = [
        p.replace("\n", " ").replace("\t", " ").strip(" ")
        for p in text.split("\n\n")
    ]
    # drop empty / very short paragraphs
    paragraphs = [p for p in paragraphs if len(p) > min_chars]

    if len(paragraphs) > chunk_size:
        return [
            " ".join(paragraphs[max(0, start - overlap):start + chunk_size])
            for start in range(0, len(paragraphs), chunk_size)
        ]
    if paragraphs:
        # Small file: keep it whole. NOTE(review): this is the raw text, so it
        # still contains the newlines/tabs that the chunked entries have
        # normalised away -- confirm that is intended.
        return [text]
    return []


path = "/home/player1/Desktop/RPI/ChatRPI/text/"
# os.listdir returns names in arbitrary order. The upsert cell uses each
# entry's position in `dataset` as its Pinecone ID, so sort the names to keep
# IDs stable from one run to the next.
files = sorted(os.listdir(path))

dataset = []
# Iterate over all the files in the directory and add their chunks (or, for
# small files, the whole file) to the dataset.
for file in files:
    file_path = os.path.join(path, file)
    if not os.path.isfile(file_path):
        # skip sub-directories and other non-regular entries instead of crashing
        continue
    # explicit encoding so the result does not depend on the platform default
    with open(file_path, 'r', encoding="utf-8") as f:
        text = f.read()
    dataset.extend(_chunk_text(text))
    


In [None]:
# Create an embedding for every dataset entry with the OpenAI model named by
# MODEL and upsert the vectors, batch by batch, into the Pinecone index.
from tqdm.auto import tqdm
import time

batch_size = 32  # process everything in batches of 32
for i in tqdm(range(0, len(dataset), batch_size)):
    # set end position of batch (the last batch may be shorter)
    i_end = min(i + batch_size, len(dataset))
    # texts of this batch; their positions in `dataset` double as vector IDs
    lines_batch = dataset[i:i_end]
    ids_batch = [str(n) for n in range(i, i_end)]
    # create embeddings
    try:
        res = openai.Embedding.create(input=lines_batch, engine=MODEL)
    except openai.error.InvalidRequestError as err:
        # The request itself is what can be rejected, so it must sit inside
        # the try. Skip the batch: falling through would upsert the previous
        # batch's vectors under this batch's IDs (or raise NameError on the
        # very first batch).
        print(f"invalid request error for records {i}-{i_end - 1}, skipping batch: {err}")
        continue
    embeds = [record['embedding'] for record in res['data']]
    # prep metadata and upsert batch
    meta = [{'text': line} for line in lines_batch]
    to_upsert = list(zip(ids_batch, embeds, meta))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)
    # pause between batches -- presumably to stay under API rate limits
    time.sleep(1)