In [None]:
import os

from datasets import load_dataset

# Directory containing the raw text files. The original machine-specific
# location stays the default, but it can be overridden by setting
# CHATRPI_TEXT_DIR in the process environment, so the notebook also runs on
# machines where that path does not exist.
TEXT_DATA_DIR = os.environ.get("CHATRPI_TEXT_DIR", "/home/player1/Desktop/RPI/ChatRPI/text/")

# Load the files under TEXT_DATA_DIR with the "text" builder and take the
# "train" split.
dataset = load_dataset("text", data_dir=TEXT_DATA_DIR, split="train")

In [None]:
import os
from dotenv import load_dotenv

# Read key/value pairs from a .env file (when one is found) into the process
# environment, so the os.getenv(...) lookups for the API keys can see them.
load_dotenv()

In [None]:
import pinecone

# Configure the Pinecone client for the us-west1-gcp environment; the API key
# is looked up in the environment under PINECONEKEY.
pinecone.init(environment="us-west1-gcp", api_key=os.getenv('PINECONEKEY'))

# Handle to the "chat-rpi" index.
index = pinecone.Index("chat-rpi")

In [None]:
import openai

# Hand the OpenAI key to the client module; it is taken from the OPENAIKEY
# environment variable and is None when that variable is unset.
openai.api_key = os.environ.get('OPENAIKEY')

In [None]:
# Name of the embedding model used for every embedding request in this notebook.
MODEL = "text-embedding-ada-002"

# Try the embedding endpoint on two short strings sent as a single batch.
sample_texts = [
    "Sample document text goes here",
    "there will be several phrases in each batch",
]
res = openai.Embedding.create(input=sample_texts, engine=MODEL)

# Show the per-input entries of the response.
res['data']

In [None]:
# Number of components in the first embedding of the sample response.
first_embedding = res['data'][0]['embedding']
len(first_embedding)

In [None]:
# make a new dataset where each index is one sentence from the original dataset

def make_dataset(dataset):
    """Split every record's text into sentence-level records.

    Each item of ``dataset`` must support ``item["text"]`` and hold a string.
    The text is split on "." (a plain split, so abbreviations and decimal
    numbers are cut as well). Every fragment has its surrounding whitespace
    removed, and fragments that end up empty are skipped. Without the
    stripping, each sentence after the first in a record would start with a
    space, and a trailing "." would produce an empty record.

    Args:
        dataset: iterable of mapping-like records with a "text" key.

    Returns:
        list of ``{"text": sentence}`` dicts, in the original order.
    """
    new_dataset = []
    for record in dataset:
        for fragment in record["text"].split("."):
            sentence = fragment.strip()
            if sentence:
                new_dataset.append({"text": sentence})
    return new_dataset

# Build the sentence-level record list from the loaded dataset.
new_dataset = make_dataset(dataset)


In [None]:


# Drop records that carry too little text to be worth embedding:
#   * empty strings, and
#   * anything with 7 or fewer words (the comparison is a strict `> 7`).
# Words are counted with str.split() without an argument, which splits on any
# run of whitespace. split(" ") would count the empty strings produced by
# leading or repeated spaces as words and make the cut-off inconsistent.
SHORT_SENTENCE_MAX_WORDS = 7  # records with this many words or fewer are removed
new_dataset = [
    record
    for record in new_dataset
    if record["text"] != "" and len(record["text"].split()) > SHORT_SENTENCE_MAX_WORDS
]

len(new_dataset)



In [None]:
# Embed every record of new_dataset with the OpenAI model named by MODEL and
# upsert the vectors, together with their source text, into the Pinecone index.
from tqdm.auto import tqdm

batch_size = 32  # process everything in batches of 32
failed_batches = []  # (start, end) index ranges whose embedding request failed
for i in tqdm(range(0, len(new_dataset), batch_size)):
    # set end position of batch
    i_end = min(i + batch_size, len(new_dataset))
    # texts of this batch; the IDs are the records' positions in new_dataset,
    # so re-running the cell writes to the same IDs again
    lines_batch = [record['text'] for record in new_dataset[i:i_end]]
    ids_batch = [str(n) for n in range(i, i_end)]
    # create embeddings; the request is the step that can fail, so it lives
    # inside the try together with the response parsing
    try:
        res = openai.Embedding.create(input=lines_batch, engine=MODEL)
        embeds = [record['embedding'] for record in res['data']]
    except Exception as err:
        # Skip this batch instead of falling through: otherwise `embeds` would
        # be undefined (first batch) or still hold the previous batch's
        # vectors, which would then be stored under this batch's IDs.
        print(f"embedding request failed for records {i}-{i_end - 1}: {err!r}")
        failed_batches.append((i, i_end))
        continue
    # prep metadata and upsert batch
    meta = [{'text': line} for line in lines_batch]
    to_upsert = list(zip(ids_batch, embeds, meta))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)

# Report what was skipped so those ranges can be retried.
if failed_batches:
    print(f"{len(failed_batches)} batch(es) were not upserted: {failed_batches}")