### use this cell for `installations` of the relevant versions

In [9]:
!pip install -qU \
    openai==0.27.7 \
    pandas==2.0.3\
    datasets==2.12.0 \
    pinecone-client==3.2.2 \
    pinecone-datasets==0.7.0 \
    pinecone-notebooks==0.1.1\
    tqdm


### OpenAI GPT-3.5-turbo-instruct Text Completion Script and setting up openAI API

In [2]:
import os
import openai
# get API key from top-right dropdown on OpenAI website
openai.api_key = os.getenv("OPENAI_API_KEY") or "sk-proj-cns0EYsVd8MwGmNqImmMT3BlbkFJ2nrjtYwMUPXe2lZXAngA"

openai.Engine.list()

def complete(prompt):
    # query text-davinci-003
    res = openai.Completion.create(
        engine='gpt-3.5-turbo-instruct',
        prompt=prompt,
        temperature=0,
        max_tokens=400,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )
    return res['choices'][0]['text'].strip()

### Loading, splitting, and reading data from out text file using nltk and then using 'text-embedding-ada-002' for embedding

In [3]:
import os
from tqdm.auto import tqdm
import nltk
import openai
from time import sleep

# Ensure you have the necessary NLTK data files
nltk.download('punkt')

# Set your OpenAI API key
openai.api_key = 'sk-proj-cns0EYsVd8MwGmNqImmMT3BlbkFJ2nrjtYwMUPXe2lZXAngA'

# Function to read text from a file and split it into sentences
def load_text_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    sentences = nltk.sent_tokenize(text)
    return sentences

# Function to create embeddings for a batch of sentences
def create_embeddings(sentences, embed_model='text-embedding-ada-002', batch_size=10):
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        try:
            res = openai.Embedding.create(input=batch, engine=embed_model)
            embeddings.extend(res['data'])
        except Exception as e:
            print(f"Error creating embeddings for batch {i//batch_size + 1}: {e}")
            # Handle retries or wait times here if needed
            sleep(5)  # Example: retry after 5 seconds
            continue
    return embeddings

# Path to your text file
file_path = 'text.txt'

# Load text data and split into sentences
sentences = load_text_file(file_path)

# Example batch processing and embedding creation
batch_size = 10
embeddings = create_embeddings(sentences, batch_size=batch_size)

# Example: Print embeddings for the first few batches
for i, embedding in enumerate(embeddings[:10]):
    print(f"Embedding {i+1}: {embedding}")

# Placeholder for new merged data (your original processing logic)
new_data = []

window = 20  # number of sentences to combine
stride = 4   # number of sentences to 'stride' over

for i in tqdm(range(0, len(sentences), stride)):
    i_end = min(len(sentences)-1, i+window)
    text = ' '.join(sentences[i:i_end])
    new_data.append({
        'start': i,
        'end': i_end,
        'text': text,
        'id': i,  # Placeholder ID
        'url': 'N/A',
        'published': 'N/A',
        'channel_id': 'N/A'
    })

# Example: Print the first few entries of the new dataset
for entry in new_data[:5]:
    print(entry)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mariyamohiuddin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Embedding 1: {
  "embedding": [
    -0.019710436463356018,
    -0.029578570276498795,
    0.007504431996494532,
    -0.03329849615693092,
    -0.011172691360116005,
    0.020847080275416374,
    0.011301855556666851,
    -0.0009388612816110253,
    -0.019465025514364243,
    0.01039124932140112,
    0.014983031898736954,
    0.0023685460910201073,
    0.005931859835982323,
    -0.015590102411806583,
    0.007969423197209835,
    -0.037147585302591324,
    0.02220330201089382,
    -0.03345349431037903,
    0.017049657180905342,
    -0.055282220244407654,
    0.010061880573630333,
    -0.006309664808213711,
    -0.0026575506199151278,
    0.007052357774227858,
    0.007665887475013733,
    0.0014490593457594514,
    0.003923358395695686,
    -0.001722725690342486,
    0.006994233932346106,
    -0.03443513810634613,
    0.030870212242007256,
    0.015241359360516071,
    0.0007471333956345916,
    -0.006826320663094521,
    -0.003642426570877433,
    -0.02305578626692295,
    -0.003225872

  0%|          | 0/55 [00:00<?, ?it/s]

{'start': 0, 'end': 20, 'text': 'Hey, are you free to catch up later today? Yeah, I think so. What time were you thinking? Maybe around five? I just have a couple of things to finish up before then. Five works for me. Where do you want to meet? How about that new coffee shop on Main Street? I heard they have great pastries. Sounds good to me. I’ve been wanting to try their cappuccino. Nice. So, how’s work been? Oh, you know, busy as always. We’re in the middle of a big project right now. Lots of late nights. That sounds intense. What’s the project about? It’s a new app we’re developing for managing team tasks. It’s supposed to help streamline communication and project management.', 'id': 0, 'url': 'N/A', 'published': 'N/A', 'channel_id': 'N/A'}
{'start': 4, 'end': 24, 'text': 'I just have a couple of things to finish up before then. Five works for me. Where do you want to meet? How about that new coffee shop on Main Street? I heard they have great pastries. Sounds good to me. I’ve been

### Setting up Pinecone API and environment 

In [12]:
import os
from pinecone import Pinecone
from pinecone import ServerlessSpec

use_serverless = True

# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = os.environ.get('PINECONE_API_KEY') or '1fdff541-89da-47de-8a43-71d45f8afd2d'
pc = Pinecone(api_key=api_key)

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)

index_name = "memoro"

### Pinecone Index Initialization and Connection Script

In [14]:
import time

# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=len(res['data'][0]['embedding']),
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 55}},
 'total_vector_count': 55}

### Batch Processing and Upserting Embeddings into Pinecone

In [16]:
from tqdm.auto import tqdm
from time import sleep
import openai

# Set OpenAI API key
openai.api_key = 'sk-proj-cns0EYsVd8MwGmNqImmMT3BlbkFJ2nrjtYwMUPXe2lZXAngA'

# Parameters
batch_size = 100  # Number of embeddings to create and insert at once
embed_model = 'text-embedding-ada-002'  # OpenAI embedding model

# Iterate over new_data in batches and create embeddings
for i in tqdm(range(0, len(new_data), batch_size)):
    # Determine end of current batch
    i_end = min(len(new_data), i + batch_size)
    meta_batch = new_data[i:i_end]
    
    # Extract IDs and texts from meta_batch
    ids_batch = [str(x['id']) for x in meta_batch]  # Ensure id is converted to string
    texts = [x['text'] for x in meta_batch]
    
    # Create embeddings (with retry logic)
    done = False
    while not done:
        try:
            res = openai.Embedding.create(input=texts, engine=embed_model)
            done = True
        except Exception as e:
            print(f"Error creating embeddings for batch {i // batch_size + 1}: {e}")
            sleep(5)  # Wait before retrying
            continue
    
    # Extract embeddings and prepare metadata for upsert
    embeds = [record['embedding'] for record in res['data']]
    meta_batch = [{
        'start': x['start'],
        'end': x['end'],
        'title': x.get('title', 'N/A'),  # Replace with actual metadata fields if available
        'text': x['text'],
        'url': x.get('url', 'N/A'),
        'published': x.get('published', 'N/A'),
        'channel_id': x.get('channel_id', 'N/A')
    } for x in meta_batch]
    
    # Prepare data for upsert into Pinecone
    to_upsert = [(ids_batch[j], embeds[j], meta_batch[j]) for j in range(len(meta_batch))]
    
    # Upsert vectors into Pinecone index
    try:
        index.upsert(vectors=to_upsert)
        print(f"Successfully upserted batch {i // batch_size + 1}")
    except Exception as e:
        print(f"Error upserting vectors for batch {i // batch_size + 1}: {e}")
        # Handle retry or error recovery logic here if needed


  0%|          | 0/1 [00:00<?, ?it/s]

Successfully upserted batch 1


In [10]:
res = openai.Embedding.create(
    input=[query],
    engine=embed_model
)

# retrieve from Pinecone
xq = res['data'][0]['embedding']

# get relevant contexts (including the questions)
res = index.query(vector=xq, top_k=2, include_metadata=True)
res

NameError: name 'query' is not defined

### Retrieving and Constructing a Prompt with Context from Pinecone 

In [None]:
limit = 3750

def retrieve(query):
    res = openai.Embedding.create(
        input=[query],
        engine=embed_model
    )

    # retrieve from Pinecone
    xq = res['data'][0]['embedding']

    # get relevant contexts
    res = index.query(vector=xq, top_k=3, include_metadata=True)
    contexts = [
        x['metadata']['text'] for x in res['matches']
    ]

    # build our prompt with the retrieved contexts included
    prompt_start = (
        "Answer the question based on the context below.\n\n"+
        "Context:\n"
    )
    prompt_end = (
        f"\n\nQuestion: {query}\nAnswer:"
    )
    # append contexts until hitting limit
    for i in range(1, len(contexts)):
        if len("\n\n---\n\n".join(contexts[:i])) >= limit:
            prompt = (
                prompt_start +
                "\n\n---\n\n".join(contexts[:i-1]) +
                prompt_end
            )
            break
        elif i == len(contexts)-1:
            prompt = (
                prompt_start +
                "\n\n---\n\n".join(contexts) +
                prompt_end
            )
    return prompt


### Prompting

In [None]:
query = (
    "What natural disasters are caused by climate change?" 
)
query_with_contexts = retrieve(query)
query_with_contexts   # provides context


In [None]:
complete(query_with_contexts) # provides answer