### use this cell for `installations` of the relevant versions

In [1]:
!pip install -qU \
    openai==0.27.7 \
    pandas==2.0.3\
    datasets==2.12.0 \
    pinecone-client==3.2.2 \
    pinecone-datasets==0.7.0 \
    pinecone-notebooks==0.1.1\
    tqdm


### OpenAI GPT-3.5-turbo-instruct Text Completion Script and setting up openAI API

In [18]:
import os
import openai
# get API key from top-right dropdown on OpenAI website
openai.api_key = os.getenv("OPENAI_API_KEY") or 'sk-proj-cns0EYsVd8MwGmNqImmMT3BlbkFJ2nrjtYwMUPXe2lZXAngA'
# openai.Engine.list()

def complete(prompt):
    # query text-davinci-003
    res = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a memory assistant, listening to my conversations."},
                {"role": "user", "content": conversation_text},  # Provide the conversation context
                {"role": "user", "content": prompt}  # User's question
            ]
        )
    response_text = response.choices[0].message.content
    return response_text

#     res = openai.Completion.create(
#         engine='gpt-3.5-turbo-instruct',
#         prompt=prompt,
#         temperature=0,
#         max_tokens=400,
#         top_p=1,
#         frequency_penalty=0,
#         presence_penalty=0,
#         stop=None
#     )
#     return res['choices'][0]['text'].strip()

### Loading, splitting, and reading data from out text file using nltk and then using 'text-embedding-ada-002' for embedding

In [4]:
import os
from tqdm.auto import tqdm
import nltk
import openai
from time import sleep

# Ensure you have the necessary NLTK data files
nltk.download('punkt')

# Set your OpenAI API key
openai.api_key = 'sk-proj-cns0EYsVd8MwGmNqImmMT3BlbkFJ2nrjtYwMUPXe2lZXAngA'

# Function to read text from a file and split it into sentences
def create_sentences(text):
#     with open(file_path, 'r') as file:
#         text = file.read()
    sentences = nltk.sent_tokenize(text)
    return sentences

# Function to create embeddings for a batch of sentences
def create_embeddings(sentences, embed_model='text-embedding-ada-002', batch_size=10):
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        try:
            res = openai.Embedding.create(input=batch, engine=embed_model)
            
        except Exception as e:
            print(f"Error creating embeddings for batch {i//batch_size + 1}: {e}")
            # Handle retries or wait times here if needed
            sleep(5)  # Example: retry after 5 seconds
            continue
    return embeddings

# Path to your text file
file_path = 'STT_file.txt'

# Load text data and split into sentences
sentences = load_text_file(file_path)

# Example batch processing and embedding creation
batch_size = 10
embeddings = create_embeddings(sentences, batch_size=batch_size)

# Example: Print embeddings for the first few batches
for i, embedding in enumerate(embeddings[:10]):
    print(f"Embedding {i+1}: {embedding}")

# Placeholder for new merged data (your original processing logic)
new_data = []

window = 20  # number of sentences to combine
stride = 4   # number of sentences to 'stride' over

for i in tqdm(range(0, len(sentences), stride)):
    i_end = min(len(sentences)-1, i+window)
    text = ' '.join(sentences[i:i_end])
    new_data.append({
        'start': i,
        'end': i_end,
        'text': text,
        'id': i,  # Placeholder ID
        'url': 'N/A',
        'published': 'N/A',
        'channel_id': 'N/A'
    })

# Example: Print the first few entries of the new dataset
#for entry in new_data[:5]:
    #print(entry)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/muddassirkhalidi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Embedding 1: {
  "embedding": [
    -0.019712449982762337,
    -0.02957513928413391,
    0.0075230468064546585,
    -0.033246178179979324,
    -0.011155308224260807,
    0.020875807851552963,
    0.011297496035695076,
    -0.0009161442285403609,
    -0.019531484693288803,
    0.010321568697690964,
    0.015020241029560566,
    0.002339641796424985,
    0.005929892882704735,
    -0.01556314155459404,
    0.00799485296010971,
    -0.037124037742614746,
    0.022181354463100433,
    -0.03355640918016434,
    0.017010875046253204,
    -0.05532412230968475,
    0.010121212340891361,
    -0.006191648542881012,
    -0.002654717769473791,
    0.007005998864769936,
    0.0076910872012376785,
    0.0014210738008841872,
    0.003978037275373936,
    -0.0017175684915855527,
    0.006954294163733721,
    -0.034383684396743774,
    0.03091946430504322,
    0.015188281424343586,
    0.0008175819530151784,
    -0.006837958469986916,
    -0.0036678083706647158,
    -0.02304740995168686,
    -0.00321054

  0%|          | 0/29 [00:00<?, ?it/s]

### Setting up Pinecone API and environment 

In [10]:
import os
from pinecone import Pinecone
from pinecone import ServerlessSpec

use_serverless = True

# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = '1fdff541-89da-47de-8a43-71d45f8afd2d'
pc = Pinecone(api_key=api_key)

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)

index_name = "memoro"

### Pinecone Index Initialization and Connection Script

In [11]:
import time

# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=len(res['data'][0]['embedding']),
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

### Batch Processing and Upserting Embeddings into Pinecone

In [12]:
from tqdm.auto import tqdm
from time import sleep
import openai

# Set OpenAI API key
openai.api_key = 'sk-proj-cns0EYsVd8MwGmNqImmMT3BlbkFJ2nrjtYwMUPXe2lZXAngA'

# Parameters
batch_size = 100  # Number of embeddings to create and insert at once
embed_model = 'text-embedding-ada-002'  # OpenAI embedding model

# Iterate over new_data in batches and create embeddings
for i in tqdm(range(0, len(new_data), batch_size)):
    # Determine end of current batch
    i_end = min(len(new_data), i + batch_size)
    meta_batch = new_data[i:i_end]
    
    # Extract IDs and texts from meta_batch
    ids_batch = [str(x['id']) for x in meta_batch]  # Ensure id is converted to string
    texts = [x['text'] for x in meta_batch]
    
    # Create embeddings (with retry logic)
    done = False
    while not done:
        try:
            res = openai.Embedding.create(input=texts, engine=embed_model)
            done = True
        except Exception as e:
            print(f"Error creating embeddings for batch {i // batch_size + 1}: {e}")
            sleep(5)  # Wait before retrying
            continue
    
    # Extract embeddings and prepare metadata for upsert
    embeds = [record['embedding'] for record in res['data']]
    meta_batch = [{
        'start': x['start'],
        'end': x['end'],
        'title': x.get('title', 'N/A'),  # Replace with actual metadata fields if available
        'text': x['text'],
        'url': x.get('url', 'N/A'),
        'published': x.get('published', 'N/A'),
        'channel_id': x.get('channel_id', 'N/A')
    } for x in meta_batch]
    
    # Prepare data for upsert into Pinecone
    to_upsert = [(ids_batch[j], embeds[j], meta_batch[j]) for j in range(len(meta_batch))]
    
    # Upsert vectors into Pinecone index
    try:
        index.upsert(vectors=to_upsert)
        print(f"Successfully upserted batch {i // batch_size + 1}")
    except Exception as e:
        print(f"Error upserting vectors for batch {i // batch_size + 1}: {e}")
        # Handle retry or error recovery logic here if needed
query = (
    "What natural disasters are caused by climate change?" 
)


  0%|          | 0/1 [00:00<?, ?it/s]

Successfully upserted batch 1


In [13]:
res = openai.Embedding.create(
    input=[query],
    engine=embed_model
)

# retrieve from Pinecone
xq = res['data'][0]['embedding']

# get relevant contexts (including the questions)
res = index.query(vector=xq, top_k=2, include_metadata=True)
res

{'matches': [{'id': '112',
              'metadata': {'channel_id': 'N/A',
                           'end': 113.0,
                           'published': 'N/A',
                           'start': 112.0,
                           'text': 'Me neither.',
                           'title': 'N/A',
                           'url': 'N/A'},
              'score': 0.728005767,
              'values': []},
             {'id': '80',
              'metadata': {'channel_id': 'N/A',
                           'end': 100.0,
                           'published': 'N/A',
                           'start': 80.0,
                           'text': 'He said it was amazing. Lots of relaxing '
                                   'on the beach and exploring the islands. '
                                   'Sounds like a dream. I should plan a trip '
                                   'there sometime. You totally should. It’s '
                                   'on my bucket list too. We should all g

### Retrieving and Constructing a Prompt with Context from Pinecone 

In [19]:
limit = 3750

import os
import openai
# get API key from top-right dropdown on OpenAI website
openai.api_key = os.getenv("OPENAI_API_KEY") or 'sk-proj-cns0EYsVd8MwGmNqImmMT3BlbkFJ2nrjtYwMUPXe2lZXAngA'
# openai.Engine.list()

def complete(prompt):
    # query text-davinci-003
    res = openai.Completion.create(
        engine='gpt-3.5-turbo-instruct',
        prompt=prompt,
        temperature=0,
        max_tokens=400,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )
    return res['choices'][0]['text'].strip()

def retrieve(query):
    res = openai.Embedding.create(
        input=[query],
        engine=embed_model
    )

    # retrieve from Pinecone
    xq = res['data'][0]['embedding']

    # get relevant contexts
    res = index.query(vector=xq, top_k=3, include_metadata=True)
    contexts = [
        x['metadata']['text'] for x in res['matches']
    ]

    # build our prompt with the retrieved contexts included
    prompt_start = (
        "Answer the question based on the context below.\n\n"+
        "Context:\n"
    )
    prompt_end = (
        f"\n\nQuestion: {query}\nAnswer:"
    )
    # append contexts until hitting limit
    for i in range(1, len(contexts)):
        if len("\n\n---\n\n".join(contexts[:i])) >= limit:
            prompt = (
                prompt_start +
                "\n\n---\n\n".join(contexts[:i-1]) +
                prompt_end
            )
            break
        elif i == len(contexts)-1:
            prompt = (
                prompt_start +
                "\n\n---\n\n".join(contexts) +
                prompt_end
            )
    return prompt


### Prompting

In [20]:
query = (
    "Who is the head of AIDA" 
)
query_with_contexts = retrieve(query)
query_with_contexts   # provides context


'Answer the question based on the context below.\n\nContext:\nDo you have any big projects coming up? We’re working on a new campaign for the fall. Lots of planning and strategizing right now. Sounds like you’ve got your hands full. Definitely. But I like staying busy. Yeah, it keeps things interesting. By the way, did you hear about Sarah’s promotion? No, I didn’t. That’s awesome! What’s her new position? She’s now the head of the sales department. Wow, that’s amazing. She’s been working so hard. She totally deserves it. I know, right? We should all go out and celebrate soon. Absolutely. I’ll text her and see when she’s free. Good idea.\n\n---\n\nWhat’s the new role? I’m now the marketing coordinator. It’s a lot more responsibility, but I’m enjoying it so far. Congratulations! That’s a big step up. Thanks! It’s been a bit overwhelming, but I’m learning a lot. That’s great to hear. Do you have any big projects coming up? We’re working on a new campaign for the fall. Lots of planning an

In [21]:
complete(query_with_contexts) # provides answer

'Sarah is the head of the sales department.'