# 03. Pinecone Generative Question-Answering

reference: [Generative Question-Answering with Long-Term Memory](https://www.pinecone.io/learn/openai-gen-qa/)

In [1]:
import os
import uuid
import pandas as pd
import openai
import pinecone

from tqdm.auto import tqdm
from dotenv import load_dotenv
from IPython.core.display import Markdown

  from tqdm.autonotebook import tqdm


## Configure Environment

reference: [Using .env Files for Environment Variables in Python Applications](https://dev.to/jakewitcher/using-env-files-for-environment-variables-in-python-applications-55a1)

In [2]:
load_dotenv()

True

In [3]:
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
OPENAI_CHAT_MODEL = os.getenv('OPENAI_CHAT_MODEL', 'gpt-3.5-turbo')
OPENAI_EMBED_MODEL = os.getenv('OPENAI_EMBED_MODEL', 'text-embedding-ada-002')

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_INDEX = os.getenv('PINECONE_INDEX', 'openai-dsm100-2022-oct-transcriptions')
PINECONE_ENV = os.getenv('PINECONE_ENV', 'us-east1-gcp')

In [4]:
# configure openai
openai.api_key = OPENAI_API_KEY

In [5]:
## initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

## Test Chat Completions

### References

- [Chat completions](https://platform.openai.com/docs/guides/chat)
- [How to format inputs to ChatGPT models](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb)

In [6]:
response = openai.ChatCompletion.create(
    model=OPENAI_CHAT_MODEL,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Knock knock."},
        {"role": "assistant", "content": "Who's there?"},
        {"role": "user", "content": "Orange."},
    ],
    temperature=0,
)

response

<OpenAIObject chat.completion id=chatcmpl-6pd3lfRTNDF7OrBg3epVhKFvN7XKW at 0xffff8750ab30> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "Orange who?",
        "role": "assistant"
      }
    }
  ],
  "created": 1677762893,
  "id": "chatcmpl-6pd3lfRTNDF7OrBg3epVhKFvN7XKW",
  "model": "gpt-3.5-turbo-0301",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 5,
    "prompt_tokens": 38,
    "total_tokens": 43
  }
}

In [7]:
response['choices'][0]['message']['content']

'Orange who?'

## Load Data

In [8]:
df_source = pd.read_csv('data/output/transcripts.csv')

print(df_source.shape)
with pd.option_context('display.max_columns', None):
    display(df_source.head(3))

(140, 5)


Unnamed: 0,course,topic,title,url,transcript
0,DSM100-2022-OCT,Module information,Module introduction video,https://learn.london.ac.uk/mod/page/view.php?i...,-Welcome to AI Module. Artificial intelligence...
1,DSM100-2022-OCT,Module information,Meet the team,https://learn.london.ac.uk/mod/page/view.php?i...,[music]-Welcome to the AI module. My name is L...
2,DSM100-2022-OCT,Topic 1: Introduction,Lecture: Introduction to Topic 1,https://learn.london.ac.uk/mod/page/view.php?i...,"Welcome to topic one, Introduction to AI. In t..."


## Test Embedding

In [9]:
transcript = df_source.iloc[43].transcript

res = openai.Embedding.create(
    input=[transcript], engine=OPENAI_EMBED_MODEL
)

In [10]:
len(res['data'][0]['embedding'])

1536

# Split Transcript

In [11]:
def split_transcript(text: str, window: int = 20, stride:int = 4) -> list:
    """
    Split transcript into parts equal to the window size.

    Parameters
    ----------
    text : str
        The transcript to split.

    window : int, optional
        The number of sentences to combine into a single part, by default 20

    stride : int, optional
        The number of sentences to 'stride' over, used to create overlap.
    """
    sentences = text.split('.')

    parts = []
    for i in range(0, len(sentences), stride):
        part = '. '.join(sentences[i:i+window])
        parts.append(part)

    return parts

# # test the function
# transcript = df_source.iloc[43].transcript
# split = split_transcript(transcript, window=4, stride=2)

# print(len(split))
# display(Markdown('# Original'))
# display(Markdown(transcript))

# display(Markdown('# split[0]'))
# display(Markdown(split[0]))

# display(Markdown('# split[1]'))
# display(Markdown(split[1]))

# display(Markdown('# split[-1]'))
# display(Markdown(split[-2]))

# display(Markdown('# split[-1]'))
# display(Markdown(split[-1]))

In [12]:
def create_split_dataset(df_source:pd.DataFrame) -> list:
    """
    Create a list of dictionaries containing the split transcript parts.
    """
    # process each row
    dataset = []
    for i, row in tqdm(df_source.iterrows(), total=df_source.shape[0]):
        # split the transcript into parts
        parts = split_transcript(row.transcript)

        # create a dictionary for each part
        for part in parts:
            data = {
                'course': row['course'],
                'topic': row['topic'],
                'title': row['title'],
                'url': row['url'],
                'text': part
            }
            dataset.append(data)

    return dataset

# test the function
split_dataset = create_split_dataset(df_source)
print(len(split_dataset))
split_dataset[3]

  0%|          | 0/140 [00:00<?, ?it/s]

3000


{'course': 'DSM100-2022-OCT',
 'topic': 'Module information',
 'title': 'Module introduction video',
 'url': 'https://learn.london.ac.uk/mod/page/view.php?id=96059&forceview=1',
 'text': " This module will include four theoretical topic sand six topics of more practical nature.  We will focus on concrete AI systems and case studies.  You will have an opportunity to learn how they're built and how they're working.  After the end of this module, you should be able to critically evaluate key issues in agent-based system, knowledge system, robotics, automated reasoning, and problem-solving, represent tasks, environments, and outline strategies for intelligent agents, compare the adequacy and efficiency of different reasoning approaches.  We form a deep researched analysis of a particular artificial intelligence method and their use, apply AI techniques within the context of a substantial research project.  I wish you very best of luck with this module. "}

## Create Vector Database

In [13]:
def create_vector_database(index_name: str, dimension: int, metadata_config: dict = None):
    """
    Create a vector database in pinecone.

    Parameters
    ----------
    index_name : str
        The name of the index to create.

    dimension : int
        The dimension of the vectors to be stored in the index.

    metadata_config : dict, optional
        The metadata configuration for the index, by default None
    """
    # check if index already exists (it shouldn't if this is first time)
    if index_name not in pinecone.list_indexes():
        # if does not exist, create index
        pinecone.create_index(
            index_name,
            dimension=dimension,
            metric='cosine',
            metadata_config=metadata_config
        )
    # connect to index
    index = pinecone.Index(index_name)
    
    # view index stats
    print(index.describe_index_stats())

In [14]:
# create the pinecone index
index_dimension = 1536
metadata_config = {
    'indexed': ['topic', 'title']
}

create_vector_database(PINECONE_INDEX, index_dimension, metadata_config)

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [15]:
v2 = pinecone.Vector(
    id="1",
    values=[0.4, 0.5, 0.6, 0.7], 
    metadata={"color": "red", "shape": "circle"})

print(v2)

{'id': '1',
 'metadata': {'color': 'red', 'shape': 'circle'},
 'values': [0.4, 0.5, 0.6, 0.7]}


In [16]:
def add_vectors_to_index(data: list, index_name: str, batch_size: int = 100):
    """
    Add vectors to the pinecone index.

    Parameters
    ----------
    data : list
        The data to add to the index.

    index_name : str
        The name of the index to add the vectors to.

    batch_size : int, optional
        The number of vectors to add to the index at a time, by default 100
    """
    # connect to index
    index = pinecone.Index(index_name)

    # create batches
    item_id = 0
    batches = [data[i:i+batch_size] for i in range(0, len(data), batch_size)]

    # add vectors to index
    for batch in tqdm(batches):
        # create a list of vectors
        vectors = []
        for data in tqdm(batch):
            # do not handle empty vectors
            if data['text'] == '' or data['text'] == ' ':
                continue

            # get the vector
            res = openai.Embedding.create(
                input=[data['text']], engine=OPENAI_EMBED_MODEL
            )
            vector = res['data'][0]['embedding']

            pine_vector = pinecone.Vector(id=str(item_id), values=vector, metadata=data)
            #print(pine_vector)

            vectors.append(pine_vector)
            item_id += 1

        # add vectors to index
        index.upsert(vectors=vectors)

add_vectors_to_index(
    data=split_dataset,
    index_name=PINECONE_INDEX,
    batch_size=100)

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]