# 01. Pinecone Generative Question-Answering

reference: [Generative Question-Answering with Long-Term Memory](https://www.pinecone.io/learn/openai-gen-qa/)

In [1]:
import os
import pandas as pd
import openai
import pinecone

from dotenv import load_dotenv
from IPython.core.display import Markdown

  from tqdm.autonotebook import tqdm


## Configure Environment

reference: [Using .env Files for Environment Variables in Python Applications](https://dev.to/jakewitcher/using-env-files-for-environment-variables-in-python-applications-55a1)

In [2]:
# load variables from a .env file into the process environment so the
# os.getenv lookups in the configuration cell can read them
load_dotenv()

True

In [3]:
# OpenAI settings: the key has no default (None when unset); the model names
# fall back to the defaults given here
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
OPENAI_CHAT_MODEL = os.environ.get('OPENAI_CHAT_MODEL', 'gpt-3.5-turbo')
OPENAI_EMBED_MODEL = os.environ.get('OPENAI_EMBED_MODEL', 'text-embedding-ada-002')

# Pinecone settings: the key has no default; index name and environment do
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
PINECONE_INDEX = os.environ.get('PINECONE_INDEX', 'openai-dsm100-2022-oct-transcriptions')
PINECONE_ENV = os.environ.get('PINECONE_ENV', 'us-east1-gcp')

In [4]:
# configure openai: set the API key on the openai module using the value
# read from the environment (None if OPENAI_API_KEY was not set)
openai.api_key = OPENAI_API_KEY

In [5]:
# initialize pinecone with the API key and environment read from the
# environment variables in the configuration cell
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

## Test Chat Completions

### References

- [Chat completions](https://platform.openai.com/docs/guides/chat)
- [How to format inputs to ChatGPT models](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb)

In [6]:
# send a short scripted conversation (system prompt plus alternating
# user/assistant turns) to the configured chat model
response = openai.ChatCompletion.create(
    model=OPENAI_CHAT_MODEL,
    temperature=0,
    messages=[
        dict(role="system", content="You are a helpful assistant."),
        dict(role="user", content="Knock knock."),
        dict(role="assistant", content="Who's there?"),
        dict(role="user", content="Orange."),
    ],
)

# show the raw API response
response

<OpenAIObject chat.completion id=chatcmpl-6pZcqvNta8vpCFPtrRRhJ9xclT6U9 at 0xffff6d822b80> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "Orange who?",
        "role": "assistant"
      }
    }
  ],
  "created": 1677749692,
  "id": "chatcmpl-6pZcqvNta8vpCFPtrRRhJ9xclT6U9",
  "model": "gpt-3.5-turbo-0301",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 5,
    "prompt_tokens": 38,
    "total_tokens": 43
  }
}

In [7]:
# extract the assistant's reply text from the first choice in the response
response['choices'][0]['message']['content']

'Orange who?'

## Load Data

In [8]:
# read the transcripts table from the CSV file
df_source = pd.read_csv('data/output/transcripts.csv')

# report (rows, columns), then preview the first 3 rows with no limit on
# the number of columns displayed
print(df_source.shape)
with pd.option_context('display.max_columns', None):
    display(df_source.head(3))

(140, 5)


Unnamed: 0,course,topic,title,url,transcript
0,DSM100-2022-OCT,Module information,Module introduction video,https://learn.london.ac.uk/mod/page/view.php?i...,-Welcome to AI Module. Artificial intelligence...
1,DSM100-2022-OCT,Module information,Meet the team,https://learn.london.ac.uk/mod/page/view.php?i...,[music]-Welcome to the AI module. My name is L...
2,DSM100-2022-OCT,Topic 1: Introduction,Lecture: Introduction to Topic 1,https://learn.london.ac.uk/mod/page/view.php?i...,"Welcome to topic one, Introduction to AI. In t..."


## Test Embedding

In [9]:
# pick one transcript (row 43, an arbitrary sample) to check the embedding call
transcript = df_source.iloc[43].transcript

# `model` is the documented keyword for selecting the embedding model and
# is consistent with the ChatCompletion call, which also uses `model=`
res = openai.Embedding.create(
    input=[transcript], model=OPENAI_EMBED_MODEL
)

In [10]:
# length of the embedding vector returned for the single input; the pinecone
# index created later in this notebook uses the same number as its dimension
len(res['data'][0]['embedding'])

1536

## Split Transcript

In [11]:
def split_transcript(text: str, window: int = 20, stride: int = 4) -> list:
    """
    Split a transcript into overlapping parts of up to `window` sentences.

    Sentences are found by splitting on '.'. Each sentence is stripped of
    surrounding whitespace and empty fragments (e.g. the one after the final
    period) are dropped, so parts never contain doubled spaces and no part
    is empty. Every sentence in a part is terminated with a period.

    Parameters
    ----------
    text : str
        The transcript to split.

    window : int, optional
        The number of sentences to combine into a single part, by default 20

    stride : int, optional
        The number of sentences to 'stride' over, used to create overlap,
        by default 4. Consecutive parts start `stride` sentences apart, so
        they overlap whenever `stride` is smaller than `window`.

    Returns
    -------
    list
        The parts as strings, in transcript order. Empty when `text` is
        empty or contains no sentences.

    Raises
    ------
    ValueError
        If `window` or `stride` is not positive.
    """
    if window <= 0 or stride <= 0:
        raise ValueError('window and stride must be positive integers')

    # nothing to split; avoids returning [''] which is useless to embed
    if not text:
        return []

    # normalise whitespace and discard empty fragments left by the split
    stripped = (fragment.strip() for fragment in text.split('.'))
    sentences = [sentence for sentence in stripped if sentence]

    # one part per stride step; parts near the end may be shorter than `window`
    return [
        '. '.join(sentences[start:start + window]) + '.'
        for start in range(0, len(sentences), stride)
    ]

# # test the function
# transcript = df_source.iloc[43].transcript
# split = split_transcript(transcript, window=4, stride=2)

# print(len(split))
# display(Markdown('# Original'))
# display(Markdown(transcript))

# display(Markdown('# split[0]'))
# display(Markdown(split[0]))

# display(Markdown('# split[1]'))
# display(Markdown(split[1]))

# display(Markdown('# split[-2]'))
# display(Markdown(split[-2]))

# display(Markdown('# split[-1]'))
# display(Markdown(split[-1]))

## Create Vector Database

In [12]:
def create_vector_database(index_name: str, dimension: int, metadata_config: dict = None):
    """
    Make sure a pinecone index exists, then print its statistics.

    The index is created with the cosine metric only when pinecone does not
    already list an index with the given name. Nothing is returned.

    Parameters
    ----------
    index_name : str
        The name of the index to create.

    dimension : int
        The dimension of the vectors to be stored in the index.

    metadata_config : dict, optional
        The metadata configuration for the index, by default None
    """
    # skip creation when the index is already listed (e.g. on a re-run)
    index_missing = index_name not in pinecone.list_indexes()
    if index_missing:
        index_settings = dict(
            dimension=dimension,
            metric='cosine',
            metadata_config=metadata_config,
        )
        pinecone.create_index(index_name, **index_settings)

    # connect to the index and report its current stats
    stats = pinecone.Index(index_name).describe_index_stats()
    print(stats)

In [13]:
# create the pinecone index
# 1536 equals the embedding length measured in the embedding test cell
index_dimension = 1536
# forwarded to pinecone.create_index as metadata_config; lists the
# 'topic' and 'title' metadata fields under the 'indexed' key
metadata_config = {
    'indexed': ['topic', 'title']
}

# creates the index if it is not already listed, then prints its stats
create_vector_database(PINECONE_INDEX, index_dimension, metadata_config)

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}
