In [None]:
import os
from dotenv import load_dotenv
from pathlib import Path
from tqdm import tqdm
from time import sleep
import pandas as pd
from IPython.display import Markdown

import openai
import pinecone

# Read environment variables (the API keys used below) from ../.env,
# resolved relative to the current working directory.
load_dotenv(dotenv_path=Path('../.env'))

In [None]:
# Configure the OpenAI client and choose the embedding model used by every
# embedding request in this notebook.
embed_model = "text-embedding-ada-002"
openai.api_key = os.getenv('OPENAI_API_KEY')

# Smoke-test the embedding endpoint with two short strings. The response is
# kept in `res` because the Pinecone cell reads the vector length from it.
res = openai.Embedding.create(
    engine=embed_model,
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch",
    ],
)

In [None]:
# Inspect the raw response of the sample embedding request above
# (its structure and the embedding payload it carries).
res

In [None]:
## Load in data
# Location of the processed ACT legislation CSV. Set the ACT_LAW_CSV
# environment variable (e.g. in the .env file loaded above) to point at a
# different copy; when it is unset the original location is used.
data_path = Path(
    os.getenv('ACT_LAW_CSV', '/Users/tania/not-legal-advice/data/processed/ACT_law.csv')
)
df = pd.read_csv(data_path)

# Fail fast if a column the embedding/upsert loop reads is absent, rather than
# hitting a KeyError part-way through the (slow, billed) API calls.
required_columns = {'Act', 'Section_number', 'Section_title', 'Section_text'}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, f"{data_path} is missing columns: {sorted(missing_columns)}"

df.head()

In [None]:
# Connect to Pinecone and make sure the target index exists.
pinecone.init(
    environment=os.getenv('PINECONE_ENVIRONMENT'),  # shown next to the API key in the console
    api_key=os.getenv('PINECONE_API_KEY'),  # from app.pinecone.io (console)
)

index_name = 'test'

# First run only: create the index, sized to the embedding vectors returned by
# the sample request. `res` must still hold that sample response at this
# point, because later cells reassign `res` to other kinds of responses.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        metric='cosine',  # alternative: 'dotproduct'
        dimension=len(res['data'][0]['embedding']),
    )

# Handle used by the upsert and query cells below.
index = pinecone.Index(index_name)

# Show the index statistics as the cell output.
index.describe_index_stats()

In [None]:
# Embed stuff
# For every row of `df`: embed the section text with OpenAI and upsert the
# vector into the Pinecone index, keyed by "<Act>: <Section_number>" and
# carrying the section's text and identifying fields as metadata.
# One embedding request and one upsert are issued per row.

for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    section_text = row['Section_text']

    # read_csv may infer a purely numeric Section_number column as int/float.
    # Cast it once so that building the id cannot raise TypeError, and so the
    # metadata holds the string that the citation cell later concatenates.
    section_number = str(row['Section_number'])
    text_id = f"{row['Act']}: {section_number}"

    res = openai.Embedding.create(
        input=[section_text], engine=embed_model
    )

    upsert_response = index.upsert(
        vectors=[
            (
                text_id,
                res['data'][0]['embedding'],
                {
                    'text': section_text,
                    'Act': row['Act'],
                    'Section_number': section_number,
                    'Section_title': row['Section_title'],
                },
            ),
        ]
    )
    # Pause between rows; presumably a throttle to stay under API rate
    # limits -- TODO confirm the required delay.
    sleep(1)
    

In [None]:
# Retrieve Stuff: embed the question, then look up its nearest neighbours.
query = "who is the ATSIEB?"

# Embed the question with the same model that was used for the stored sections.
res = openai.Embedding.create(engine=embed_model, input=[query])
xq = res['data'][0]['embedding']

# Fetch the 5 closest vectors together with their stored metadata.
# From here on `res` holds the Pinecone response, not the embedding response.
res = index.query(xq, include_metadata=True, top_k=5)

In [None]:
# Pull the stored section text out of every Pinecone match.
contexts = [match['metadata']['text'] for match in res['matches']]

# Prompt layout: the contexts separated by "---" rules, then a "-----" rule,
# then the user's question at the very end.
augmented_query = "\n\n-----\n\n".join(["\n\n---\n\n".join(contexts), query])

print(augmented_query)

In [None]:
# Human-readable citation ("<Act>: <Section_number>") for each retrieved match.
# The f-string converts each field on its own, so a non-string Section_number
# in the metadata cannot raise TypeError the way `+` concatenation would
# (wrapping the finished concatenation in str() came too late to help).
contexts = [
    f"{match['metadata']['Act']}: {match['metadata']['Section_number']}"
    for match in res['matches']
]
print(contexts)

In [None]:
# System prompt: tells the model to answer only from the context that precedes
# the question and to reply "I don't know" otherwise.
primer = """You are Q&A bot. A highly intelligent system that answers
user questions based on the information provided by the user above
each question. If the information can not be found in the information
provided by the user you truthfully say "I don't know".
"""

# Ask the chat model, sending the retrieved contexts plus the question as the
# user turn. `res` is reused once more and now holds the chat completion.
res = openai.ChatCompletion.create(
    messages=[
        {"role": "system", "content": primer},
        {"role": "user", "content": augmented_query},
    ],
    model="gpt-3.5-turbo",
)

In [None]:
# Render the text of the first returned choice as formatted Markdown.
display(Markdown(res['choices'][0]['message']['content']))