# Ingestion

In [None]:
from dotenv import load_dotenv
import os

# Load environment variables from the .env file one directory above the
# working directory; later cells read their API keys and index settings
# from the environment via os.getenv.
load_dotenv('../.env')

## Load text

Let's use the simple PDF document loader `PDFMinerLoader`.

In [None]:
from langchain.document_loaders import PDFMinerLoader

In [None]:
# Locations of the three 2022 reports, all under the shared raw-data folder.
raw_data_dir = r'../raw data'

climate_pdf = raw_data_dir + '/2022-climate-report-en.pdf'
annual_pdf = raw_data_dir + '/2022-annual-report-en.pdf'
sustainability_pdf = raw_data_dir + '/2022-report-on-sustainability-en.pdf'

In [None]:
# Load every report with PDFMinerLoader; data_list holds one loader result
# per report, in the order climate, annual, sustainability.
report_paths = [climate_pdf, annual_pdf, sustainability_pdf]
data_list = [PDFMinerLoader(report_path).load() for report_path in report_paths]

In [None]:
# Inspect the loaded reports (one entry per PDF).
data_list

### Reading in hard facts

In [None]:
# Read the "hard facts" text files (environment, governance, social) into
# memory, one string per file, in that order.
#
# The folder can be overridden with the HARD_FACTS_DIR environment variable
# (for example in ../.env) so the notebook is not tied to a single machine;
# when the variable is unset, the original local path is used unchanged.
# NOTE(review): files are opened with the platform default encoding, as
# before — confirm the files' encoding if this is run on another OS.
HARD_FACTS_DIR = os.getenv('HARD_FACTS_DIR', r'D:\python_projects\suncor_OP\hard_facts')

txts = []

for txt_name in ['environment', 'governance', 'social']:
    with open(os.path.join(HARD_FACTS_DIR, '{}.txt'.format(txt_name))) as f:
        txts.append( f.read() )

In [None]:
# Inspect the raw hard-fact texts (one string per file).
txts

## Split text

Split the text with `TokenTextSplitter`, which measures chunk size in tokens using OpenAI's `cl100k_base` tiktoken encoding (the encoding used by `text-embedding-ada-002`).

In [None]:
from langchain.text_splitter import TokenTextSplitter

In [None]:
# Token-based splitter settings for the PDF reports.
TOKENIZER = 'cl100k_base'
CHUNK_SIZE = 800

text_splitter = TokenTextSplitter(
    encoding_name=TOKENIZER,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=100,
)
# Usage on a raw string: texts = text_splitter.split_text(text)

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# A second splitter with much smaller chunks (100 tokens, 20 overlap) is used
# for the hard-fact texts.
small_text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    encoding_name=TOKENIZER,
    chunk_size=100,
    chunk_overlap=20,
)

# One list of chunks per hard-fact file, in the same order as txts.
hard_fact_texts = [small_text_splitter.split_text(fact_text) for fact_text in txts]

In [None]:
# Inspect the hard-fact chunks (one list of strings per file).
hard_fact_texts

### batch text splitting

In [None]:
# Split each loaded report with the token splitter; the result keeps one
# list of chunks per report, in the same order as data_list.
additional_docs_list = [
    text_splitter.split_documents(report_docs) for report_docs in data_list
]

In [None]:
# Report how many chunks each report produced.
for report_chunks in additional_docs_list:
    chunk_count = len(report_chunks)
    print(f'Total Number of Chunks: {chunk_count}, at {CHUNK_SIZE} max token input.')

### Custom Splitter at page number

In [None]:
# NOTE: this whole cell is a disabled experiment (everything is commented out).
# It sketches splitting one report's text at its page-footer line
# ("Suncor Energy Inc. | Climate Report 2022 | <page>") so that each chunk
# can be tagged with its page number.
# NOTE(review): if revived, `data` would need to be one loaded report, the loop
# variable `str` shadows the builtin, and the `metadatas` argument passes a
# single dict holding the whole page list — presumably one dict per page was
# intended; confirm before enabling.

# from langchain.text_splitter import TextSplitter
# import re

# pdf_full_text = data[0].page_content

# # Define regular expression to match the template string and the following number
# regex = re.compile(r"Suncor Energy Inc\.   \|   Climate Report 2022   \|   (\d+)")

# # Split the input string at the regex pattern
# split_str = regex.split(pdf_full_text)

# page_nums = [1]
# page_cont = []
# for idx, str in enumerate(split_str):
#     if idx % 2 == 0:
#         page_cont.append(str)
#     else:
#         page_nums.append(int(str))

# dict(zip(page_nums, page_cont))

# # text_splitter = TextSplitter(chunk_size=4000, chunk_overlap=0)

# # Custom text splitter at the page
# # docs = text_splitter.create_documents(texts=page_cont, metadatas=[{'page':page_nums}])

## Create embeddings

Using OpenAI's embedding model `text-embedding-ada-002`

In [None]:
# NOTE: disabled example of calling the OpenAI embeddings endpoint directly.
# The notebook instead uses LangChain's OpenAIEmbeddings wrapper when it
# populates the vector store.
# NOTE(review): `texts` is only assigned in a commented-out line of the
# splitter cell, so this cell cannot run as-is.

# import openai

# MODEL = "text-embedding-ada-002"

# openai.api_key = os.getenv('OPENAI_API_KEY')
# # get API key from top-right dropdown on OpenAI website

# # input is a list of strings, exactly like texts
# res = openai.Embedding.create(
#     input=texts, 
#     engine=MODEL
# )

# # extract embeddings to a list
# embeds = [record['embedding'] for record in res['data']]

## Initialize Pinecone vectorstore

Initializing the vectorstore is the same as connecting to a Pinecone index.

In [None]:
import pinecone

# Pinecone credentials come from the environment
# (API key: app.pinecone.io; the environment name is shown next to the key in the console).
pinecone_credentials = {
    'api_key': os.getenv('PINECONE_API_KEY'),
    'environment': os.getenv('PINECONE_ENV'),
}
pinecone.init(**pinecone_credentials)

# Handle to the index named by PINECONE_INDEX.
index = pinecone.Index(os.getenv('PINECONE_INDEX'))

### Deleting an index

In [None]:
# WARNING: destructive operations, left commented out. Uncomment deliberately.
# The first line removes the whole index named by PINECONE_INDEX.
# pinecone.delete_index(os.getenv('PINECONE_INDEX'))

# # delete all from index
# The line below removes every vector in the 'suncor' namespace only.
# index.delete(delete_all=True, namespace='suncor')

## Populate Pinecone vectorstore

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

# Embedding model and target index used by the upload cells below.
MODEL = "text-embedding-ada-002"
index_name = os.getenv('PINECONE_INDEX')

# LangChain's wrapper around the OpenAI embeddings API; it needs the API key and model name.
embeddings = OpenAIEmbeddings(
    openai_api_key=os.getenv('OPENAI_API_KEY'),
    model=MODEL,
)

# Single-shot alternative for an already prepared `docs` list:
# pine_vector_store = Pinecone.from_documents(docs, embeddings, index_name=index_name, namespace='suncor')

In [None]:
# Re-inspect the hard-fact chunks before they are uploaded in the next cell.
hard_fact_texts

In [None]:
# Embed and upload the hard-fact chunks, one Pinecone.from_texts call per source file.
# Everything is written to the 'suncor' namespace of the configured index.
upload_options = {
    'embedding': embeddings,
    'index_name': index_name,
    'namespace': 'suncor',
}

for fact_chunks in hard_fact_texts:
    Pinecone.from_texts(fact_chunks, **upload_options)

### Populating additional docs

In [None]:
# Embed and upload the chunks of each report into the same 'suncor' namespace.
for report_chunks in additional_docs_list:
    Pinecone.from_documents(
        report_chunks,
        embeddings,
        index_name=index_name,
        namespace='suncor',
    )

# Querying

In [1]:
from dotenv import load_dotenv

# Load environment variables from the .env file one directory above the
# working directory; the querying cells read their Pinecone settings from
# the environment via os.getenv.
load_dotenv('../.env')

True

## Connect to existing Pinecone index

Using `from_existing_index`

In [2]:
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import os
import pinecone


# Embedding model used to embed incoming queries.
MODEL = "text-embedding-ada-002"

# Open the Pinecone connection with credentials from the environment
# (API key: app.pinecone.io; the environment name is shown next to the key in the console).
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pinecone_env = os.getenv('PINECONE_ENV')
pinecone.init(api_key=pinecone_api_key, environment=pinecone_env)

# Wrap the already-populated index as a LangChain vector store.
existing_index_name = os.getenv('PINECONE_INDEX')
query_embeddings = OpenAIEmbeddings(model=MODEL)
pine_vector_store = Pinecone.from_existing_index(existing_index_name, query_embeddings)

  from tqdm.autonotebook import tqdm


## Create chain using pinecone's vectorstore

### Question answering with Source

In [10]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI

QUERY = "What color is a firetruck in Canada?"

# Search the 'suncor' namespace; each hit is a (document, similarity score) pair.
similar_docs_with_score = pine_vector_store.similarity_search_with_score(QUERY, namespace='suncor')

# Separate the hits into parallel tuples of documents and scores.
similar_docs_only = tuple(hit[0] for hit in similar_docs_with_score)
similarity_scores = tuple(hit[1] for hit in similar_docs_with_score)

In [11]:
from langchain.prompts import PromptTemplate
# gpt-3.5-turbo is a chat model, so it is driven through LangChain's chat
# wrapper rather than the completion-style `OpenAI` LLM class.
from langchain.chat_models import ChatOpenAI


# Chat model that writes the final answer.
MODEL = 'gpt-3.5-turbo'

# Prompt for the "stuff" chain: {summaries} receives the formatted retrieved
# documents and {question} the user query.
PROMPT_TEMPLATE = '''You are a helpful AI assistant. Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say you don't know. DO NOT try to make up an answer.
If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.

{summaries}

Question: {question}
Helpful answer:'''

PROMPT = PromptTemplate(template=PROMPT_TEMPLATE, input_variables=["summaries", "question"])

# How each retrieved document is rendered before being placed into {summaries}.
DOCUMENT_PROMPT = PromptTemplate(
    template="Content: {page_content}",
    input_variables=["page_content"],
)

chain = load_qa_with_sources_chain(
    ChatOpenAI(temperature=0, model_name=MODEL), 
    chain_type="stuff", 
    prompt=PROMPT,
    document_prompt=DOCUMENT_PROMPT
)

# Answer QUERY using the documents found by the similarity search cell.
resp = chain({"input_documents": similar_docs_only, "question": QUERY})



In [12]:
# Show the answer text returned by the chain.
resp['output_text']

"I'm sorry, but I am tuned to only answer questions related to the provided context. I do not have information on the color of firetrucks in Canada."

### Retrieval Question/Answering

In [None]:
from langchain.chains import RetrievalQA

In [None]:
# Rebuild the vector store handle for the retrieval chain.
# MODEL was reassigned to the chat model ('gpt-3.5-turbo') in the
# QA-with-sources cell, so the embedding model is named explicitly here;
# otherwise queries would be embedded with a non-embedding model.
EMBEDDING_MODEL = "text-embedding-ada-002"

# The documents were uploaded to the 'suncor' namespace, so the store is bound
# to that namespace; searches through it then default to the ingested data.
pine_vector_store = Pinecone.from_existing_index(
    os.getenv('PINECONE_INDEX'),
    OpenAIEmbeddings(model=EMBEDDING_MODEL),
    namespace='suncor',
)

In [None]:
# Confirm the vector store object was created.
pine_vector_store

In [None]:
from langchain.llms import OpenAI
# gpt-3.5-turbo is a chat model, so it is driven through LangChain's chat
# wrapper rather than the completion-style `OpenAI` LLM class.
from langchain.chat_models import ChatOpenAI

# Chat model that writes the final answer.
MODEL = 'gpt-3.5-turbo'

# "stuff" chain: the retrieved documents are placed into a single prompt.
# The retriever is pointed at the 'suncor' namespace, which is where the
# ingestion cells uploaded the data; without it the default namespace is searched.
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name=MODEL, temperature=0), 
    chain_type="stuff",
    retriever=pine_vector_store.as_retriever(search_kwargs={'namespace': 'suncor'})
)

In [None]:
# Ask a question through the retrieval chain; the returned answer is the
# cell's displayed output.
query = "What is SAGD?"
qa.run(query)