In [1]:
from langchain.embeddings.openai import OpenAIEmbeddings 
from langchain.text_splitter import CharacterTextSplitter 
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate
from langchain import OpenAI, ConversationChain

from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Import PyPDF2
import PyPDF2
import numpy as np 

In [2]:
import os 
# Load openai api key 
openai_api_key = os.environ.get('OPENAI_API_KEY')

# Load Pinecone API key 
pinecone_api_key = os.environ.get('PINECONE_API_KEY')

# Load pinecode api env 
pinecone_api_env = os.environ.get('PINECONE_API_ENV')

In [3]:
# Load openai api key 
openai_api_key = os.environ.get('OPENAI_API_KEY')
# Load Pinecone API key 
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
# Load pinecode api env 
pinecone_api_env = os.environ.get('PINECONE_API_ENV')

# Extract text from the PDF
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfFileReader(file)
        text = ''
        for page_num in range(reader.numPages):
            text += reader.getPage(page_num).extractText()
    return text

def get_all_pdf_files_in_directory(directory):
    return [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.pdf')]

In [4]:
import pinecone

# Initialize a Pinecone client

pinecone.init(api_key=pinecone_api_key, api_env=pinecone_api_env)
#help(pinecone.init)

  from tqdm.autonotebook import tqdm


In [5]:
# Pick a name for the new index
index_name = 'semantic-text-search'

In [6]:
import pandas as pd
import numpy as np
import time
import re
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import sqlite3

pd.set_option('display.max_colwidth', 200)

In [7]:
import os
import PyPDF2
import pinecone
from sentence_transformers import SentenceTransformer

In [8]:
# Generate vector embeddings
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

In [9]:
data_directory = "data/"
pdf_files = get_all_pdf_files_in_directory(data_directory)

In [10]:
bnr_report_reader = UnstructuredPDFLoader("data/Annual_Report_2021_22_Web_English_Versio.pdf")
bnr_report_reader_data = bnr_report_reader.load()

embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
# conversation = ConversationChain(llm=llm, verbose=True)

In [11]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("data/Annual_Report_2021_22_Web_English_Versio.pdf")
pages = loader.load_and_split()

In [None]:
# Pick a name for the new index
simple_index_name = 'stocks-trends'

In [12]:
# Check whether the index with the same name already exists
if simple_index_name in pinecone.list_indexes():
    pinecone.delete_index(simple_index_name)

In [13]:
# Create a new index
pinecone.create_index(name=simple_index_name, dimension=128)

In [14]:
# Define nlp
import spacy
nlp = spacy.load("en_core_web_sm")

def extract_named_entities(text_batch):
    # extract named entities using the NER pipeline
    extracted_batch = nlp(text_batch)
    entities = []
    # loop through the results and only select the entity names
    for text in extracted_batch:
        ne = [entity["word"] for entity in text]
        entities.append(ne)
    return entities

In [15]:
pinecone.list_indexes()

In [None]:
pinecone.describe_index("pinecone-index")

In [None]:
pinecone.create_index("example-index", dimension=128, metric="euclidean", pods=4, pod_type="s1.x1")

In [None]:
pinecone.create_index("example-index", dimension=128, source_collection="example-collection")

In [None]:
pinecone.configure_index("my_index", pod_type="s1.x2")

In [None]:
pinecone.describe_index("example-index")

In [None]:
pinecone.configure_index("example-index", replicas=4)

In [16]:
from pprint import pprint
# Define retriever
retriever = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Define index
index = pinecone.Index()

def search_pinecone(query):
    # extract named entities from the query
    ne = extract_named_entities([query])[0]
    # create embeddings for the query
    xq = retriever.encode(query).tolist()
    # query the pinecone index while applying named entity filter
    xc = index.query(xq, top_k=10, include_metadata=True, filter={"named_entities": {"$in": ne}})
    # extract article titles from the search result
    r = [x["metadata"]["title"] for x in xc["matches"]]
    return pprint({"Extracted Named Entities": ne, "Result": r})

In [17]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings(openai_api_key=openai_api_key))
docs = faiss_index.similarity_search("What are the top economic challenges facing Rwanda?", k=2)
for doc in docs:
    print(str(doc.metadata["page"]) + ":", doc.page_content)

146: NBR Annual Report | 2021 - 2022147
 # Policy Paper Objective Findings Recommendations
1Monitoring the 
Economic Impact 
of COVID-19To describe how various 
high-frequency indicators, 
as well as the computation 
of weekly index of economic 
activity, were used in Rwanda 
to monitor the economic 
impact of COVID-19. The pandemic has substantially 
weakened economic performance 
through demand and supply shocks 
and affected all sectors, especially 
the manufacturing and service 
sectors.Proposed various measures to 
recover back to the pre-pandemic 
level successfully.
2Monetary policy, 
credit growth, and 
Economic activity 
in RwandaTo provide a deeper analysis 
of the effect of monetary 
policy on target variables, 
such as credit, output, and 
inflation in Rwanda(1) A positive shock on the 
interbank reduces the credit by one 
percentage point, and positive credit 
shocks allow agents to spend more, 
thus a short but positive reaction for 
the Gross Domestic Product (GDP). 
(2)

In [18]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(pages)
print (f'Now you have {len(texts)} documents')

Now you have 494 documents


In [19]:
import pinecone

pinecone.init(api_key=pinecone_api_key)

pinecone.create_index("national_bank_index", dimension=1024)

In [None]:
pinecone.init(api_key="YOUR_API_KEY", environment="YOUR_ENVIRONMENT")
index = pinecone.Index("example-index")

index_stats_response = index.describe_index_stats()

In [20]:
import chromadb
# setup Chroma in-memory, for easy prototyping. Can add persistence easily!
client = chromadb.Client()

# create a new index 
index = client.create_index("bnr_index")

# add documents to the index
for text in texts:
    index.add_document(text)

# search for similar documents  
results = index.search("What are the top economic challenges facing Rwanda?", k=2)
for result in results:
    print(result)

Using embedded DuckDB without persistence: data will be transient


In [None]:
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings

In [None]:
import pinecone

pinecone.init(api_key=pinecone_api_key)
index = pinecone.Index("example-index")

In [None]:
metadata_config = {'indexed': ['color']}

In [None]:
# Use Pinecone to implement a vector store 
pinecone_index = Pinecone(index_name="example-index", metadata_config=metadata_config)

# Use OpenAI to implement an embedding model
openai_embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Use Weaviate to implement a vector store  

weaviate_index = Weaviate("http://localhost:8080", "bnr_index")

# Use FAISS to implement a vector store
faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings(openai_api_key=openai_api_key))

In [21]:

# Use approximate nearest neighbor search to find similar documents
docs = faiss_index.similarity_search("What are the top economic challenges facing Rwanda?", k=2)
for doc in docs:
    print(str(doc.metadata["page"]) + ":", doc.page_content)

146: NBR Annual Report | 2021 - 2022147
 # Policy Paper Objective Findings Recommendations
1Monitoring the 
Economic Impact 
of COVID-19To describe how various 
high-frequency indicators, 
as well as the computation 
of weekly index of economic 
activity, were used in Rwanda 
to monitor the economic 
impact of COVID-19. The pandemic has substantially 
weakened economic performance 
through demand and supply shocks 
and affected all sectors, especially 
the manufacturing and service 
sectors.Proposed various measures to 
recover back to the pre-pandemic 
level successfully.
2Monetary policy, 
credit growth, and 
Economic activity 
in RwandaTo provide a deeper analysis 
of the effect of monetary 
policy on target variables, 
such as credit, output, and 
inflation in Rwanda(1) A positive shock on the 
interbank reduces the credit by one 
percentage point, and positive credit 
shocks allow agents to spend more, 
thus a short but positive reaction for 
the Gross Domestic Product (GDP). 
(2)