In [6]:
#!pip install langchain
#!pip install unstructured[pdf]
#!pip install --upgrade openai
#!pip install tiktoken

Restart the kernel.

In [1]:
import os
import openai
from dotenv import load_dotenv

from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import AzureSearch

from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate

In [2]:
# Load environment variables
load_dotenv()

True

In [6]:
# Configure OpenAI API
openai.api_type = "azure"
openai.api_base = os.getenv('OPENAI_API_BASE')
openai.api_key = os.getenv('OPENAI_API_KEY')
openai.api_version = os.getenv('OPENAI_API_VERSION')

# Initialize gpt-35-turbo and our embedding model
llm = AzureChatOpenAI(deployment_name="gpt-35-turbo")
embeddings = OpenAIEmbeddings(deployment_id="text-embedding-ada-002", chunk_size=1)

# Connect to Azure Cognitive Search
acs = AzureSearch(azure_search_endpoint=os.getenv('AZURE_SEARCH_SERVICE_ENDPOINT'),
                 azure_search_key=os.getenv('AZURE_SEARCH_ADMIN_KEY'),
                 index_name="openai-learning-days-mb-index",
                 embedding_function=embeddings.embed_query)

                    deployment_id was transferred to model_kwargs.
                    Please confirm that deployment_id is what you intended.


In [4]:
# Load PDF files from "data" folder. This may take few minutes. Recommended to start with 2-3 PDF files.
directory = "./data"

def load_docs(directory):
  loader = DirectoryLoader(directory)
  documents = loader.load()
  return documents

documents = load_docs(directory)
print(len(documents))

2


In [5]:
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  docs = text_splitter.split_documents(documents)
  return docs

docs = split_docs(documents)
print(len(docs))

2190


In [7]:
# Add documents to Azure Search
acs.add_documents(documents=docs)

['M2M0OGVjZTEtMzExZS00YTI4LWFhM2QtYzk1ZmQzNTMxMzJl',
 'NDI4ZmNmNjQtOGQ0Ny00ZTVhLTljYjgtMDc2OWVkODNiM2Yw',
 'OTFlN2JmZWMtMzliNS00MDEyLWFlYWQtN2Q2YWQ4ZDZjY2Ji',
 'MWZhMzg2NzctYjA4MC00Y2M2LTk5YzItMTBhM2FlNWM1MWUy',
 'YmE4Y2NhMDUtNDYxNC00ZjRhLWJlOGMtNTlkODA4YmVjYWVh',
 'YWQwNTVjNzYtMTQyNC00OGNhLTk5ZDgtZWRkNDIxOGYxZWMy',
 'MTVjNWEwZTktZjI5ZS00N2QwLTkwNTQtOGEyZDMwMDUyYTkw',
 'YTQ2OGI1Y2ItYjQ5Mi00ZjQ5LWExNmMtNmYyMTkwODBjMzRl',
 'Yzc0ZGU5NGMtZDRjNS00MzI0LWJhMWEtZDgzZDdhMWQzNmQ5',
 'OWUyMzZlNGEtNDVhYS00NDcyLTlhNzctMmY1ODQ0MjE2MTc3',
 'OGU3MDIxNDEtNTJhNC00ZGY3LWFkOTYtNWNjZmFjZTkxMjBm',
 'NzlmNzQ4YjktZTlhOS00YWViLWFiZWYtMzliNWI5MTk1YTA3',
 'OWEzN2UyYzYtMDRmYi00YTFhLWEwYzAtNmNjMGQ1NjA1MjNm',
 'Mzc0YWU5ZTctYmFhZi00ODBhLWE2M2EtZmVkNzhiZjc1NDUz',
 'MDQxMjA5NDgtYjVjMS00NDk0LTg3ZmMtMGVkZGZhYzBmOTlm',
 'NzJlYjI3YzUtYWQwZC00NmJkLWExNzMtNTdlMTlkNGIyZGFh',
 'MTVhZTFhZjAtZGJiOS00ZGM1LWEyMzMtZTliZjg1NzM0MzI1',
 'NTU0NzhhYjktZDY5ZS00Y2QyLWExNjUtZDYyMDhiMzQyYWU0',
 'NDA3YjhkYjctNjkwMi00ZjQ1LWFkNjMtMjhmYTlkN2Vl