In [19]:
import os 
from dotenv import load_dotenv
# load_dotenv has to be *called* to read the .env file into the process
# environment; a bare reference to the function object is a no-op.
load_dotenv()
# os.getenv returns None when the variable is absent.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [20]:
from langchain_community.document_loaders import PyPDFLoader
# PyPDFLoader is only one of several PDF loaders; per the author's note it keeps
# per-page metadata, and each page is converted into its own object.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
pdf_path = r"C:\Users\82106\pdf-bot\attention_is_all_you_need.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load()

In [21]:
# Show how many objects the loader returned.
len(pages)

15

In [22]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter options start here.
# chunk_size and chunk_overlap are the two settings that must always be kept in mind.
# With an overlap, text that gets cut at a chunk boundary is still covered by
# the part shared with the neighbouring chunk.
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    # Not set here (left commented out by the author):
    # "length_function": len,
    # "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [23]:
# Split the loaded pages with text_splitter.
splits = text_splitter.split_documents(pages)

# Show how many chunks were produced.
len(splits)

52

In [24]:
# Load the embedding model.

from langchain_openai import OpenAIEmbeddings

# Pass the key *variable* (not the literal text "OPENAI_API_KEY") and keep the
# instance, so the same configured embedding model is the one the vector
# store actually uses.
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

# Load the vector store.

from langchain_chroma import Chroma

# Build the Chroma store from the split chunks using the embedding model above.
db = Chroma.from_documents(splits, embeddings)

In [25]:
# Similarity search: return the 3 best matches by similarity.
similarity_search_options = {"k": 3}
similarity_retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs=similarity_search_options,
)

In [26]:
# MMR search: a diversity-aware search.
# Per the author's note: fetch 10 candidates by similarity, then pick 3 with
# diversity taken into account.
mmr_search_options = dict(k=3, fetch_k=10)
mmr_retriever = db.as_retriever(
    search_type="mmr",
    search_kwargs=mmr_search_options,
)

In [27]:
# There are several kinds of retrievers; this cell tries the vector-store retriever
# with no extra options.
retriever = db.as_retriever()
# Display the retriever object.
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x00000257F51B9C40>)

In [28]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text with two placeholders: {context} and {input}.
# It instructs the model to answer based only on the supplied context.
template = (
    "Answer the question based only on the following context:\n"
    "<context>\n"
    "{context}\n"
    "</context>\n"
    "\n"
    "Question: {input}\n"
)

# Build the chat prompt from the template string.
prompt = ChatPromptTemplate.from_template(template)

In [29]:
# The first project was written with LangChain's expression language; this one
# is implemented using only LangChain's ready-made chain helpers.
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

model = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    temperature=0,
    api_key=OPENAI_API_KEY,
)

# Chain combining the chat model with the prompt.
document_chain = create_stuff_documents_chain(model, prompt)

# Retrieval chain that uses the similarity retriever.
retrieval_chain = create_retrieval_chain(similarity_retriever, document_chain)

# The question text is kept exactly as originally asked.
similarity_question = "why attention matters in transformers? explain to 8 years old. would you say 'attenion' is understaing of 'context'?"
similarity_reponse = retrieval_chain.invoke({"input": similarity_question})

# Display the full response.
similarity_reponse

{'input': "why attention matters in transformers? explain to 8 years old. would you say 'attenion' is understaing of 'context'?",
 'context': [Document(metadata={'page': 0, 'source': 'C:\\Users\\82106\\pdf-bot\\attention_is_all_you_need.pdf'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brain\nnoam@google.comNiki Parmar∗\nGoogle Research\nnikip@google.comJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.comAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an 

In [30]:
# Same document chain, but retrieval goes through the MMR retriever.
mmr_retrieval_chain = create_retrieval_chain(mmr_retriever, document_chain)

mmr_query = {"input": "what is the attention mechanism in transformers?"}
mmr_response = mmr_retrieval_chain.invoke(mmr_query)

# Display the full response.
mmr_response

{'input': 'what is the attention mechanism in transformers?',
 'context': [Document(metadata={'page': 0, 'source': 'C:\\Users\\82106\\pdf-bot\\attention_is_all_you_need.pdf'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brain\nnoam@google.comNiki Parmar∗\nGoogle Research\nnikip@google.comJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.comAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the 