In [20]:
! pip install -U --quiet langchain-google-genai
! pip install -U --quiet langchain tiktoken pypdf sentence_transformers chromadb

In [15]:
## Import the required libraries
import getpass
import os

from IPython.display import Markdown
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnableMap

In [16]:
## 1. Loader (a PDF loader for PDFs; a plain-text loader would be used for text)
### Besides the text, a loader attaches metadata to each document; the documents
### retrieved later in this notebook carry 'page' and 'source' entries.
PDF_PATH = "2024_연세대_수시.pdf"
loader = PyPDFLoader(PDF_PATH)
# NOTE(review): load_and_split() presumably also applies a default text
# splitter, so `pages` may not be exactly one Document per page - confirm.
pages = loader.load_and_split()

In [17]:
## 2. Text splitter: cut the loaded documents into overlapping chunks
splitter_options = {"chunk_size": 500, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
# `texts` is the chunked document list that gets embedded and indexed below.
texts = text_splitter.split_documents(pages)

In [18]:
# 3. Embedding model (OpenAI or Hugging Face; a Hugging Face model is used here)
model_name = "jhgan/ko-sbert-nli"
model_kwargs = dict(device='cpu')  # run the embedding model on the CPU
encode_kwargs = dict(normalize_embeddings=True)  # ask the encoder to normalize vectors
hf = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [21]:
## 4. Vector store (Chroma): embed the chunks with `hf` and index them
# NOTE(review): re-running this cell in the same kernel may append the same
# chunks again to the default collection - confirm and restart the kernel if so.
docsearch = Chroma.from_documents(documents=texts, embedding=hf)

In [30]:
## 5. Retriever
# MMR search: `fetch_k` candidates are fetched from the vector store and `k`
# of them are returned to the caller.
retriever = docsearch.as_retriever(
    search_type="mmr",
    search_kwargs={'k': 5, 'fetch_k': 50},
)
# Smoke test with a sample query; the RAG chain later passes the user's
# question here. `invoke` replaces the deprecated `get_relevant_documents`.
retriever.invoke("추천형에 대해 알려줘")

[Document(metadata={'page': 2, 'source': '2024_연세대_수시.pdf'}, page_content='지원자 유의사항 요약  5. ( ) ·······································································································································································18\n세부 전형별 안내< >\n학생부위주 학생부교과전형 추천형 및 학생부종합전형 활동우수형 국제형 기회균형. | [ ] [ , , ]Ⅴ ·······························································19\n학생부위주 학생부교과전형 추천형  _1. | [ ]Ⅴ ·········································································································································20'),
 Document(metadata={'page': 48, 'source': '2024_연세대_수시.pdf'}, page_content='연세대학교 서울캠퍼스\n44 세부 전형별 안내 논술전형 . | Ⅵ제출서류6. \n가 서류제출 안내  . \n제출 기간 전형일정 참조    1) : ‘5. ’ \n제출 방식 방문 택배 또는 등기우편 제출 등기우편은 금    2) : ( ) ( 2024.9.13.( )  소인까지 유효)\n제출 주소 우 서울특별시 서대문구 연세로 연세대학교 동 백주년기념관 층 입학전형실 수시모집 담당자 앞    3) : ( ) 03722 50 201 ( ) 3\n인터넷 원서접수 사이트에서 제출서류 안내 를 참고하여 서류제출용 봉투표지 를 출력 후 봉투에 부착       ‘ ’ ‘ ’※ \n도착 확인 월 연세대학교 입학처 홈페이지  

In [None]:
## Prompt: the model is told to answer using only the retrieved context.
template = """Answer the question as based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

## 6. LLM selection
# Never hard-code the key: keep an existing GOOGLE_API_KEY from the environment
# and only ask for it interactively (without echoing it) when it is missing.
if not os.environ.get('GOOGLE_API_KEY'):
    os.environ['GOOGLE_API_KEY'] = getpass.getpass("Google API key: ")
# NOTE(review): confirm that the "gemini-pro" model id is still served by the API.
gemini = ChatGoogleGenerativeAI(model="gemini-pro", temperature = 0)


def format_docs(docs):
    """Join the page_content of the retrieved documents into one context string.

    Passing the raw Document list to the prompt would insert its Python repr
    (including metadata) instead of the plain text of the passages.
    """
    return "\n\n".join(doc.page_content for doc in docs)


## RAG chain: retrieve context for the question -> fill the prompt -> generate
chain = RunnableMap({
    # `invoke` replaces the deprecated `get_relevant_documents`.
    "context": lambda x: format_docs(retriever.invoke(x['question'])),
    "question": lambda x: x['question']
}) | prompt | gemini


# Render the model's answer as Markdown in the cell output.
Markdown(chain.invoke({'question': "묻고 싶은 문장"}).content)