In [6]:
# Install LangChain and the document-parsing helpers.
# `pdfminer.six` (the maintained fork) replaces the legacy `pdfminer`
# distribution: both provide the same `pdfminer` import package, so
# installing the legacy one alongside it leaves a broken mix of files.
!pip install langchain pypdf unstructured pdf2image docx2txt pdfminer.six



In [7]:
# Upgrade langchain-community using the explicit %pip line magic
# (the bare `pip ...` form only works while IPython automagic is on).
%pip install -U langchain-community



In [8]:
# Install the pdfminer.six distribution through a shell call to pip.
!pip install pdfminer.six



In [9]:
# Install the CPU build of FAISS using the explicit %pip line magic
# (the bare `pip ...` form only works while IPython automagic is on).
%pip install faiss-cpu



In [10]:
# Install the remaining third-party packages through a shell call to pip.
# NOTE(review): `langchain` and `pypdf` are already installed by an earlier
# cell, and `chromadb`, `transformers` and `sentence_transformers` are not
# referenced by the code visible in this notebook - confirm they are needed.
!pip install chromadb tiktoken transformers sentence_transformers openai langchain pypdf



In [36]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
import os
# Read the OpenAI API key from the environment instead of assigning a literal:
# a hard-coded value would overwrite a real key already exported in the
# environment and would leak the secret whenever the notebook is shared.
my_api_key = os.environ.get("OPENAI_API_KEY")
if not my_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this cell."
    )

# PDF loader for the reference document that is indexed for retrieval.
loader = PyPDFLoader("태깅_AI.pdf")

# Token-based splitter: chunks of up to 1000 tokens with no overlap.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000,
    chunk_overlap=0,
)

# Embedding model used to vectorise the chunks.
embeddings = OpenAIEmbeddings(openai_api_key=my_api_key)

# Build the vector store.
# Use load() rather than load_and_split(): the latter already applies a
# default splitter, so running text_splitter afterwards would split the
# document twice and index chunks produced by two different splitters.
documents = loader.load()
texts = text_splitter.split_documents(documents)
if not texts:
    # Fail early with a clear message instead of an obscure error from FAISS.
    raise ValueError("No text could be extracted from the PDF; nothing to index.")
vectorstore = FAISS.from_documents(texts, embeddings)

# OpenAI language model that answers the retrieval-augmented queries.
llm = OpenAI(openai_api_key=my_api_key)

# RetrievalQA chain: retrieves chunks from the vector store and passes them,
# together with the query, to the LLM.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",  # 'stuff' chain type; change if another strategy is needed
)

# Prompt template for the tagging request.
# Only `document` is declared because it is the only placeholder in the
# template; the previously declared `tags` variable was never used or supplied.
prompt_template = PromptTemplate(
    template="""You are an AI system that tags documents based on their content. Given the document text and a list of tags, suggest the most relevant tags with their similarity scores.

    Document: {document}

    Provide the tags in descending order of similarity.""",
    input_variables=["document"],
)

# Text to be tagged.
document_text = "할머니가 버스승차권발급기를 만지지 못하신다"

# First query: ask for tag suggestions with similarity scores.
query1 = prompt_template.format(document=document_text)
response1 = qa_chain.run(query1)

# Second query: feed the first answer back and ask for the single best tag.
document_text2 = f"{response1} 가장 유사도가 높은 태그1개만 출력하라"
query2 = prompt_template.format(document=document_text2)
response2 = qa_chain.run(query2)

# Print both responses.
print("태그 추천:", response1)
print("가장 유사한 태그:", response2)


태그 추천: 
1. Elderly Support (0.9)
2. Public Transportation (0.8)
3. Technology Education (0.7)
4. Convenience (0.6)
5. Language and Communication (0.5)
가장 유사한 태그: 
1. Elderly Support - 0.9
