In [1]:
!pip install pdfplumber pytesseract langchain ollama chromadb gradio
!pip install langchain
!pip install langchain_community

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting ollama
  Downloading ollama-0.4.4-py3-none-any.whl.metadata (4.7 kB)
Collecting chromadb
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting gradio
  Downloading gradio-5.9.0-py3-none-any.whl.metadata (16 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [2]:
import gradio as gr
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
import pdfplumber
import pytesseract
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import os

In [5]:
# Hugging Face token setup.
# SECURITY: never commit an access token to a notebook. The token is taken from
# the HF_TOKEN environment variable and is only prompted for (without echo) when
# that variable is missing. Any token that was ever hardcoded in this cell must be
# treated as leaked and revoked at https://huggingface.co/settings/tokens.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass  # stdlib; hides the token while it is typed

    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")
login(token=os.environ["HF_TOKEN"])

# Load the Hugging Face Llama model
def setup_llama_model(model_name="meta-llama/Llama-2-7b-hf"):
    """Load the tokenizer and causal-LM weights for ``model_name``.

    Args:
        model_name: Hugging Face Hub repository id. Defaults to the Llama 2 7B
            base checkpoint, which is the model this notebook was written for.

    Returns:
        tuple: ``(tokenizer, model)`` as returned by the two ``from_pretrained``
        calls.
    """
    # `token=` is the supported keyword; `use_auth_token=` is deprecated in
    # transformers. `.get` lets the library fall back to the credentials stored
    # by `login()` if the variable is not set.
    hf_token = os.environ.get("HF_TOKEN")
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token)
    return tokenizer, model

# Load the tokenizer and model once when this cell runs; rag_chain() reads these
# module-level globals when it generates an answer.
tokenizer, model = setup_llama_model()


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [6]:
# Extract the text of a single PDF page, falling back to OCR
def extract_text_with_ocr(page, ocr_resolution=300):
    """Return the text of one pdfplumber page, using OCR when it has no text layer.

    Args:
        page: Page object exposing ``extract_text()`` and ``to_image()``
            (a ``pdfplumber`` page).
        ocr_resolution: DPI used to rasterise the page before OCR. pdfplumber's
            default of 72 DPI is usually too coarse for Tesseract.

    Returns:
        The embedded text if the page has any, otherwise whatever
        ``pytesseract.image_to_string`` recognises on the rendered page.
    """
    text = page.extract_text()
    if not text:  # no embedded text layer (e.g. a scanned page)
        # to_image() returns a pdfplumber PageImage wrapper, which pytesseract
        # does not accept; the underlying PIL image lives in `.original`.
        page_image = page.to_image(resolution=ocr_resolution)
        text = pytesseract.image_to_string(page_image.original)
    return text

# Open a PDF, extract its text page by page (with OCR fallback) and index it
# in a vector store
def load_and_retrieve_docs(file):
    """Build a retriever over the text of the given PDF.

    Args:
        file: Path or file object accepted by ``pdfplumber.open``; ``None`` when
            nothing was uploaded.

    Returns:
        A retriever over the indexed chunks on success. On failure a ``str``
        describing the problem is returned instead of raising; callers tell the
        two apart with ``isinstance(result, str)``.
    """
    import uuid  # stdlib; used to give every upload its own collection

    if file is None:
        return "No PDF file was provided."

    page_texts = []
    try:
        with pdfplumber.open(file) as pdf:
            for page in pdf.pages:
                page_text = extract_text_with_ocr(page)
                # Keep only pages that actually produced text
                if page_text:
                    page_texts.append(page_text)
    except Exception as e:
        return f"Error reading PDF file: {e}"

    # Join with a newline so the last word of one page is not fused with the
    # first word of the next.
    text = "\n".join(page_texts)

    # Whitespace-only output (typical for OCR of blank pages) counts as no text
    if not text.strip():
        return "No text found in the PDF file."

    docs = [Document(page_content=text)]
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # Without an explicit name every call writes into the same default in-memory
    # collection, so chunks from previously uploaded PDFs would leak into the
    # results for this one. A unique name isolates each upload.
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        collection_name=f"pdf-{uuid.uuid4().hex}",
    )
    return vectorstore.as_retriever()


In [7]:
# Pull the text out of every Document in the list and glue it into one string
def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

# RAG chain: retrieve context from the uploaded PDF and generate an answer
def rag_chain(file, question):
    """Answer ``question`` using passages retrieved from the PDF in ``file``.

    Args:
        file: Uploaded PDF, passed straight to ``load_and_retrieve_docs``.
        question: The user's question.

    Returns:
        str: The generated answer, or a human-readable error message when the
        question is empty or the PDF could not be indexed.
    """
    # Check the cheap precondition before parsing and embedding the PDF
    if not question or not question.strip():
        return "Please enter a question."

    retriever = load_and_retrieve_docs(file)
    if isinstance(retriever, str):  # a string result is an error message
        return retriever

    # `invoke` is the supported retriever entry point; `get_relevant_documents`
    # is deprecated.
    retrieved_docs = retriever.invoke(question)
    formatted_context = format_docs(retrieved_docs)
    formatted_prompt = f"질문: {question}\n\n컨텍스트: {formatted_context}\n\n답변:"

    # Keep the inputs on the same device as the model (a no-op on CPU)
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=200)

    # generate() returns prompt tokens followed by the new tokens; drop the
    # prompt so the user sees only the answer, not the echoed question/context.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return response.strip()

In [8]:
# Gradio interface: a file upload plus a question box as inputs, plain text as
# output, with rag_chain as the handler.
iface = gr.Interface(
    title="[Llama 2] RAG 검색 활용 챗봇 시스템",
    description="PDF파일을 업로드하고 질문을 입력하면 답변을 생성해 드립니다.",
    fn=rag_chain,
    inputs=["file", "text"],
    outputs="text",
)

# Start the app
iface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c66740d36edddb839d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


