# Kaggle Notebook 기반 DeepSeek-OCR 실습 가이드
DeepSeek-OCR 모델을 사용하여 다음 작업을 수행합니다:
1. 일반 텍스트 추론 (LLM inference)
2. 이미지 URL 기반 VQA (Visual Question Answering)
3. 이미지 URL 기반 OCR (문자 인식)
4. Gradio 웹 인터페이스

## 1: 패키지 설치

In [None]:
# Restart the session (kernel) after running this cell.
!pip install -q transformers==4.57.1 torch einops addict easydict

In [None]:
# pathlib is imported locally because this cell comes before the notebook's
# main import cell; without it, running the cells in order raises NameError.
import pathlib

# Directory under the current working directory where the example images are
# written before inference.
BASE_DIR = pathlib.Path.cwd()
WORK_DIR = BASE_DIR / "deepseek_assets"
WORK_DIR.mkdir(exist_ok=True)

## 2: 환경 설정 및 임포트

In [None]:
import os
import pathlib
import sys
import tempfile
from io import BytesIO
from typing import Optional

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

## 3: 모델 다운로드 및 로딩

In [None]:
# Checkpoint identifier passed to both loaders below.
MODEL_ID = "prithivMLmods/DeepSeek-OCR-Latest-BF16.I64"  # Kaggle/Colab

# Both loaders are called with trust_remote_code=True; the model loader is
# additionally told to use safetensors weights.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_ID, use_safetensors=True, trust_remote_code=True)

In [None]:
# Pick the device and dtype that the model is moved to later in the notebook.
if torch.cuda.is_available():
    device = torch.device("cuda")
    dtype = torch.bfloat16
    print(f"✓ 하드웨어: CUDA GPU ({torch.cuda.get_device_name(0)})")
else:
    # Fall back to CPU so `device` and `dtype` are always defined; previously
    # the following cell failed with NameError on machines without CUDA.
    # NOTE(review): the checkpoint's remote `infer` code may itself require
    # CUDA; confirm before relying on CPU inference.
    device = torch.device("cpu")
    dtype = torch.float32
    print("⚠ 하드웨어: CPU (CUDA GPU를 찾을 수 없습니다)")

In [None]:
# Put the model in evaluation mode, then move it to the selected device/dtype.
model = model.eval()
model = model.to(device=device, dtype=dtype)
print(f"✓ 모델 로드 완료 - 디바이스: {device}, dtype: {dtype}")

## 4: 유틸리티 함수 정의

In [None]:
def download_image_from_url(url: str) -> Image.Image:
    """Fetch the image at *url* and return it converted to RGB.

    The request uses a 30-second timeout, and ``raise_for_status`` raises
    for an unsuccessful HTTP response before the body is decoded.
    """
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    # Decode from an in-memory buffer; nothing is written to disk here.
    payload = BytesIO(resp.content)
    return Image.open(payload).convert("RGB")

def run_inference(
    prompt: str,
    image_file: Optional[str] = None,
    *,
    base_size: int = 1024,
    image_size: int = 640,
    crop_mode: bool = True,
) -> str:
    """Run DeepSeek-OCR inference through ``model.infer``.

    Args:
        prompt: Prompt text handed to ``model.infer``.
        image_file: Path of the image file to process, or ``None`` to pass
            no image.
        base_size: Forwarded unchanged as ``base_size`` (default 1024).
        image_size: Forwarded unchanged as ``image_size`` (default 640).
        crop_mode: Forwarded unchanged as ``crop_mode`` (default True).

    Returns:
        The value returned by ``model.infer`` when called with
        ``eval_mode=True`` and ``save_results=False``.
    """
    # A throwaway directory is supplied as output_path and removed as soon as
    # the call returns.
    with tempfile.TemporaryDirectory() as output_dir:
        return model.infer(
            tokenizer,
            prompt=prompt,
            image_file=image_file,
            output_path=output_dir,
            base_size=base_size,
            image_size=image_size,
            crop_mode=crop_mode,
            save_results=False,
            test_compress=False,
            eval_mode=True,
        )

## 5: 예제 1 - VQA (Visual Question Answering)

In [None]:
# Prompt for the VQA example: the "<image>" tag on the first line, then the
# question on the next line.
vqa_prompt = "<image>\nWhat information can you extract from this chart? Describe it in detail"

# URL of the chart image the question is asked about.
image_url_vqa = "https://wpdatatables.com/wp-content/uploads/2020/08/chart5.jpg"

In [None]:
# Download the chart image and write it to the working directory, because
# run_inference is given a file path rather than an image object.
image_vqa = download_image_from_url(image_url_vqa)
temp_image_path = WORK_DIR.joinpath("vqa_temp.jpg")
image_vqa.save(temp_image_path)

In [None]:
# Show the downloaded chart image. Later cells also call display(), so this
# import must have run before them.
from IPython.display import display
display(image_vqa)

In [None]:
# Run VQA inference on the image saved to disk.
try:
    result_vqa = run_inference(vqa_prompt, image_file=str(temp_image_path))
    print(f"\n결과:\n{result_vqa}")
finally:
    # Remove the temporary image even when inference raises; missing_ok keeps
    # the cleanup from failing if the file is already gone.
    temp_image_path.unlink(missing_ok=True)

## 6: 예제 2 - 이미지 URL 기반 OCR

In [None]:
# URL of the image downloaded for the OCR example.
image_url_ocr = "https://img.36krcdn.com/hsossms/20251020/v2_dd7e09a2df204496acf739b55018b0b8@000000_oswg237433oswg1000oswg744_img_000?x-oss-process=image/format,jpg/interlace,1"

In [None]:
# Download the OCR sample and write it to the working directory, because
# run_inference is given a file path rather than an image object.
image_ocr = download_image_from_url(image_url_ocr)
temp_image_path = WORK_DIR.joinpath("ocr_temp.jpg")
image_ocr.save(temp_image_path)

# Preview the downloaded image.
display(image_ocr)

In [None]:
# OCR prompt: the "<image>" tag on the first line, then the instruction.
ocr_prompt = "<image>\nFree OCR."

# Run OCR inference on the image saved to disk.
try:
    result_ocr = run_inference(ocr_prompt, image_file=str(temp_image_path))
    print(f"\n추출된 텍스트:\n{result_ocr}")
finally:
    # Remove the temporary image even when inference raises; missing_ok keeps
    # the cleanup from failing if the file is already gone.
    temp_image_path.unlink(missing_ok=True)

## 7: 추가 예제 - 마크다운 변환

In [None]:
# URL of the image downloaded for the markdown-conversion example.
md_image_url = "https://sharelatex-wiki-cdn-671420.c.cdn77.org/learn-scripts/images/4/4d/MarkdownExample.png"

In [None]:
# Download the sample document image and write it to the working directory,
# because run_inference is given a file path rather than an image object.
md_ocr = download_image_from_url(md_image_url)
temp_image_path = WORK_DIR.joinpath("md_temp.jpg")
md_ocr.save(temp_image_path)

# Preview the downloaded image.
display(md_ocr)

In [None]:
# Markdown-conversion prompt: the "<image>" tag on the first line, then the
# "<|grounding|>" tag and the instruction.
markdown_prompt = "<image>\n<|grounding|>Convert the document to markdown."

# Run inference on the image saved to disk.
try:
    result_markdown = run_inference(markdown_prompt, image_file=str(temp_image_path))
    print(f"\n마크다운 결과:\n{result_markdown}")
finally:
    # Remove the temporary image even when inference raises; missing_ok keeps
    # the cleanup from failing if the file is already gone.
    temp_image_path.unlink(missing_ok=True)