In [5]:
import cv2
import easyocr
import json
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import re

# Create the shared EasyOCR reader for Korean and English text.
# gpu=False requests CPU execution.
reader = easyocr.Reader(['ko', 'en'], gpu=False)

# Amount: optional currency symbol, then either comma-grouped thousands
# ("15,000") or a plain digit run ("15000"), with optional two-digit decimals.
# The plain-digit alternative matters: a bare `\d{1,3}(?:,\d{3})*` would
# capture only "150" out of "15000".
_AMOUNT = r'[₩$€]?\s?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?'

# Patterns are compiled once at import time instead of on every call.
_BUSINESS_NUMBER_RE = re.compile(r'\b\d{3}-\d{2}-\d{5}\b')
_PRICE_RE = re.compile(r'결제금액.*?(' + _AMOUNT + r')')
_STORE_NAME_RE = re.compile(
    r'(?:상호명|회사명|업체명|가맹점명|가맣점명)\s*[:：]?\s*(\S+)'
)
_TRANSACTION_DATE_RE = re.compile(
    r'(?:거래일시|결제일시|거래일|결제날짜|날짜)\s*[:：]?\s*'
    r'(\d{4}[-/]\d{2}[-/]\d{2})\s+(\d{2}):\s?(\d{2}):\s?(\d{2})'
)
_ITEM_PRICE_RE = re.compile(
    r'(?:품목|아이템|상품)\s*[:：]?\s*(.*?)\s*가격\s*[:：]?\s*(' + _AMOUNT + r')'
)
_TOTAL_PRICE_RE = re.compile(r'총\s*가격\s*[:：]?\s*(' + _AMOUNT + r')')


def _find_all(pattern, text_list):
    """Return every `pattern.findall` hit across the fragments, in order."""
    return [match for text in text_list for match in pattern.findall(text)]


def _draw_detections(image, results):
    """Return an RGB PIL copy of `image` with each detection boxed and labelled."""
    annotated = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(annotated)

    for detection in results:
        bbox, text = detection[0], detection[1]

        # Take the extremes over every corner rather than trusting corners 0
        # and 2 to be top-left / bottom-right: for skewed text those two can
        # yield inverted coordinates.
        xs = [int(point[0]) for point in bbox]
        ys = [int(point[1]) for point in bbox]
        x_min, y_min, x_max, y_max = min(xs), min(ys), max(xs), max(ys)

        draw.rectangle([x_min, y_min, x_max, y_max], outline="blue", width=2)
        try:
            # Clamp so labels of boxes near the top edge stay inside the image.
            draw.text((x_min, max(y_min - 20, 0)), text, fill="blue")
        except UnicodeEncodeError:
            # NOTE(review): some Pillow versions appear to fall back to a
            # bitmap default font that cannot encode non-Latin text - confirm.
            # The label is cosmetic, so skip it instead of aborting.
            continue

    return annotated


def process_image_and_extract_info(image_path='bill2.jpeg',
                                   json_file_path='extracted_info.json',
                                   annotated_image_path=None):
    """
    Run OCR on a receipt image, extract known fields and save them as JSON.

    Each OCR text fragment is matched independently against the field
    patterns, so a label and its value that were detected as separate
    fragments will not be paired.

    Args:
        image_path (str): Input image path (default: 'bill2.jpeg').
        json_file_path (str): Output JSON file path
            (default: 'extracted_info.json').
        annotated_image_path (str | None): If given, a copy of the image with
            every detection boxed and labelled in blue is saved to this path.
            When None (default) no annotated image is produced.

    Returns:
        dict: Lists of matches keyed by "사업자번호", "가격", "가맹점명",
        "거래일시", "품목별 가격" and "총 가격". "품목별 가격" holds
        (item, price) tuples; the other lists hold strings.

    Raises:
        OSError: If the image cannot be read from `image_path`.
    """
    # cv2.imread signals failure by returning None rather than raising.
    image = cv2.imread(image_path)
    if image is None:
        raise OSError(f"Could not read the image from {image_path}")

    # Each detection is indexed as (bounding box, text, ...).
    results = reader.readtext(image)
    extracted_text = [detection[1] for detection in results]

    # Only build the annotated image when the caller asked for it.
    if annotated_image_path is not None:
        _draw_detections(image, results).save(annotated_image_path)

    info_dict = {
        "사업자번호": _find_all(_BUSINESS_NUMBER_RE, extracted_text),
        "가격": _find_all(_PRICE_RE, extracted_text),
        "가맹점명": _find_all(_STORE_NAME_RE, extracted_text),
        # Normalise "HH: MM: SS" style OCR spacing to "HH:MM:SS".
        "거래일시": [
            f"{date} {hour}:{minute}:{second}"
            for date, hour, minute, second
            in _find_all(_TRANSACTION_DATE_RE, extracted_text)
        ],
        "품목별 가격": _find_all(_ITEM_PRICE_RE, extracted_text),
        "총 가격": _find_all(_TOTAL_PRICE_RE, extracted_text),
    }

    # ensure_ascii=False keeps the Korean keys and values human-readable.
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(info_dict, json_file, ensure_ascii=False, indent=4)
    print(f"Extracted information has been saved to {json_file_path}")

    print("추출된 정보:", json.dumps(info_dict, indent=4, ensure_ascii=False))

    return info_dict

# Process the default image and keep the extracted fields.
extracted_info = process_image_and_extract_info()



Using CPU. Note: This module is much faster with a GPU.


Extracted information has been saved to extracted_info.json
추출된 정보: {
    "사업자번호": [
        "307-21-61771"
    ],
    "가격": [],
    "가맹점명": [
        "타임유"
    ],
    "거래일시": [
        "2024-07-20 10:02:16"
    ],
    "품목별 가격": [],
    "총 가격": []
}
