ini jadi 

In [8]:
import json
import os
import re
import tempfile
from typing import List

import google.generativeai as genai
import nest_asyncio
import pandas as pd
import psycopg2
import uvicorn
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from fuzzywuzzy import fuzz
from PIL import Image
from pydantic import BaseModel
from sentence_transformers import util

Ambil data pusat dari PostgreSQL

In [9]:
def load_data_pusat():
    """Load the central product table (``data_pusat``) from PostgreSQL.

    Connection settings default to the local development database and can
    be overridden through the libpq-style environment variables
    ``PGDATABASE``, ``PGUSER``, ``PGPASSWORD``, ``PGHOST`` and ``PGPORT``.

    Returns:
        A tuple ``(df, descriptions, ids, prices)``:
        ``df`` is the table with every row containing a null dropped,
        ``descriptions`` is the lower-cased ``Description`` column as a list
        of strings, ``ids`` is the ``id`` column and ``prices`` is the
        ``Harga Jual ke Konsumen yg Disarankan`` column, all in row order.
    """
    # NOTE(review): the fallback password is a credential committed to
    # source; set PGPASSWORD in the environment and remove the literal.
    conn = psycopg2.connect(
        dbname=os.environ.get("PGDATABASE", "strukAI"),
        user=os.environ.get("PGUSER", "postgres"),
        password=os.environ.get("PGPASSWORD", "bairn1021"),
        host=os.environ.get("PGHOST", "localhost"),
        port=os.environ.get("PGPORT", "5432"),
    )
    try:
        df = pd.read_sql_query("SELECT * FROM data_pusat", conn)
    finally:
        # Close the connection even when the query fails.
        conn.close()

    df = df.dropna()
    descriptions = df["Description"].astype(str).str.lower().tolist()
    ids = df["id"].tolist()
    prices = df["Harga Jual ke Konsumen yg Disarankan"].tolist()

    return df, descriptions, ids, prices


OCR menggunakan Gemini

In [10]:
# Configure the Gemini client once for the whole module.
# NOTE(review): the fallback key is a secret committed to source; rotate it,
# supply it through the GOOGLE_API_KEY environment variable, and drop the literal.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", "AIzaSyCG8IGd5lgD4m2UocqUGOyGtyd0jM6O4vU"))
model_gemini = genai.GenerativeModel("gemini-2.5-flash")

def OCR_Gemini(image_path):
    """Extract structured receipt data from an image using Gemini.

    Args:
        image_path: Path of the receipt image to read.

    Returns:
        The JSON object produced by the model, decoded into Python objects.
        The prompt asks for the keys ``invoice_number``, ``phone``,
        ``alamat``, ``email``, ``nama_toko``, ``tanggal``, ``daftar_barang``
        and ``total_belanja``; unclear values are requested as null.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON after the
            markdown code fences are stripped.
    """
    prompt = """
    Ini adalah struk belanja. Tolong ekstrak informasinya dalam format JSON dengan field:
    - invoice_number (string)
    - phone (string)
    - alamat (string)
    - email (string)
    - nama_toko (string)
    - tanggal (string, format DD/MM/YYYY)
    - daftar_barang (array of objects: nama, qty, harga_satuan, subtotal)
    - total_belanja (number)
    Jika ada informasi yang tidak jelas, isi dengan null.
    Hanya kembalikan JSON-nya saja, tanpa penjelasan atau markdown formatting.dalam lowercase
    """
    # Use the image as a context manager so its file handle is released as
    # soon as the request is done (the caller may want to delete the file).
    with Image.open(image_path) as img:
        response = model_gemini.generate_content([prompt, img])
    text = response.text
    # Strip an opening ``` / ```json fence (any letter case) and a closing ```.
    cleaned = re.sub(
        r'^```(?:json)?|```$', '', text, flags=re.MULTILINE | re.IGNORECASE
    ).strip()
    return json.loads(cleaned)


Fungsi untuk menentukan kesamaan data hasil OCR dengan data yang ada di pusat

In [11]:
def _to_float(value):
    """Convert an OCR field to float; return None when it is missing or not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def _normalize_confidence(raw):
    """Coerce the model's confidence value to a float; unusable values become 0.0."""
    if isinstance(raw, list) and raw:
        raw = raw[0]
    try:
        value = float(raw)
    except (TypeError, ValueError):
        return 0.0
    # NaN compares unequal to itself; treat it as "no confidence".
    return value if value == value else 0.0


def match_items(items, descriptions, ids, prices, threshold=0.59):
    """Match OCR'd receipt items against the central product descriptions.

    For every item the full ``descriptions`` list and the item's name are
    sent to Gemini, which is asked for a JSON object with ``id``, ``nama``
    and ``confidence``.

    Args:
        items: Iterable of dicts with the keys ``nama``, ``qty``,
            ``harga_satuan`` and ``subtotal``.
        descriptions: Product descriptions embedded in the prompt.
        ids: Accepted for interface compatibility; not used here.
        prices: Accepted for interface compatibility; not used here.
        threshold: Minimum confidence for a match to be accepted.

    Returns:
        One dict per item with ``id``, ``name`` and ``ocr_result``. ``id``
        and ``name`` are None when the confidence is below ``threshold``.
        Numeric OCR fields that are missing or not numeric become None.
    """
    # NOTE(review): the prompt contains only descriptions, so the "id" in the
    # reply is whatever the model produces - verify it against ``ids``.
    result = []

    for item in items:
        prompt = f"""
        Aku memiliki data produk sebagai berikut:
        {descriptions}

        Dari daftar produk di atas, coba cocokan dengan barang berikut:
        "{item['nama']}"

        Jika ada kecocokan, kembalikan dalam format JSON:
        {{
            "id": id_produk,
            "nama": "nama_produk",
            "confidence": tingkat_kepercayaan (0-1)
        }}
        Jika tidak ada kecocokan, balas:
        {{
            "id": null,
            "nama": null,
            "confidence": 0
        }}
        """
        response = model_gemini.generate_content(prompt)
        text = response.text
        # Strip an opening ``` / ```json fence (any letter case) and a closing ```.
        cleaned = re.sub(
            r'^```(?:json)?|```$', '', text, flags=re.MULTILINE | re.IGNORECASE
        ).strip()
        try:
            match_data = json.loads(cleaned)
        except (json.JSONDecodeError, TypeError):
            match_data = None
        # The reply may be valid JSON without being an object (e.g. a list).
        if not isinstance(match_data, dict):
            match_data = {"id": None, "nama": None, "confidence": 0}

        # Accept ints, floats, numeric strings and single-element lists; the
        # previous float-only check turned a perfect integer score of 1 into 0.
        confidence = _normalize_confidence(match_data.get('confidence', 0))
        accepted = confidence >= threshold

        result.append({
            "id": match_data.get('id') if accepted else None,
            "name": match_data.get('nama') if accepted else None,
            "ocr_result": {
                "name": item['nama'],
                # The OCR prompt allows null for unclear values.
                "quantity": _to_float(item.get('qty')),
                "price": _to_float(item.get('harga_satuan')),
                "total": _to_float(item.get('subtotal')),
                "accuration": round(confidence, 4)
            }
        })
    return result


Menentukan tipe output


In [12]:
class OCRResult(BaseModel):
    """Values read from the receipt for a single line item."""

    # Item name as it appears in the OCR output.
    name: str | None
    # Quantity, unit price and line subtotal as read from the receipt.
    quantity: float | None
    price: float | None
    total: float | None
    # Confidence score reported for the product match.
    accuration: float | None

class ItemMatched(BaseModel):
    """A receipt item paired with the product it was matched to, if any."""

    # Identifier and name of the matched product; both None when no match
    # was accepted.
    id: int| None
    name:str | None
    # The raw values read from the receipt for this item.
    ocr_result: OCRResult

class Merchant(BaseModel):
    """Store details read from the receipt; each field may be None."""

    name: str | None
    address:str | None
    phone: str | None
    email: str | None

class FinalOutput(BaseModel):
    """Response body of the ``/struk`` endpoint."""

    invoice_number: str | None
    # Receipt date as a string (the OCR prompt requests DD/MM/YYYY).
    tanggal: str | None
    merchant: Merchant
    # One entry per line item found on the receipt.
    items: List[ItemMatched]
    # Receipt total as read by OCR.
    grand_total: float | None


Setup FastAPI

In [13]:
# Create the FastAPI application that the route decorators below attach to.
app = FastAPI()
# Load the product reference data once at import time so requests reuse it.
df, descriptions, ids, prices = load_data_pusat()

@app.post("/struk", response_model=FinalOutput)
async def struk(file: UploadFile = File(...)):
    """Run OCR on an uploaded receipt image and match its items to products.

    The upload is written to a temporary ``.jpg`` file, passed to
    ``OCR_Gemini`` and the extracted items are matched with ``match_items``.
    The temporary file is removed before the response is returned.

    Args:
        file: The uploaded receipt image.

    Returns:
        A dict shaped like ``FinalOutput`` on success, or a 500
        ``JSONResponse`` carrying the error message when any step fails.
    """
    # NOTE(review): OCR_Gemini and match_items are synchronous calls inside
    # an async handler - confirm that blocking the event loop is acceptable.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg")
    path = tmp.name

    try:
        with tmp:
            tmp.write(await file.read())

        OCRData = OCR_Gemini(path)
        # The OCR prompt allows null for unclear values, so tolerate a
        # missing or null item list and total.
        items = OCRData.get('daftar_barang') or []
        matched = match_items(items, descriptions, ids, prices)

        total = OCRData.get('total_belanja')
        result = {
            "invoice_number": OCRData.get('invoice_number'),
            "tanggal": OCRData.get('tanggal'),
            "merchant": {
                "name": OCRData.get('nama_toko'),
                "address": OCRData.get('alamat'),
                "phone": OCRData.get('phone'),
                "email": OCRData.get('email')
            },
            "items": matched,
            "grand_total": float(total) if total is not None else None
        }
        return result
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
    finally:
        # delete=False leaves the file on disk; clean it up on every path.
        try:
            os.remove(path)
        except OSError:
            # Best-effort cleanup: a leftover temp file must not fail the request.
            pass

@app.get("/")
def health_check():
    """Report that the service is up."""
    payload = dict(status="running")
    return payload


  df = pd.read_sql_query("SELECT * FROM data_pusat", conn)


In [14]:
# Presumably needed because the notebook kernel already runs an event loop
# and uvicorn starts its own - TODO confirm.
nest_asyncio.apply()

# Serve the app on localhost:8000; this call blocks until the server stops.
uvicorn.run(app, host="127.0.0.1", port=8000)

INFO:     Started server process [15756]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:52240 - "GET /docs HTTP/1.1" 200 OK
INFO:     127.0.0.1:52240 - "GET /openapi.json HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [15756]


In [None]:
from paddleocr import PaddleOCR
import re

# Initialize the PaddleOCR model
ocr = PaddleOCR(use_angle_cls=True, lang='en')

def ocr_paddle_text(image_path):
    """Extract text lines from an image with PaddleOCR.

    Args:
        image_path: Path of the image to read.

    Returns:
        The recognized text of each detected line, in the order PaddleOCR
        returns them; an empty list when nothing was detected.
    """
    result = ocr.ocr(image_path, cls=True)
    # The first page entry is None when no text is detected; iterating it
    # would raise TypeError.
    if not result or not result[0]:
        return []
    # Each entry is indexed as line[1][0] to pick the text only.
    return [line[1][0] for line in result[0]]

def extract_items_from_lines(lines):
    """Parse shopping-list rows out of OCR text lines.

    A row is recognized when the stripped line reads
    ``<name> <qty> <price> <total>``; lines that do not fit are skipped.

    Args:
        lines: Iterable of text lines.

    Returns:
        A list of dicts with the keys ``nama``, ``qty``, ``harga_satuan``
        and ``subtotal`` (the last three as floats), in input order.
    """
    row_re = re.compile(r'(?P<name>.+?)\s+(?P<qty>\d+)\s+(?P<price>\d+[.,]?\d*)\s+(?P<total>\d+[.,]?\d*)$')

    def _amount(text):
        # A comma is read as the decimal separator.
        # NOTE(review): a value such as "12.500" therefore parses as 12.5 -
        # confirm that thousands separators never occur on these receipts.
        return float(text.replace(',', '.'))

    parsed = []
    for raw in lines:
        found = row_re.match(raw.strip())
        if found is None:
            continue
        fields = found.groupdict()
        parsed.append({
            "nama": fields["name"].strip(),
            "qty": float(fields["qty"]),
            "harga_satuan": _amount(fields["price"]),
            "subtotal": _amount(fields["total"]),
        })
    return parsed

# === USAGE EXAMPLE ===
if __name__ == "__main__":
    # OCR a sample receipt image and parse its item rows.
    path_gambar = "sample/Receipt NIVEA/ini.jpg"
    lines = ocr_paddle_text(path_gambar)
    daftar_belanja = extract_items_from_lines(lines)

    # Print each parsed item dict on its own line.
    print("Hasil Deteksi Produk dari Struk:")
    for item in daftar_belanja:
        print(item)


  ocr = PaddleOCR(use_angle_cls=True, lang='en')
[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_doc_ori), the model files will be automatically downloaded and saved in C:\Users\naeko\.paddlex\official_models.[0m
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 123.91it/s]


AttributeError: 'paddle.fluid.libpaddle.AnalysisConfig' object has no attribute 'set_optimization_level'