In [None]:
from datasets import load_dataset
# Fetch the book-cover dataset from the Hugging Face Hub; later cells read
# the 'image' and 'text' fields of its 'train' split.
dataset = load_dataset(
    path="IRI2070/shenas-verified-bookroom-persian-book-covers-and-titles",
)

In [4]:
# System prompt for the cover-text extraction model.
# - The title key is "Main Title" everywhere (field list, example output), matching
#   the key enforced by the JSON schema and used in the user message.
# - The line-break hint is written as "\\n" so the model sees the literal two-character
#   sequence \n; an unescaped "\n" in this non-raw string would become a real newline
#   and split the instruction across two lines.
# NOTE(review): the prompt allows extra keys (e.g. "ISBN"), while the response schema
# sets additionalProperties to False — confirm which behavior is intended.
system_prompt = """
You are a **precise and meticulous Book Cover Text Extractor AI**, specialized in accurately transcribing visible text from book cover images. Your role is to act as an impartial optical text recognizer, focusing solely on faithful reproduction of on-cover text without any interpretation, inference, or external knowledge.

#### Core Task
When given an image of a book cover, extract **ONLY** the text that is visibly printed on the cover and structure it into a valid JSON object.

#### Key Fields (use exactly these keys; add more only if clearly distinct text categories appear on the cover):
- "Main Title": Main title text (largest/most prominent).
- "Subtitle": Any secondary title or tagline below the main title.
- "Extra title": Additional series, collection, or promotional text (e.g., "از سری مجموعه مدیر موفق").
- "Author(s)": Author names as an array of strings (split multiple authors logically by visible separators like "و" or commas).
- "Editor(s)": Editor names as an array or null.
- "Translator(s)": Translator names as an array or null.
- "Printing Order": Text like "چاپ اول" or "چاپ بیستم".
- "Edition": Edition information (e.g., "ویرایش دوم").
- "Publisher": Publisher name.
- "Publication Year": Year as string (e.g., "۱۳۹۲").
- Add other keys only if a distinct label exists (e.g., "ISBN", "Series"); use descriptive keys like "Back Cover Blurb" for longer texts.

#### Strict Rules (Must Follow Without Exception)
- **Do not add, infer, guess, or hallucinate** any text, fields, or values not explicitly visible.
- **Do not translate, summarize, paraphrase, correct, or normalize** text (e.g., keep original language, script, and formatting).
- **Do not replace** words with abbreviations, symbols, or shortcuts (e.g., keep "علیه‌السلام" exactly, never "(ع)").
- **Preserve exactly**: spelling, punctuation, capitalization, spacing, diacritics, and line breaks (use "\\n" for line breaks within a string value).
- If multiple instances of similar text appear (e.g., title on spine and front), prioritize the front cover's most prominent version.
- If a field has no visible text, set it to `null`.
- For lists (e.g., authors): Use arrays of exact strings; split based on visible separators.
- Handle multilingual covers by keeping text in its original script/language.

#### Best Practices
- Carefully examine the entire image: front, spine, back, and any flaps for all visible text.
- Distinguish between design elements (e.g., logos) and actual text.
- For blurry, stylized, or rotated text: Transcribe as accurately as readable; if unreadable, omit rather than guess.
- Prioritize clarity and fidelity over completeness—if uncertain, use `null`.
- Always output **only** valid, parseable JSON (no extra text, explanations, or markdown).
- Start directly with `{` and end with `}`.

#### Example Output
```json
{
  "Main Title": "هنر شفاف اندیشیدن",
  "Subtitle": "تصمیم‌های بهتر در زندگی روزمره",
  "Extra title": "از سری مجموعه مدیر موفق",
  "Author(s)": ["رولف دوبلی"],
  "Editor(s)": null,
  "Translator(s)": ["عادل فردوسی‌پور"],
  "Printing Order": "چاپ اول",
  "Edition": "ویرایش دوم",
  "Publisher": "نشر چشمه",
  "Publication Year": "۱۳۹۲"
}
```
"""

In [5]:
# JSON schema passed as `response_format` for the extraction request.
# Every key is required, so a field with no visible text on the cover must be
# expressed as JSON null (as the system prompt instructs). Each property therefore
# accepts "null" in addition to its value type; a plain "string"/"array" type would
# contradict the prompt and the null values the model actually returns.
# NOTE(review): "strict" is not set here, so schema adherence may be best-effort —
# confirm whether the endpoint supports strict structured outputs before enabling it.
book_info_schema = {
    "name": "extract_book_info",
    "schema": {
        "type": "object",
        "properties": {
            "Main Title": {"type": ["string", "null"]},
            "Subtitle": {"type": ["string", "null"]},
            "Extra title": {"type": ["string", "null"]},
            "Author(s)": {"type": ["array", "null"], "items": {"type": "string"}},
            "Editor(s)": {"type": ["array", "null"], "items": {"type": "string"}},
            "Translator(s)": {"type": ["array", "null"], "items": {"type": "string"}},
            "Printing Order": {"type": ["string", "null"]},
            "Edition": {"type": ["string", "null"]},
            "Publisher": {"type": ["string", "null"]},
            "Publication Year": {"type": ["string", "null"]},
        },
        "required": [
            "Main Title",
            "Subtitle",
            "Extra title",
            "Author(s)",
            "Editor(s)",
            "Translator(s)",
            "Printing Order",
            "Edition",
            "Publisher",
            "Publication Year"
        ],
        "additionalProperties": False,
    },
}

In [7]:
import base64
from openai import OpenAI
import json
from kaggle_secrets import UserSecretsClient
import io
import base64

# OpenAI SDK client pointed at the AvalAI endpoint; the API key is read from
# Kaggle secrets instead of being hard-coded in the notebook.
client = OpenAI(
    api_key=UserSecretsClient().get_secret('AVAL_AI_API_KEY'),
    base_url='https://api.avalai.ir/v1',
)

# One parsed JSON record per cover whose response could be decoded.
books_info = []

# Process the first 100 samples of the train split, one request per cover.
for index, sample in enumerate(dataset['train'].select(range(0, 100))):
    print(index)

    # Encode the cover as WEBP in memory, then base64 it for an inline data URL.
    buffered = io.BytesIO()
    sample['image'].save(buffered, format="WEBP")
    base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")

    # The dataset's title text is passed to the model as a hint for the main title.
    main_title = sample['text']
    print(main_title)

    response = client.chat.completions.create(
        model="gpt-5-mini",
        messages=[
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": system_prompt},
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "text",
                     "text": f"Extract visible text from the book cover image exactly as it appears—no additions, inferences, translations, or changes—and output only valid JSON with keys like \"Main Title\", \"Subtitle\", \"Author(s)\" (array), etc., using null for missing fields while preserving spelling, punctuation, spacing, scripts, and line breaks. The main title of the book should probably be something like this: \"{main_title}\""},
                    {
                        "type": "image_url",
                        "image_url": {
                            # The MIME type must match the bytes produced above (WEBP);
                            # declaring image/jpeg here mislabels the payload.
                            "url": f"data:image/webp;base64,{base64_image}",
                            "detail": "high"
                        },
                    },
                ],
            }
        ],
        response_format={"type": "json_schema", "json_schema": book_info_schema}
    )

    if response.choices:
        message_content = response.choices[0].message.content
        try:
            book_record = json.loads(message_content)
            books_info.append(book_record)
            print(book_record)
        except json.JSONDecodeError:
            # The model returned text that is not valid JSON; report it and move on.
            print("Error: Could not decode JSON response:", message_content)
        except Exception as e:
            # Covers e.g. a missing (None) message content, which json.loads rejects.
            print(f"Error processing response: {e}")
    else:
        print("No response generated.")

0
روزنه ای به چشمه های نور
{'Main Title': 'روزنه ای به چشمه های نور', 'Subtitle': 'مجموعه اشعار', 'Extra title': 'دفتر دوم', 'Author(s)': ['محمدرضا امینی علویچه'], 'Editor(s)': None, 'Translator(s)': None, 'Printing Order': None, 'Edition': None, 'Publisher': None, 'Publication Year': None}
1
ساواک و مرجعیت (1357 - 1335)
{'Main Title': 'ساواک و مرجعیت', 'Subtitle': '۱۳۳۵-۱۳۵۷', 'Extra title': None, 'Author(s)': ['دکتر سید محسن طباطبایی\u200cنیافر'], 'Editor(s)': None, 'Translator(s)': None, 'Printing Order': None, 'Edition': None, 'Publisher': 'مؤسسه مطالعات و پژوهشهای سیاسی', 'Publication Year': None}
2
پسر خدا
{'Main Title': 'پسر\nخدا', 'Subtitle': None, 'Extra title': 'رمان', 'Author(s)': ['محمدهادی\nعبدالوهاب'], 'Editor(s)': None, 'Translator(s)': None, 'Printing Order': None, 'Edition': None, 'Publisher': None, 'Publication Year': None}
3
نمک در چشمانم می ریزم
{'Main Title': 'نمک در\nچشمانم می ریزم', 'Subtitle': 'رمان نوجوان', 'Extra title': None, 'Author(s)': None, 'Editor(s)': N

In [8]:
import pandas as pd

# Tabulate the collected extraction records and write them to a UTF-8 CSV
# (without the index column); the file name reflects the train rows 0-100 slice.
df = pd.DataFrame(data=books_info)
df.to_csv(path_or_buf='books_info_train_0_100.csv', encoding='utf-8', index=False)