In [None]:
# Switch the working directory: the relative paths used by later cells
# ("scanned/...", "olmocr_train.parquet", the save_to_disk targets) resolve against it.
# NOTE(review): the path is relative, so re-running this cell changes directory again.
cd ../../datasets/gold/

### With coordinates

In [None]:
import pandas as pd
import json  
from datasets import Dataset
from PIL import Image
from pathlib import Path  

def load_parquet_ocr_with_coords(parquet_path):
    """Build a chat-format OCR dataset whose user prompt keeps line coordinates.

    Every parquet row becomes one sample with three messages: a fixed system
    instruction, a user turn carrying the numbered OCR lines (each prefixed
    with its ``coords`` value, or ``?`` when absent) together with the page
    image, and an assistant turn carrying ``Train_Output``.

    Args:
        parquet_path: Parquet file providing the ``Newspaper``, ``Train_Input``
            and ``Train_Output`` columns. ``Train_Input`` may be a JSON string
            or an already-decoded mapping with a ``lines`` list.

    Returns:
        A ``datasets.Dataset`` built from the collected samples.

    Rows whose image cannot be opened, or whose ``Train_Input`` cannot be
    turned into prompt lines, are reported on stdout and skipped.
    """
    system_text = (
        "Below is the image of one page of a document, as well as some raw textual content that was "
        "previously extracted for it. Just return the plain text representation of this document as if "
        "you were reading it naturally. Do not hallucinate."
    )

    df = pd.read_parquet(parquet_path)
    samples = []

    for _, row in df.iterrows():
        # Reset per row so the error message below can never show the previous
        # row's path (or hit an unbound name when the very first row fails).
        image_path = None
        try:
            image_path = Path("scanned") / row["Newspaper"] / "image" / f"{row['Newspaper']}.jpg"
            # convert() hands back a separate in-memory image, so the file
            # handle opened here can be released as soon as the block exits.
            with Image.open(image_path) as source_image:
                image = source_image.convert("RGB")
        except Exception as e:
            print(f"Image error: {e} for {image_path}")
            continue

        try:
            train_input = row["Train_Input"]
            if isinstance(train_input, str):
                train_input = json.loads(train_input)  # Use JSON for valid input
            # Formatting happens inside the try so that a malformed payload
            # (non-mapping input, missing/None "lines") skips this row instead
            # of aborting the whole load.
            ocr_prompt_lines = [
                f"{i+1}. ({line.get('coords', '?')}): {line['text']}"
                for i, line in enumerate(train_input.get("lines", []))
                if "text" in line
            ]
        except Exception as e:
            print(f"Failed to parse OCR lines for {row.get('file_id', 'unknown')}: {e}")
            continue

        ocr_prompt = "OCR lines with coordinates:\n" + "\n".join(ocr_prompt_lines)

        samples.append({
            "messages": [
                {
                    "role": "system",
                    "content": [{"type": "text", "text": system_text}]
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": ocr_prompt},
                        {"type": "image", "image": image}
                    ]
                },
                {
                    "role": "assistant",
                    "content": [{"type": "text", "text": row["Train_Output"]}]
                }
            ]
        })

    return Dataset.from_list(samples)


In [None]:

# Build the with-coordinates dataset from the parquet file and write it to
# the "processed_dataset" directory via save_to_disk.
dataset = load_parquet_ocr_with_coords("olmocr_train.parquet")
dataset.save_to_disk("processed_dataset")

### Without coordinates

In [None]:
import pandas as pd
import json  # ← use this instead of ast
from datasets import Dataset, DatasetDict
from PIL import Image
from pathlib import Path  

def load_parquet_ocr(parquet_path):
    """Build a chat-format OCR dataset (no coordinates) split into train/test.

    Every parquet row becomes one sample with three messages: a fixed system
    instruction, a user turn carrying the numbered OCR lines followed by the
    page image (embedded as a PIL image), and an assistant turn carrying
    ``Train_Output``.

    Args:
        parquet_path: Parquet file providing the ``Newspaper``, ``Train_Input``
            and ``Train_Output`` columns. ``Train_Input`` may be a JSON string
            or an already-decoded mapping with a ``lines`` list.

    Returns:
        The result of ``train_test_split(test_size=0.1, seed=42)`` on the
        assembled dataset (``train`` and ``test`` splits).

    Rows whose image cannot be opened, or whose ``Train_Input`` cannot be
    turned into prompt lines, are reported on stdout and skipped.

    NOTE: a later cell of this notebook defines another ``load_parquet_ocr``
    that stores image paths instead of PIL images; whichever cell ran last wins.
    """
    system_text = (
        "Below is the image of one page of a document, as well as some raw textual content that was "
        "previously extracted for it. Just return the plain text representation of this document as if "
        "you were reading it naturally. Do not hallucinate."
    )

    df = pd.read_parquet(parquet_path)
    samples = []

    for _, row in df.iterrows():
        # Reset per row so the error message below can never show the previous
        # row's path (or hit an unbound name when the very first row fails).
        image_path = None
        try:
            image_path = Path("scanned") / row["Newspaper"] / "image" / f"{row['Newspaper']}.jpg"
            # convert() hands back a separate in-memory image, so the file
            # handle opened here can be released as soon as the block exits.
            with Image.open(image_path) as source_image:
                image = source_image.convert("RGB")
        except Exception as e:
            print(f"Image error: {e} for {image_path}")
            continue

        # Parse OCR lines using JSON, not ast
        try:
            train_input = row["Train_Input"]
            if isinstance(train_input, str):
                train_input = json.loads(train_input)  # Use JSON for valid input
            # Formatting happens inside the try so that a malformed payload
            # (non-mapping input, missing/None "lines") skips this row instead
            # of aborting the whole load.
            ocr_prompt_lines = [
                f"{i+1}. {line['text']}"
                for i, line in enumerate(train_input.get("lines", []))
                if "text" in line
            ]
        except Exception as e:
            print(f"Failed to parse OCR lines for {row.get('file_id', 'unknown')}: {e}")
            continue

        ocr_prompt = "\n".join(ocr_prompt_lines)

        samples.append({
            "messages": [
                {
                    "role": "system",
                    "content": [{"type": "text", "text": system_text}]
                },
                {
                    "role": "user",
                    "content": [
                        # The prompt text ends with a newline before the image item.
                        {"type": "text", "text": f"{ocr_prompt}\n"},
                        {"type": "image", "image": image}
                    ]
                },
                {
                    "role": "assistant",
                    "content": [{"type": "text", "text": row["Train_Output"]}]
                }
            ]
        })

    full_dataset = Dataset.from_list(samples)
    # Fixed seed keeps the 90/10 split reproducible across runs.
    return full_dataset.train_test_split(test_size=0.1, seed=42)


In [None]:
# Build the no-coordinates train/test dataset and write it to the
# "processed_dataset_nc" directory via save_to_disk.
dataset = load_parquet_ocr("olmocr_train.parquet")
dataset.save_to_disk("processed_dataset_nc")

In [None]:
import io
from PIL import Image

def extract_image(example):
    """Return the image attached to the first user image item, else None.

    Only messages with role ``"user"`` are inspected. The first content item
    of type ``"image"`` decides the outcome: a dict holding ``"bytes"`` is
    decoded into an RGB PIL image, a PIL image is returned unchanged, and any
    other payload produces a warning and ``None``. Any exception raised while
    walking the example is printed and also results in ``None``.
    """
    try:
        for msg in example["messages"]:
            if msg["role"] != "user":
                continue
            for part in msg["content"]:
                if part["type"] != "image":
                    continue

                payload = part["image"]
                if isinstance(payload, dict) and "bytes" in payload:
                    decoded = Image.open(io.BytesIO(payload["bytes"])).convert("RGB")
                    print(f"  Extracted image size: {decoded.size}")
                    return decoded
                if isinstance(payload, Image.Image):
                    print(f"  Found PIL Image, size: {payload.size}")
                    return payload

                # First image item has a payload we cannot handle: give up.
                print(f"Warning: Unsupported image format: {type(payload)}")
                return None
    except Exception as err:
        print(f"Error extracting image: {err}")
        return None

    # Walked every message without meeting an image item.
    return None

def has_image(example):
    """Tell whether ``extract_image`` finds an image in the example."""
    found = extract_image(example)
    return found is not None

def debug_text_content(example):
    """Print a preview of each user text item and flag image placeholders.

    For every text item in a user message, counts occurrences of ``<image>``,
    ``[IMAGE]`` and ``<img>``, reports the total when it is positive, and
    prints the first 200 characters of the text.
    """
    placeholders = ("<image>", "[IMAGE]", "<img>")

    for msg in example["messages"]:
        if msg["role"] != "user":
            continue
        for part in msg["content"]:
            if part["type"] != "text":
                continue

            body = part.get("text", "")
            # Count potential image tokens/references
            hits = sum(body.count(marker) for marker in placeholders)
            if hits > 0:
                print(f"  Text contains {hits} image references")
            print(f"  Text preview: {body[:200]}...")

def debug_dataset_detailed(dataset, num_examples=5):
    """Print a structural walkthrough of the leading ``train`` examples.

    For up to ``num_examples`` entries of ``dataset['train']`` this reports
    whether ``has_image`` finds an image, then lists every message with its
    role, the number of content items, and per item either the image payload
    kind or the text length and ``<image>`` token count.
    """
    print(f"Debugging first {num_examples} examples in detail:")

    train_split = dataset['train']
    shown = min(num_examples, len(train_split))

    for idx in range(shown):
        print(f"\n=== Example {idx} ===")
        record = train_split[idx]

        # Image presence first, then the message-by-message structure.
        print(f"Has image: {has_image(record)}")

        for msg_no, msg in enumerate(record["messages"]):
            print(f"Message {msg_no} - Role: {msg['role']}")
            parts = msg.get("content", [])
            print(f"  Content items: {len(parts)}")

            for part_no, part in enumerate(parts):
                kind = part.get("type", "unknown")
                print(f"    Item {part_no}: type={kind}")

                if kind == "image":
                    payload = part.get("image")
                    carries_bytes = isinstance(payload, dict) and "bytes" in payload
                    if carries_bytes:
                        print(f"      Image data: dict with bytes ({len(payload['bytes'])} bytes)")
                    else:
                        print(f"      Image data type: {type(payload)}")

                elif kind == "text":
                    body = part.get("text", "")
                    # More than one <image> placeholder in a single text is flagged.
                    token_count = body.count("<image>")
                    print(f"      Text length: {len(body)}, <image> tokens: {token_count}")
                    if token_count > 1:
                        print("      WARNING: Multiple <image> tokens found!")


# Run the detailed structural debug on the current `dataset`
# (whatever the most recently executed loader cell assigned).
debug_dataset_detailed(dataset)

# Keep the first two training examples around for manual inspection.
# NOTE(review): `examples` is not referenced again in the visible notebook.
examples = [dataset['train'][0], dataset['train'][1]]



In [None]:
from datasets import Dataset, DatasetDict, Features, Value, Image as HfImage

def load_parquet_ocr(parquet_path):
    """Build a path-based OCR dataset (no coordinates) split into train/test.

    Unlike the PIL-embedding loader of the same name defined in an earlier
    cell (which this definition replaces once the cell runs), each sample
    stores the image as a file path in an ``image`` column typed with the
    Hugging Face ``Image`` feature, and the chat ``messages`` as a JSON string.

    Args:
        parquet_path: Parquet file providing the ``Newspaper``, ``Train_Input``
            and ``Train_Output`` columns. ``Train_Input`` may be a JSON string
            or an already-decoded mapping with a ``lines`` list.

    Returns:
        The result of ``train_test_split(test_size=0.1, seed=42)`` on the
        assembled dataset (``train`` and ``test`` splits).

    Rows whose image file is missing, or whose ``Train_Input`` cannot be
    turned into prompt lines, are reported on stdout and skipped.
    """
    system_text = (
        "Below is the image of one page of a document, as well as some raw textual content that was "
        "previously extracted for it. Just return the plain text representation of this document as if "
        "you were reading it naturally. Do not hallucinate."
    )

    df = pd.read_parquet(parquet_path)
    samples = []

    for _, row in df.iterrows():
        # Reset per row so the error message below can never show the previous
        # row's path (or hit an unbound name when the very first row fails).
        image_path = None
        try:
            image_path = Path("scanned") / row["Newspaper"] / "image" / f"{row['Newspaper']}.jpg"
            if not image_path.exists():
                print(f"Missing image: {image_path}")
                continue
        except Exception as e:
            print(f"Image error: {e} for {image_path}")
            continue

        try:
            train_input = row["Train_Input"]
            if isinstance(train_input, str):
                train_input = json.loads(train_input)
            # Formatting happens inside the try so that a malformed payload
            # (non-mapping input, missing/None "lines") skips this row instead
            # of aborting the whole load.
            ocr_prompt_lines = [
                f"{i+1}. {line['text']}"
                for i, line in enumerate(train_input.get("lines", []))
                if "text" in line
            ]
        except Exception as e:
            print(f"Failed to parse OCR lines for {row.get('file_id', 'unknown')}: {e}")
            continue

        ocr_prompt = "\n".join(ocr_prompt_lines)

        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_text}]
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": ocr_prompt},
                    # The path is repeated inside the message so the serialized
                    # conversation still points at its image.
                    {"type": "image", "image": str(image_path)}
                ]
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": row["Train_Output"]}]
            }
        ]
        samples.append({
            "image": str(image_path),  # Store path instead of PIL.Image
            "messages": json.dumps(messages),  # Serialized to match Value("string") below
        })

    # Define custom features so images are properly loaded by HF datasets
    features = Features({
        "image": HfImage(),  # This tells HF to load images from file paths
        "messages": Value("string"),  # Store messages as serialized JSON string
    })

    dataset = Dataset.from_list(samples, features=features)
    # Fixed seed keeps the 90/10 split reproducible across runs.
    return dataset.train_test_split(test_size=0.1, seed=42)


In [None]:
# Build the path-based train/test dataset (messages stored as JSON strings)
# and write it to the "processed_dataset_v3" directory via save_to_disk.
dataset = load_parquet_ocr("olmocr_train.parquet")
dataset.save_to_disk("processed_dataset_v3")

In [None]:
import io
from PIL import Image
from pathlib import Path

def extract_image(example):
    """Return the image referenced by the first user image item, else None.

    Only messages with role ``"user"`` are inspected. The first content item
    of type ``"image"`` decides the outcome:

    * a string is treated as a file path and opened as RGB when it exists
      (a missing path is reported and yields ``None``);
    * a dict holding ``"bytes"`` is decoded into an RGB PIL image;
    * a PIL image is returned unchanged;
    * anything else produces a warning and ``None``.

    Any exception raised while walking the example is printed and also
    results in ``None``.
    """
    try:
        for msg in example["messages"]:
            if msg["role"] != "user":
                continue
            for part in msg["content"]:
                if part["type"] != "image":
                    continue

                payload = part["image"]

                if isinstance(payload, str):  # Path to image
                    location = Path(payload)
                    if not location.exists():
                        print(f"  Image path does not exist: {location}")
                        return None
                    loaded = Image.open(location).convert("RGB")
                    print(f"  Loaded image from path: {location}, size: {loaded.size}")
                    return loaded

                if isinstance(payload, dict) and "bytes" in payload:
                    decoded = Image.open(io.BytesIO(payload["bytes"])).convert("RGB")
                    print(f"  Extracted image from bytes, size: {decoded.size}")
                    return decoded

                if isinstance(payload, Image.Image):
                    print(f"  Found PIL Image, size: {payload.size}")
                    return payload

                # First image item has a payload we cannot handle: give up.
                print(f"Warning: Unsupported image format: {type(payload)}")
                return None
    except Exception as err:
        print(f"Error extracting image: {err}")
        return None

    # Walked every message without meeting an image item.
    return None

def has_image(example):
    """Tell whether ``extract_image`` finds an image in the example."""
    found = extract_image(example)
    return found is not None

def debug_text_content(example):
    """Print a preview of each user text item and flag image placeholders.

    For every text item in a user message, counts occurrences of ``<image>``,
    ``[IMAGE]`` and ``<img>``, reports the total when it is positive, and
    prints the first 200 characters of the text.
    """
    for msg in example["messages"]:
        if msg["role"] != "user":
            continue

        text_parts = [part for part in msg["content"] if part["type"] == "text"]
        for part in text_parts:
            body = part.get("text", "")
            hits = 0
            for marker in ("<image>", "[IMAGE]", "<img>"):
                hits += body.count(marker)
            if hits > 0:
                print(f"  Text contains {hits} image references")
            print(f"  Text preview: {body[:200]}...")

def debug_dataset_detailed(dataset, num_examples=5, split="train"):
    """Print a structural walkthrough of the leading examples of one split.

    Handles both storage layouts produced in this notebook: ``messages`` kept
    as a JSON string (decoded here) and ``messages`` kept as an already-parsed
    list (used as-is).

    Args:
        dataset: Mapping of split name to an indexable, sized collection of
            examples, each holding a ``messages`` entry.
        num_examples: Upper bound on how many examples are inspected.
        split: Name of the split to inspect; defaults to ``"train"``.

    For each inspected example this reports whether ``has_image`` finds an
    image, then lists every message with its role, the number of content
    items, and per item either the image payload or the text length and
    ``<image>`` token count. Examples whose messages cannot be decoded are
    reported and skipped.
    """
    print(f"Debugging first {num_examples} examples in detail:")

    records = dataset[split]
    for i in range(min(num_examples, len(records))):
        print(f"\n=== Example {i} ===")
        example = records[i]

        try:
            messages = example["messages"]
            if not isinstance(messages, (list, tuple)):
                # Serialized layout: decode the JSON string. Anything that is
                # neither a sequence nor valid JSON text fails here and is
                # reported below.
                messages = json.loads(messages)
        except Exception as e:
            print(f"  ERROR parsing messages: {e}")
            continue

        # Hand has_image a shallow copy carrying the parsed messages, leaving
        # the fetched example itself untouched.
        has_img = has_image({**example, "messages": messages})
        print(f"Has image: {has_img}")

        # Debug message structure
        for j, message in enumerate(messages):
            print(f"Message {j} - Role: {message['role']}")
            content = message.get("content", [])
            print(f"  Content items: {len(content)}")

            for k, item in enumerate(content):
                item_type = item.get("type", "unknown")
                print(f"    Item {k}: type={item_type}")

                if item_type == "image":
                    img_data = item.get("image")
                    print(f"      Image data: {img_data}")  # Should be path
                elif item_type == "text":
                    text = item.get("text", "")
                    image_tokens = text.count("<image>")
                    print(f"      Text length: {len(text)}, <image> tokens: {image_tokens}")
                    if image_tokens > 1:
                        print("      WARNING: Multiple <image> tokens found!")

# Example usage: run the structural debug on the current `dataset`
# (whatever the most recently executed loader cell assigned).
debug_dataset_detailed(dataset)

# If needed: test specific samples
# examples = [dataset['train'][0], dataset['train'][1]]


## Block dataset


In [None]:
from datasets import Dataset, DatasetDict, Features, Value, Image as HfImage
import pandas as pd
import json  
from PIL import Image
from pathlib import Path 

# NOTE(review): MAX_CHARS is not referenced anywhere in the visible notebook —
# presumably an intended character cap for prompts; confirm or remove.
MAX_CHARS = 6000

def _ocr_field_to_text(value):
    """Flatten one Train_Input/Train_Output cell into newline-joined line texts."""
    if isinstance(value, dict):
        parsed = value
    elif isinstance(value, str) and value.strip().startswith("{"):
        # Looks like a JSON object: decode it.
        parsed = json.loads(value)
    else:
        # Plain text: every non-blank line becomes one entry.
        parsed = {"lines": [{"text": line.strip()} for line in value.split("\n") if line.strip()]}
    return "\n".join(line["text"] for line in parsed.get("lines", []))


def load_block_parquet_ocr(parquet_path):
    """Build the block-level OCR dataset with train/validation/test splits.

    Only rows with a non-empty ``Block`` value are used. Each such row becomes
    one sample whose image is ``scanned/<Newspaper>/image/<Block>.jpg`` (stored
    as a path and typed with the Hugging Face ``Image`` feature) and whose chat
    ``messages`` (system instruction, user OCR text + image path, assistant
    target text) are stored as a JSON string.

    Args:
        parquet_path: Parquet file providing the ``Block``, ``Newspaper``,
            ``Train_Input`` and ``Train_Output`` columns. The two text columns
            may hold a dict with a ``lines`` list, a JSON object string, or
            plain newline-separated text.

    Returns:
        A ``DatasetDict`` with ``train``, ``validation`` and ``test`` splits:
        10% of the samples are held out as ``test``, then 10% of the remainder
        as ``validation`` (both with ``seed=42``).

    Rows whose image file is missing, or whose text columns cannot be parsed,
    are reported on stdout and skipped.
    """
    system_text = (
        "Below is the image of one page of a document, as well as some raw textual content that was "
        "previously extracted for it. Just return the plain text representation of this document as if "
        "you were reading it naturally. Do not hallucinate."
    )

    df = pd.read_parquet(parquet_path)
    df = df[df["Block"].notna() & (df["Block"].str.strip() != "")]

    samples = []

    for _, row in df.iterrows():
        # Reset per row so the error message below can never show the previous
        # row's path (or hit an unbound name when the very first row fails).
        image_path = None
        try:
            image_path = Path("scanned") / row["Newspaper"] / "image" / f"{row['Block']}.jpg"
            if not image_path.exists():
                print(f"Missing image: {image_path}")
                continue
        except Exception as e:
            print(f"Image error: {e} for {image_path}")
            continue

        try:
            ocr_prompt = _ocr_field_to_text(row["Train_Input"])
            ocr_prompt_assist = _ocr_field_to_text(row["Train_Output"])
        except Exception as e:
            print(f"Failed to parse OCR lines for {row.get('Block', 'unknown')}: {e}")
            continue

        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_text}]
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": ocr_prompt},
                    {"type": "image", "image": str(image_path)}
                ]
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": ocr_prompt_assist}]
            }
        ]
        samples.append({
            "image": str(image_path),
            "messages": json.dumps(messages),  # Serialized to match Value("string") below
        })

    # Define custom features so images are properly loaded by HF datasets
    features = Features({
        "image": HfImage(),  # This tells HF to load images from file paths
        "messages": Value("string"),  # Store messages as serialized JSON string
    })

    dataset = Dataset.from_list(samples, features=features)
    holdout_split = dataset.train_test_split(test_size=0.1, seed=42)

    # Secondary split: 10% of the training set → validation. Computed once and
    # reused for both halves instead of re-running the same seeded split twice.
    train_val_split = holdout_split["train"].train_test_split(test_size=0.1, seed=42)

    return DatasetDict({
        "train": train_val_split["train"],
        "validation": train_val_split["test"],
        "test": holdout_split["test"],
    })


In [None]:
# Build the block-level train/validation/test splits and write them to the
# "block_dataset" directory via save_to_disk.
dataset = load_block_parquet_ocr("olmocr_train.parquet")
dataset.save_to_disk("block_dataset")

In [None]:
# Number of examples in the validation split (shown as the cell output).
len(dataset['validation'])

In [None]:
from pprint import pprint

# Print the assistant (target) text of every validation example for a manual check.
for example in dataset["validation"]:
    messages = json.loads(example["messages"])  # stored as a serialized string
    assistant_msg = next((m for m in messages if m["role"] == "assistant"), None)
    print("---")
    if assistant_msg is None:
        # next() falls back to None when no assistant turn exists; report it
        # instead of failing on the subscript below.
        print("(no assistant message)")
        continue
    assistant_text = assistant_msg['content'][0]['text']
    pprint(assistant_text)

In [None]:
from pprint import pprint
# Inspect the first validation example's image and show the type of its
# `filename` attribute.
example = dataset["validation"][0]
# NOTE(review): assumes the decoded "image" value exposes `filename` — confirm
# against the object the Image feature returns.
trg = example['image'].filename

pprint(type(trg))