In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
!pip install datasets



In [3]:
import os
from tqdm import tqdm
from datasets import Dataset
from PIL import Image, ImageDraw
import xml.etree.ElementTree as ET
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator
import torch
import logging

In [4]:
# Set up logging
logging.basicConfig(level=logging.INFO)

# Set up paths
data_dir = "/content/drive/MyDrive/Colab Notebooks/mathwriting-2024-excerpt/"
train_dir = os.path.join(data_dir, "train/")
valid_dir = os.path.join(data_dir, "valid/")
symbols_dir = os.path.join(data_dir, "symbols/")

# Define the namespace
NAMESPACE = {'ink': 'http://www.w3.org/2003/InkML'}

In [5]:
def parse_inkml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    normalized_label_elem = root.find(".//ink:annotation[@type='normalizedLabel']", NAMESPACE)
    label_elem = root.find(".//ink:annotation[@type='label']", NAMESPACE)

    label = None
    if normalized_label_elem is not None:
        label = normalized_label_elem.text
    elif label_elem is not None:
        label = label_elem.text
    else:
        logging.warning(f"No label found in {file_path}")
        return None, None

    trace_groups = root.findall(".//ink:trace", NAMESPACE)
    strokes = [trace.text.strip().split(',') for trace in trace_groups]

    return label, strokes

In [8]:
def create_image_from_strokes(strokes, width=256, height=256):
    image = Image.new('RGB', (width, height), color='white')
    draw = ImageDraw.Draw(image)

    for stroke in strokes:
        points = [(float(x), float(y)) for x, y in [point.split()[:2] for point in stroke]]
        draw.line(points, fill='black', width=2)

    return image

In [9]:
def load_mathwriting_data(directory):
    images = []
    texts = []

    for filename in tqdm(os.listdir(directory), desc=f"Processing {directory}"):
        if filename.endswith('.inkml'):
            file_path = os.path.join(directory, filename)
            label, strokes = parse_inkml(file_path)
            if label is not None and strokes is not None:
                image = create_image_from_strokes(strokes)
                images.append(image)
                texts.append(label)

    return images, texts

In [10]:
# Load train and validation data
train_images, train_texts = load_mathwriting_data(train_dir)
valid_images, valid_texts = load_mathwriting_data(valid_dir)

Processing /content/drive/MyDrive/Colab Notebooks/mathwriting-2024-excerpt/train/: 100%|██████████| 100/100 [00:03<00:00, 26.60it/s]
Processing /content/drive/MyDrive/Colab Notebooks/mathwriting-2024-excerpt/valid/: 100%|██████████| 100/100 [00:03<00:00, 28.10it/s]


In [11]:
# Optionally, load symbols data
symbols_images, symbols_texts = load_mathwriting_data(symbols_dir)

Processing /content/drive/MyDrive/Colab Notebooks/mathwriting-2024-excerpt/symbols/: 100%|██████████| 100/100 [00:03<00:00, 30.74it/s]


In [12]:
# Combine train and symbols data
train_images += symbols_images
train_texts += symbols_texts

In [13]:
# Create datasets
train_dataset = Dataset.from_dict({"image": train_images, "text": train_texts})
valid_dataset = Dataset.from_dict({"image": valid_images, "text": valid_texts})

In [14]:
valid_dataset

Dataset({
    features: ['image', 'text'],
    num_rows: 100
})

In [15]:
# Load TrOCR processor and model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/4.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [16]:
# Set the decoder_start_token_id and other configuration parameters
model.config.decoder_start_token_id = processor.tokenizer.bos_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

In [17]:
# Move model to GPU if available
try:
    device = torch.device("cpu")
    model.to(device)
    logging.info(f"Using device: {device}")
except Exception as e:
    logging.error(f"Error during device setup: {e}")

In [18]:
# Preprocess the dataset
def preprocess_data(examples):
    images = [image.convert("RGB") for image in examples["image"]]
    texts = examples["text"]
    encoded_inputs = processor(images=images, text=texts, padding="max_length", truncation=True)
    return encoded_inputs

In [19]:
train_dataset = train_dataset.map(preprocess_data, batched=True, remove_columns=train_dataset.column_names)
valid_dataset = valid_dataset.map(preprocess_data, batched=True, remove_columns=valid_dataset.column_names)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  return self.preprocess(images, **kwargs)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [20]:
# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=True,
    output_dir="./trocr_mathwriting_output",
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=1000,
    learning_rate=4e-5,
    save_total_limit=2,
)



In [21]:
# Define compute metrics function
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    accuracy = sum([pred == label for pred, label in zip(pred_str, label_str)]) / len(pred_str)

    return {"accuracy": accuracy}



In [22]:
# Set up trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.feature_extractor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=default_data_collator,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [23]:
# Train the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss


TrainOutput(global_step=36, training_loss=2.5923040178087025, metrics={'train_runtime': 277.718, 'train_samples_per_second': 2.16, 'train_steps_per_second': 0.13, 'total_flos': 4.310122700136776e+17, 'train_loss': 2.5923040178087025, 'epoch': 2.88})

In [26]:
# Save the model and processor
model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/trocr_mathwriting_model")
processor.save_pretrained("/content/drive/MyDrive/Colab Notebooks/trocr_mathwriting_processor")

[]

In [None]:
# Function to recognize text from an image
def recognize_text_from_image(image_path, model, processor, device):
    try:
        image = Image.open(image_path).convert("RGB")
        pixel_values = processor(images=image, return_tensors="pt").pixel_values
        pixel_values = pixel_values.to(device)

        # Generate text
        generated_ids = model.generate(pixel_values)
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return generated_text
    except Exception as e:
        logging.error(f"Error recognizing text from image {image_path}: {e}")
        return None



In [None]:
# Test the model with a new image
try:
    test_image_path = "C:/Users/joana/OneDrive/Desktop/HSLU/3rd_semester/CV/c_vision_ocr/data/img_nine.png"
    recognized_text = recognize_text_from_image(test_image_path, model, processor, device)
    if recognized_text:
        logging.info(f"Recognized text: {recognized_text}")
except Exception as e:
    logging.error(f"Error during recognition: {e}")