# DiffEq detection

In [147]:
# tools
import os
from tqdm.auto import tqdm

# essentials
import numpy as np
from matplotlib import pyplot as plt
import cv2 as cv
import torch
from torchvision.transforms import v2
import torchvision as torchv
from torchvision.transforms import v2

# Plan

1) Find images

2) Label images using [labelme](https://github.com/wkentaro/labelme?tab=readme-ov-file) tool

3) Create train/test split

4) Train models

5) Evaluate models

6) Choose model

### 1. Find images

We are going to use images from:

- PDF files containing math expressions
- our handwritten calculus notes

The images are going to be preprocessed in some way

The ImagePreparator class has been created for repeatability of preprocessing

In [148]:
import pymupdf
from PIL import Image
import pyheif
import os

class RawImagesPreparator:
    def __init__(self, pdf_data_dir_path=None, pdf_to_image_save_path=None, images_path=None, verbose=False):
        self.pdf_data_dir_path = pdf_data_dir_path
        self.pdf_to_image_save_path = pdf_to_image_save_path
        self.images_path = images_path
        self.verbose = verbose
        self.heic_converted = False

    def pdf_pages_to_images(self, logging=False):
        if not all(
            (
                os.path.isdir(self.pdf_to_image_save_path),
                os.path.isdir(self.pdf_data_dir_path),
            )
        ):
            return None
        if self.verbose:
            print("pdf_pages_to_images: the provided paths is ok...")

        pdf_files_dir = os.listdir(self.pdf_data_dir_path)
        for file in pdf_files_dir:
            images_dir = self.pdf_to_image_save_path + f"/{file.split('.')[0]}"
            # if not os.path.isdir(images_dir):
            #     os.mkdir(images_dir)

            doc_path = self.pdf_data_dir_path + f"/{file}"
            document = pymupdf.open(doc_path)
            for page in tqdm(document):
                picture = page.get_pixmap()
                picture.save("{}/page-{}.png".format(images_dir, page.number + 1))
            if self.verbose:
                print("pdf_pages_to_images: the job for {} is done".format(file))

        return True

    def heic_to_png(self):
        if not os.path.isdir(self.images_path):
            if self.verbose:
                print("heic_to_png: Unlabeled images data directory does not exist.")
            return None

        heic_files = [f for f in os.listdir(self.images_path) if f.lower().endswith(".heic")]
        if not heic_files:
            if self.verbose:
                print("heic_to_png: No HEIC files found.")
            self.heic_converted = True
            return None

        for heic_file in heic_files:
            heic_path = os.path.join(self.images_path, heic_file)
            jpg_path = os.path.join(self.images_path, f"{os.path.splitext(heic_file)[0]}.jpg")

            # Convert HEIC to JPG
            try:
                heif_file = pyheif.read(heic_path)
                image = Image.frombytes(
                    heif_file.mode,
                    heif_file.size,
                    heif_file.data,
                    "raw",
                    heif_file.mode,
                    heif_file.stride,
                )
                image.save(jpg_path, "JPEG")
                if self.verbose:
                    print(f"heic_to_png: Converted {heic_file} to {jpg_path} successfully.")
            except Exception as e:
                if self.verbose:
                    print(f"heic_to_png: Failed to convert {heic_file}. Error: {str(e)}")

        self.heic_converted = True
        return True

    def heic_delete(self, custom_path=None, force_delete=False):
        if not (self.heic_converted or force_delete):
            if self.verbose:
                print("heic_delete: no heic_files are deleted.")
            return None

        if custom_path == None:
            custom_path = self.images_path

        if not os.path.isdir(custom_path):
            if self.verbose:
                print("heic_to_png: Unlabeled images data directory does not exist.")
            return None

        heic_files = [f for f in os.listdir(custom_path) if f.lower().endswith(".heic")]
        if not heic_files:
            if self.verbose:
                print("heic_to_png: No HEIC files found.")
            return None

        for heic_file in heic_files:
            heic_path = os.path.join(self.images_path, heic_file)

            # Delete Heic
            try:
                os.remove(heic_path)
                if self.verbose:
                    print(f"heic_to_png: {heic_path} is deleted successfully.")
            except Exception as e:
                if self.verbose:
                    print(f"heic_to_png: Failed to convert {heic_path}. Error: {str(e)}")

        return True

In [149]:
# im_prep = RawImagesPreparator(images_path="../data/images", verbose=True)
# im_prep.heic_to_png()
# im_prep.heic_delete()

heic_to_png: No HEIC files found.
heic_to_png: No HEIC files found.


## 2. Labeling is done using labelme tool

In [150]:
# get dataframe from images
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split

class DataFrameFromImages:

    def __init__(self, dir_path="../data", images_dir_name="images/images_all", labels_dir_name="labels/labels_all"):
        self.dir_path = dir_path
        self.images_dir_name = images_dir_name
        self.labels_dir_name = labels_dir_name
        self.df = None

    def create_dataframe(self):
        # labels and images have to be in one directory
        path = os.path.join(self.dir_path)
        images_path = os.path.join(path, self.images_dir_name)
        labels_path = os.path.join(path, self.labels_dir_name)

        if not all((os.path.isdir(self.dir_path), os.path.isdir(images_path), os.path.isdir(labels_path))):
            print(
                "create_dataframe: one of provided paths:\n{}\n{}\n{}\n - is not valid.".format(
                    self.dir_path, images_path, labels_path
                )
            )
            return None

        images_files = [f for f in os.listdir(images_path) if not os.path.isdir(os.path.join(images_path, f))]
        labels_files = [f[:-5] for f in os.listdir(labels_path) if not os.path.isdir(os.path.join(images_path, f))]
        has_label = [1 if f.rsplit(".", 1)[0] in set(labels_files) else 0 for f in os.listdir(images_path) if not os.path.isdir(os.path.join(images_path, f))]

        labels_files = [f for f in os.listdir(labels_path)]

        try:
            self.df = pd.DataFrame({"img_name": images_files, "label": has_label})
            return self.df
        except:
            return None

    def train_test_folder(self, stratify=None, train_size=0.85, verbose=False):
        if self.df is None:
            print("train_test_folder: \nFirstly, create a dataframe.")
            return None

        if stratify is None:
            stratify = self.df["label"]

        train_images, val_images = train_test_split(self.df, train_size=train_size, stratify=stratify, shuffle=True)

        images_path = os.path.join(self.dir_path, self.images_dir_name)
        labels_path = os.path.join(self.dir_path, self.labels_dir_name)

        def make_copies(images_df, postfix, verbose):
            for ind in range(len(images_df)):
                train_image = images_df.iloc[ind].to_numpy()
                has_label = train_image[1]

                if verbose:
                    print("img:", os.path.join(images_path, train_image[0]), os.path.join(images_path, postfix, train_image[0]))
                try:
                    shutil.copy(os.path.join(images_path, train_image[0]), os.path.join(images_path, postfix, train_image[0]))
                except Exception as e:
                    if not verbose:
                        print("img:", os.path.join(images_path, train_image[0]), os.path.join(images_path, postfix, train_image[0]))
                    print("\nException: {}\n".format(e))
                
                if has_label:

                    if verbose:
                        print("label:", os.path.join(os.path.join(labels_path, train_image[0].rsplit(".", 1)[0] + ".json")), os.path.join(labels_path, postfix, train_image[0].rsplit(".", 1)[0] + ".json"))
                    try:
                        shutil.copy(os.path.join(os.path.join(labels_path, train_image[0].rsplit(".", 1)[0] + ".json")), os.path.join(labels_path, postfix, train_image[0].rsplit(".", 1)[0] + ".json"))
                    except Exception as e:
                        if not verbose:
                            print("label:", os.path.join(os.path.join(labels_path, train_image[0].rsplit(".", 1)[0] + ".json")), os.path.join(labels_path, postfix, train_image[0].rsplit(".", 1)[0] + ".json"))
                        print("\nException: {}\n".format(e))
        try:
            make_copies(train_images, "train", verbose)
            make_copies(val_images, "val", verbose)
        except Exception as e:
            print("Exception: {}".format(e))


In [165]:
df_cr = DataFrameFromImages()
# df = df_cr.create_dataframe()
# df_cr.train_test_folder(verbose=True)
df.head()

Unnamed: 0,img_name,label
0,handwritten-IMG_3700.jpg,0
1,bg_2.jpeg,0
2,efimov-page-105.png,0
3,filippov-page-6.png,0
4,filippov-page-34.png,1


In [167]:
print("Q:How many photos with diffeq? A:{}".format(df['label'].sum()))
print(len(df) - df['label'].sum())

Q:How many photos with diffeq? A:97
325


$\frac{1}{4}$ of dataset has differential equations in it.

I suppose, it's better to leave the equation undetected, than detect something that isn't a differential equation...

## 3. Train models

In [168]:
print(len(os.listdir("../data/images/train")))
print(len(os.listdir("../data/images/val")))

358
64


In [155]:
from ultralytics import YOLO

# model = YOLO(model='yolo')