In [4]:
import json
import os
import pandas as pd
import numpy as np
import math
from tqdm import tqdm
from PIL import Image


In [5]:
def load_json_data(json_file):
    with open(json_file, 'r') as f:
        return json.load(f)

def extract_features(json_data, image_folder):
    features = []

    image_path = os.path.join(image_folder, json_data['image_name'])
    image = Image.open(image_path)
    image_width, image_height = image.size

    for item in json_data['ocr_data']:
        x, y, width, height = item['x'], item['y'], item['width'], item['height']
        text = item['text']

        # Relative length w.r.t image width and height
        relative_length_width = width / image_width
        relative_length_height = height / image_height

        # Slope calculation using coordinates
        slope = math.atan2(y + height, x + width)

        features.append({
            'text': text,
            'x': x,
            'y': y,
            'width': width,
            'height': height,
            'relative_length_width': relative_length_width,
            'relative_length_height': relative_length_height,
            'slope': slope
        })

    return features


In [7]:
def load_json_data(json_file):
    with open(json_file, 'r') as f:
        return json.load(f)

def extract_features(json_data, image_folder, json_file_name):
    features = []

    # Construct the image name from the JSON file name
    image_name = json_file_name.replace('.json', '') + '.png'  # Assuming images are in PNG format
    image_path = os.path.join(image_folder, image_name)

    try:
        image = Image.open(image_path)
    except FileNotFoundError:
        print(f"Image not found: {image_path}")
        return features

    image_width, image_height = image.size

    for item in json_data['ocr_data']:
        x, y, width, height = item['x'], item['y'], item['width'], item['height']
        text = item['text']

        # Relative length w.r.t image width and height
        relative_length_width = width / image_width
        relative_length_height = height / image_height

        # Slope calculation using coordinates
        slope = math.atan2(y + height, x + width)

        features.append({
            'text': text,
            'x': x,
            'y': y,
            'width': width,
            'height': height,
            'relative_length_width': relative_length_width,
            'relative_length_height': relative_length_height,
            'slope': slope
        })

    return features

def process_json_files(json_folder, image_folder):
    all_features = []

    for json_file in tqdm(os.listdir(json_folder)):
        if json_file.endswith('.json'):
            json_path = os.path.join(json_folder, json_file)
            json_data = load_json_data(json_path)

            # Extract features
            features = extract_features(json_data, image_folder, json_file)
            for feature in features:
                feature['document_name'] = json_file
                all_features.append(feature)

    return pd.DataFrame(all_features)

json_folder = '.'  # Base folder containing JSON files
image_folder = './images'  # Folder containing images
data = process_json_files(json_folder, image_folder)


100%|██████████████████████████████████████████| 6/6 [00:00<00:00, 1556.04it/s]

Image not found: ./images\filtered-data.png





In [16]:
import json
import os
import pandas as pd
import math
from tqdm import tqdm
from PIL import Image

def load_json_data(json_file):
    with open(json_file, 'r') as f:
        return json.load(f)

def extract_features(json_data, image_folder):
    features = []

    # Iterate through each item in the JSON data
    for key, data in json_data.items():
        # Ensure the 'path' key exists in the JSON data
        if 'path' not in data:
            print(f"'path' key not found in JSON entry: {key}")
            continue

        # Extract the image path from the JSON data
        image_name = data["path"] + '.jpeg'  # Assuming images are in PNG format
        image_path = os.path.join(image_folder, image_name)

        try:
            image = Image.open(image_path)
        except FileNotFoundError:
            print(f"Image not found: {image_path}")
            continue

        image_width, image_height = image.size

        # Process each word in the OCR data
        for word, coordinates in data.get('ocr', {}).items():
            # Each word has a list of coordinate points defining its bounding box
            if len(coordinates) != 4:
                print(f"Unexpected number of coordinates for word: {word}")
                continue

            # Extract the bounding box coordinates
            x_coords = [coord['x'] for coord in coordinates]
            y_coords = [coord['y'] for coord in coordinates]

            # Calculate the bounding box dimensions
            min_x, max_x = min(x_coords), max(x_coords)
            min_y, max_y = min(y_coords), max(y_coords)
            width = max_x - min_x
            height = max_y - min_y

            # Relative length w.r.t image width and height
            relative_length_width = width
            relative_length_height = height

            # Slope calculation using top-left and bottom-right coordinates
            slope = math.atan2(max_y - min_y, max_x - min_x)

            features.append({
                'word': word,
                'x': min_x,
                'y': min_y,
                'width': width,
                'height': height,
                'relative_length_width': relative_length_width,
                'relative_length_height': relative_length_height,
                'slope': slope
            })

    return features

def process_json_files(json_folder, image_folder):
    all_features = []

    for json_file in tqdm(os.listdir(json_folder)):
        if json_file.endswith('.json'):
            json_path = os.path.join(json_folder, json_file)
            json_data = load_json_data(json_path)

            # Extract features
            features = extract_features(json_data, image_folder)
            for feature in features:
                feature['document_name'] = json_file
                all_features.append(feature)

    return pd.DataFrame(all_features)

# Define the paths to the JSON and images folders
json_folder = '.'  # Base folder containing JSON files
image_folder = 'images/'  # Folder containing images

# Process the JSON files and extract features
data = process_json_files(json_folder, image_folder)

# Save the extracted features to a CSV file
output_csv = 'output_features.csv'
data.to_csv(output_csv, index=False)
print(f"CSV file saved at {output_csv}")


100%|███████████████████████████████████████████| 7/7 [00:00<00:00, 132.13it/s]

CSV file saved at output_features.csv



