In [7]:
import pandas as pd
import numpy as np
import ast  # For safely converting string to list
import csv
import matplotlib.pyplot as plt
from scipy.interpolate import splprep, splev
import uuid
import os
import json

In [None]:
# Parse the raw MOLHW text file by hand: every line holds five metadata fields followed by a
# Python-literal list of coordinates. That list contains commas itself, so csv.reader splits
# it into many cells which have to be glued back together before evaluation.
data = []
# newline="" is what the csv module documentation asks for when a file is handed to csv.reader.
with open("../dataset/MOLHW_preprocess_unicode/big.txt", "r", encoding="utf-8", newline="") as file:
    reader = csv.reader(file)

    for row in reader:
        if not row:
            continue  # Skip empty lines

        first_5_columns = row[:5]  # label, author, width, height, density
        last_column = ",".join(row[5:])  # Rebuild the coordinate literal that csv split apart

        try:
            # literal_eval only accepts Python literals, so no code from the file is executed.
            coordinates = ast.literal_eval(last_column)
        except Exception as e:
            print(f"Error parsing coordinates: {last_column} - {e}")
            # Keep the row so row positions still line up with the file; consumers must
            # handle a missing (None) coordinate list.
            coordinates = None

        data.append(first_5_columns + [coordinates])

# The metadata columns stay as the strings read from the file (no dtype conversion here).
df = pd.DataFrame(data, columns=["label", "author",  "width",  "height", "density", "coordinates"])


In [None]:
# # Define file paths
# input_file = '../../dataset_preprocessing/MOLHW_preprocess_unicode/big.txt'  # Your large input file
# output_file = '../../dataset_preprocessing/MOLHW_preprocess_unicode/small.txt'  # File to save the first 1000 rows

# # Open the input file in read mode and output file in write mode
# with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
#     # Read the first 1000 lines
#     for i, line in enumerate(infile):
#         if i < 1000:
#             outfile.write(line)  # Write the line to the output file
#         else:
#             break  # Stop after 1000 lines

In [None]:
def plot_points(x):
    """
    Display the sample stored at row position ``x`` of the global ``df`` as a line drawing.

    The row's ``coordinates`` (without their first entry) are multiplied by the row's
    ``width`` and ``height``. Consecutive points are joined by straight lines, while a
    (-1, -1) entry closes the current stroke so that no line is drawn across it.
    Strokes with a single point are not drawn. The row's ``label`` becomes the title.
    """
    row = df.iloc[x]
    raw_points = row['coordinates'][1:]
    img_width = int(row['width'])
    img_height = int(row['height'])

    # Figure size follows the image size (150 units per inch).
    plt.figure(figsize=(img_width / 150, img_height / 150))

    def _to_image_space(px, py):
        # Separators pass through untouched; real points are scaled to image dimensions.
        if (px, py) == (-1, -1):
            return (-1, -1)
        return (px * img_width, py * img_height)

    # Build the strokes: a separator opens a fresh stroke whenever the current one has points.
    strokes = [[]]
    for point in (_to_image_space(px, py) for px, py in raw_points):
        if point != (-1, -1):
            strokes[-1].append(point)
        elif strokes[-1]:
            strokes.append([])

    for stroke in strokes:
        if len(stroke) < 2:
            continue  # nothing to connect
        x_vals, y_vals = zip(*stroke)
        plt.plot(x_vals, y_vals, color='black', linewidth=2, linestyle='-')

    # Image coordinates grow downwards, so the Y-axis is flipped after fixing the limits.
    ax = plt.gca()
    ax.set_xlim(0, img_width)
    ax.set_ylim(0, img_height)
    ax.invert_yaxis()

    # Light dashed grid plus the sample's label as title.
    ax.grid(True, linestyle="--", alpha=0.5)
    ax.set_title(row['label'], fontsize=12, fontweight='bold')

    plt.show()

In [15]:
def smooth_segment(segment, smoothing_factor=0.5):
    """
    Smooth one stroke with a parametric B-spline and resample it more densely.

    Parameters
    ----------
    segment : sequence of (x, y) points
        The stroke to smooth.
    smoothing_factor : float, default 0.5
        Forwarded to ``scipy.interpolate.splprep`` as its ``s`` argument.

    Returns
    -------
    list
        ``segment`` itself when it has fewer than three points, or fewer than three
        points once runs of identical consecutive points are collapsed. Otherwise a
        list of ``len(segment) * 5`` ``(x, y)`` tuples sampled along the fitted spline.
    """
    if len(segment) < 3:
        return segment  # Too short to be worth smoothing

    # splprep can fail with "Invalid inputs" when two consecutive points are identical,
    # so collapse such runs before fitting.
    distinct = [segment[0]]
    for point in segment[1:]:
        if tuple(point) != tuple(distinct[-1]):
            distinct.append(point)
    if len(distinct) < 3:
        return segment

    x_vals, y_vals = zip(*distinct)

    # splprep needs more points than the spline degree; its default cubic (k=3) rejects
    # three-point strokes, so the degree is lowered for them.
    degree = min(3, len(distinct) - 1)
    tck, _ = splprep([x_vals, y_vals], s=smoothing_factor, k=degree)

    # Evaluate the spline on a denser parameter grid for a smoother-looking curve.
    u_fine = np.linspace(0, 1, len(segment) * 5)
    smoothed = splev(u_fine, tck)

    return list(zip(smoothed[0], smoothed[1]))
def plot_coordinates(coords, img_width, img_height, image_path, thickness=2, sm_fac=0.2, padding=20):
    """
    Render a handwriting sample as smoothed black strokes and save it to ``image_path``.

    Parameters
    ----------
    coords : iterable of (x, y) pairs
        Points that are multiplied by ``img_width`` / ``img_height``; a (-1, -1) pair
        ends the current stroke so no line is drawn across it.
    img_width, img_height : int
        Target image dimensions; they also determine the figure size (150 units per inch).
    image_path : str, path-like or file-like
        Destination handed to ``savefig``. For filesystem paths the parent directory is
        created when missing.
    thickness : float, default 2
        Line width of the strokes.
    sm_fac : float, default 0.2
        Smoothing factor forwarded to ``smooth_segment``.
    padding : float, default 20
        Margin added around the image on every side, in the units of the scaled coordinates.

    Strokes consisting of a single point are not drawn.
    """
    # Convert coordinates to image dimensions, keeping (-1, -1) separators as they are.
    scaled_coords = [(x * img_width, y * img_height) if (x, y) != (-1, -1) else (-1, -1) for x, y in coords]

    # Split the point stream into strokes at every separator.
    segments = []
    current_segment = []
    for point in scaled_coords:
        if point == (-1, -1):
            if current_segment:
                segments.append(current_segment)
                current_segment = []
        else:
            current_segment.append(point)
    if current_segment:
        segments.append(current_segment)

    # savefig does not create missing directories; only do this for real paths because
    # savefig also accepts file-like objects.
    if isinstance(image_path, (str, os.PathLike)):
        target_dir = os.path.dirname(image_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    fig = plt.figure(figsize=(img_width / 150, img_height / 150))
    try:
        ax = fig.add_subplot(111)

        for segment in segments:
            if len(segment) > 1:
                smoothed_segment = smooth_segment(segment, smoothing_factor=sm_fac)
                x_vals, y_vals = zip(*smoothed_segment)
                ax.plot(x_vals, y_vals, color='black', linewidth=thickness, linestyle='-')

        # Upper Y limit first flips the axis so it matches image coordinates.
        ax.set_xlim(-padding, img_width + padding)
        ax.set_ylim(img_height + padding, -padding)

        # Remove grid, title, and axes
        ax.axis('off')

        fig.savefig(image_path, dpi=150, bbox_inches="tight", pad_inches=0)
    finally:
        # Always release the figure, even when smoothing or saving fails; otherwise
        # figures pile up in memory when this runs in a loop.
        plt.close(fig)

In [16]:
# Render a single row of df to disk as a quick visual check of plot_coordinates.
x = 3
sample_row = df.iloc[x]
# Drop the first coordinate entry (a [-1, -1] separator in the rows displayed in this notebook).
coords = sample_row['coordinates'][1:]
img_width = int(sample_row['width'])
img_height = int(sample_row['height'])
plot_coordinates(
    coords,
    img_width,
    img_height,
    "test_images/output1_image.png",
    thickness=5,
    sm_fac=0.2,
)

In [20]:
## generating database
# Reads rows from the raw text file, renders each one to a PNG with a random name and
# records the image path together with the row's label in data.json.

data_path = "../../MOLHW/MOLHW_preprocess_unicode/MOLHW_preprocess_unicode.txt"

folder_path = "../../dataset/MOLHW_unicode"

# Saving an image or the JSON file into a missing directory fails, so create it up front.
os.makedirs(folder_path + "/images", exist_ok=True)

data = []
s = 0  # number of non-empty rows read so far
# newline="" is what the csv module documentation asks for when a file is handed to csv.reader.
with open(data_path, "r", encoding="utf-8", newline="") as file:
    reader = csv.reader(file)

    for row in reader:
        if not row:
            continue  # Skip empty lines

        s += 1
        if s > 10:
            break  # Only the first 10 non-empty rows are processed
        first_5_columns = row[:5]  # label, author, width, height, density

        last_column = ",".join(row[5:])  # Rebuild the coordinate literal that csv split apart

        try:
            coordinates = ast.literal_eval(last_column)  # Convert string to list safely
        except Exception as e:
            print(f"Error parsing coordinates: {last_column} - {e}")
            coordinates = None
            # plot_coordinates iterates over the coordinates, so passing None would raise
            # TypeError; skip the row instead of aborting the whole run.
            continue
        image_name = f"{uuid.uuid4().hex}.png"
        image_path = folder_path + "/images/" + image_name
        plot_coordinates(coordinates, int(first_5_columns[2]), int(first_5_columns[3]), image_path, sm_fac=0.2, thickness=5, padding=100)

        data.append({
                        "image_path": image_path,
                        "label": first_5_columns[0],
                    })

json_path = os.path.join(folder_path, "data.json")
with open(json_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII labels readable in the JSON file.
    json.dump(data, f, ensure_ascii=False, indent=4)
# df = pd.DataFrame(data, columns=["label", "author",  "width",  "height", "density", "coordinates"])
# Show the row counter (one past the limit when the loop stopped at the break).
s

11

In [18]:
# Inspect the metadata fields (row[:5]) of whichever row a loading/generation loop assigned last.
first_5_columns

['ab', '1df87f3d797035f6f919ef8faeaa7b1a', '1080', '2265', '3.0']

In [4]:
# Display the samples DataFrame.
# NOTE(review): the recorded output has an "Unnamed: 0" column that the column list used when
# building df in the loading cell does not define, so it may stem from a different load —
# confirm by restarting the kernel and re-running.
df

Unnamed: 0,label,author,width,height,density,coordinates
0,ab,1df87f3d797035f6f919ef8faeaa7b1a,1080,2265,3.0,"[[-1, -1], [0.7425679968102712, 0.0], [0.75426..."
1,ab,2284c343bb9739ad6c124c1022e38ad3,1080,2160,3.0,"[[-1, -1], [0.33827970840834104, 0.0], [0.3577..."
2,ab,2284c343bb9739ad6c124c1022e38ad3,1080,2160,3.0,"[[-1, -1], [0.4377576951907115, 0.0], [0.45196..."
3,ab,a0fc0439c4b0bee243cc9c8cb934fa8b,720,1440,2.0,"[[-1, -1], [0.6360624108658747, 0.0], [0.65164..."
4,aba,37daadd776b8a6eff7f50548c44e7b56,1080,2163,2.55,"[[-1, -1], [0.8663801691206826, 0.0], [0.88342..."
5,aba,49fd1eaf7f0378b322dbeacbd61052db,1080,2120,3.0,"[[-1, -1], [0.6946794996892124, 0.0], [0.71024..."
6,aba,992cdbb96b5513444a83ab1fa15ccebd,720,1560,2.0,"[[-1, -1], [0.8071654420462637, 0.0], [0.77051..."
7,aba,c2fd1ee555fe2b1ca4918d1ee02765be,1080,2208,3.0,"[[-1, -1], [0.6437175016892851, 0.0], [0.67801..."
8,ab,ac778c7a0fe8764276d921ceffdfb9f8,1080,2163,2.55,"[[-1, -1], [0.784445357848207, 0.0], [0.742701..."
9,abaci,40a7c6be4666aa5f214b46175b2dfe35,1080,2265,3.0,"[[-1, -1], [0.6853081253618039, 0.0], [0.71453..."


In [44]:
# Number of entries currently held in `data`.
# NOTE(review): `data` is assigned both by the loading cell (parsed rows) and by the generation
# cell (image/label records); which one is counted here depends on execution order — confirm.
len(data)

3