In [None]:
# Mount Google Drive (Colab only): the corpus, fonts and generated outputs used
# below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import re

In [None]:
# pip install Pillow

In [None]:
def text_to_dataframe(file_path):
    """
    Read a UTF-8 text file and return its Tamil-only content, one row per line.

    For every line, each character that is neither whitespace nor inside the
    Unicode Tamil block (U+0B80-U+0BFF) is replaced by a space; runs of
    whitespace are then collapsed to a single space and the ends are trimmed.
    Lines without any Tamil character are kept as empty strings, so the number
    of rows always equals the number of lines in the file.

    Args:
        file_path (str): The path to the text file.

    Returns:
        pandas.DataFrame: A DataFrame with a single column named 'text'
                          holding the cleaned lines in file order.
                          Returns None (after printing the error) if the file
                          cannot be opened, decoded or processed.
    """
    # Compiled once instead of re-resolving the pattern for every line.
    non_tamil = re.compile(r'[^\u0B80-\u0BFF\s]')
    try:
        # Explicit encoding: without it open() uses the platform locale, which
        # can fail on or silently corrupt Tamil text outside UTF-8 locales.
        with open(file_path, 'r', encoding='utf-8') as file:
            # split() + join() both collapses inner whitespace and trims the
            # ends, so no separate strip() is needed.
            tamil_lines = [
                ' '.join(non_tamil.sub(' ', line).split())
                for line in file
            ]
        return pd.DataFrame({'text': tamil_lines})
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print(f"An error occurred: {e}")
        return None

In [None]:
# Raw text corpus on the mounted Drive; it is read line by line by text_to_dataframe.
file_path = '/content/drive/MyDrive/ta.txt'

In [None]:
# One row per line of the corpus. NOTE: text_to_dataframe returns None (after
# printing the error) when the file cannot be read, so later cells fail on None.
df = text_to_dataframe(file_path)

In [None]:
df.head()

Unnamed: 0,text
0,ரய
1,ச சப
2,படல்தலப்ப
3,ரயல் ரய
4,அனத்தபறவகள்இறக


In [None]:
df.head(20)

Unnamed: 0,text
0,ரய
1,ச சப
2,படல்தலப்ப
3,ரயல் ரய
4,அனத்தபறவகள்இறக
5,என்ன சய்ய அவர்கள்மகவம்நசக்க றன்அனத்த
6,நம்ச றந்தரதம்மற்ற ம்சரப்ப
7,அதனல்த ன்நம்கதல் தரவழவ ற்க
8,அனத்தமகவம்தளவகந ம்படமடய ம்சய்ய
9,சலச கசங்களபறக்க


In [None]:
len(df)

2437960

In [None]:
# Keep only rows whose cleaned text is longer than 40 code points
# (str.len() counts code points, so Tamil vowel signs count individually).
df_filtered = df.loc[df['text'].str.len().gt(40)]

In [None]:
len(df_filtered)

222658

In [None]:
df_filtered.head(20)

Unnamed: 0,text
30,உங்களக்கதர யம் இந்தவ ட்டம ன்கள் நீங்கள் நல்ல இ...
55,நன்ப தகக்கப்படவதல் இந்தமந்தர சக்தத றயல் என்ற அ...
56,அதஎன்னநம்மவத்த அதனல் கடத்ததடன்மற்றம்சடனஇங்க
57,பதநீங்கள்தழர்களவளயஉள்ளன அங்கம டக்கம்உங்கள்
72,ஆமம் ந ன்என்னஅறமகம்மற்றம் அசத்தஎன்வல் இறககள்
83,நமதநம்பக்ககண்ட இரண்ட அவர்களக ப்பற்றஒன்றகதங்கள்...
105,கவலப்படவ ண்டம் தங்கள்இயற்க உணர்வகளஎப்பதம்எடத்த
123,எதவம்இல்லஇயற்கஇரப்பத பற்ற தக்கபத வழயல்அறமழவதம்
147,ஆனல்ந ன்சய்யமடயத நீங்கள்இந்த என்றல்அதஇல்லசய்யச...
150,நம்மீண்டம்வீட்டல்இரக்க ம் மன் ந ம்கட அதத ரயம்


In [None]:
# Draw the working subset. A fixed random_state makes the same rows come out on
# every Restart & Run All, and min() avoids the ValueError that sample() raises
# when fewer than 7000 rows survive the length filter.
df_filtered = df_filtered.sample(n=min(7000, len(df_filtered)), random_state=42)

In [None]:
print(df_filtered.iloc[0]['text'])

இங்கச ர்ந்தமழப கப்படம எடத்தப்பதமக்கயம் இல்லய


In [None]:
# Renumber rows 0..n-1 and discard the original corpus line numbers. The index
# matters downstream: images are written as tamil_image_<index>.png and
# create_huggingface_dataset rebuilds the same file names from the index.
df_filtered = df_filtered.reset_index(drop=True)

In [None]:
df_filtered.head()

Unnamed: 0,text
0,இங்கச ர்ந்தமழப கப்படம எடத்தப்பதமக்கயம் இல்லய
1,ஆண்ட உனக்கபதத இல்ல ஆனல்நீ இம்மறகற்றஞ்சட்டப்பட்...
2,பர்களம்தயர் எல்லரம்தய ர் கபத்தல்கந்தளக்கம்
3,அவர் எங்களடன்பணயற்றத்தடங்கம்பதஅவர்கள்சய்வர்கள்
4,நான் நாளை ஒரு சிகிச்சை வேண்டும் நீங்கள் அதை செ...


In [None]:
from PIL import Image, ImageDraw, ImageFont

In [None]:
def generate_images_with_proportional_fonts(df, font_paths, image_dir="images",
                                            base_dir="/content/drive/MyDrive"):
    """
    Render every text record of a DataFrame to a PNG image, splitting the
    records across several fonts in contiguous, near-equal shares.

    The records are divided as evenly as possible between the fonts (the first
    ``len(df) % len(font_paths)`` fonts get one extra record). Records are
    processed in DataFrame order: the first share is rendered with the first
    font, the next share with the second font, and so on. Each image is sized
    to 1.3x the text bounding box, has black text on a white background, and
    is saved as ``<base_dir>/<image_dir>/tamil_image_<index>.png`` where
    ``<index>`` is the row's DataFrame index label.

    Fonts are loaded at size 80 with the RAQM layout engine.
    NOTE(review): RAQM is presumably chosen for correct shaping of Tamil
    conjuncts and needs a Pillow build with libraqm - confirm in the target
    environment.

    Failures for a single record (missing font, rendering or saving error) are
    printed and the record is skipped; it still counts towards its font's share.

    Args:
        df (pd.DataFrame): DataFrame containing the text data in a 'text' column.
        font_paths (list): List of paths to the font files (TTF or OTF).
        image_dir (str): Directory (relative to ``base_dir``) to save the
            generated images. It is created if it does not exist.
        base_dir (str): Root directory under which ``image_dir`` is placed.
            Defaults to the mounted Google Drive root used by this notebook.

    Returns:
        None. Prints an error and returns early if ``font_paths`` is empty.
    """
    num_fonts = len(font_paths)
    num_records = len(df)

    if num_fonts == 0:
        print("Error: No font paths provided.")
        return

    # Imported locally under private aliases: a later notebook cell runs
    # `from datasets import Dataset, Image`, which rebinds the global `Image`
    # to the Hugging Face feature class and would break `Image.new` on re-run.
    import os
    from PIL import Image as _PILImage, ImageDraw as _ImageDraw, ImageFont as _ImageFont

    # Make sure the destination exists; image.save() does not create folders.
    output_dir = os.path.join(base_dir, image_dir)
    os.makedirs(output_dir, exist_ok=True)

    # Calculate proportional distribution of records per font
    records_per_font = [num_records // num_fonts] * num_fonts
    for i in range(num_records % num_fonts):
        records_per_font[i] += 1

    print(f"Records per font: {records_per_font}")

    # A 1x1 scratch canvas is enough to measure text; build it once.
    measure = _ImageDraw.Draw(_PILImage.new("RGB", (1, 1), "white"))
    # Cache of successfully loaded fonts so each file is parsed only once.
    # Fonts that fail to load are not cached, so the error is reported for
    # every record assigned to them (same as before).
    loaded_fonts = {}

    font_index = 0
    record_count = 0

    for index, row in df.iterrows():
        tamil_text = row['text']
        print(tamil_text)
        font_path = font_paths[font_index]
        image_path = os.path.join(output_dir, f"tamil_image_{index}.png")

        try:
            font = loaded_fonts.get(font_path)
            if font is None:
                font = _ImageFont.truetype(
                    font_path, size=80, layout_engine=_ImageFont.Layout.RAQM
                )
                loaded_fonts[font_path] = font

            # Get the text bounding box
            bbox = measure.textbbox((0, 0), tamil_text, font=font)
            text_width = bbox[2] - bbox[0]
            text_height = bbox[3] - bbox[1]

            # Calculate image size with a 30% margin around the text
            image_width = int(text_width * 1.3)
            image_height = int(text_height * 1.3)
            image = _PILImage.new("RGB", (image_width, image_height), "white")
            draw = _ImageDraw.Draw(image)

            # Top-left corner of a centred text box, nudged up by 5% of the
            # text height.
            x = (image_width - text_width) / 2
            y = (image_height - text_height) / 2 - (text_height * 0.05)

            # Draw the text. The coordinate passed is the centre of that box
            # because anchor="mm" (middle-middle) positions the text by its
            # centre, not by its top-left corner.
            draw.text(
                (x + text_width / 2, y + text_height / 2),
                tamil_text,
                fill="black",
                font=font,
                direction=None,
                anchor="mm"
            )

            # Save the image as PNG
            image.save(image_path, format="PNG")
            print(f"Image saved to {image_path} with font {font_path}")

        except FileNotFoundError:
            print(f"Error: Font file not found at {font_path}")
        except Exception as e:
            print(f"An error occurred: {e}")

        # Advance to the next font once the current font's share is used up.
        record_count += 1
        if record_count >= records_per_font[font_index]:
            record_count = 0
            font_index = (font_index + 1) % num_fonts


In [None]:
# Tamil fonts stored on the mounted Drive. Order matters:
# generate_images_with_proportional_fonts gives each font, in list order, a
# contiguous and near-equal share of the records.
font_paths = [
    "/content/drive/MyDrive/fonts/Hind_Madurai/HindMadurai-Regular.ttf",
    "/content/drive/MyDrive/fonts/Noto_Serif_Tamil/NotoSerifTamil_Condensed-Regular.ttf",
    "/content/drive/MyDrive/fonts/Kavivanar/Kavivanar-Regular.ttf",
    "/content/drive/MyDrive/fonts/Noto_Sans_Tamil/NotoSansTamil-VariableFont_wdth,wght.ttf",
    "/content/drive/MyDrive/fonts/Pavanam/Pavanam-Regular.ttf",
    "/content/drive/MyDrive/fonts/Anek_Tamil/AnekTamil-VariableFont_wdth,wght.ttf",
]

In [None]:
# Renders one PNG per row into /content/drive/MyDrive/Text_images
# (file name: tamil_image_<row index>.png). Prints two lines per record.
generate_images_with_proportional_fonts(df_filtered, font_paths, image_dir="Text_images")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
எனக்கநரக்கமனசலர் அந கமகஇத இயல்பனநடத்தஇல்ல
Image saved to /content/drive/MyDrive/Text_images/tamil_image_4500.png with font /content/drive/MyDrive/fonts/Noto_Sans_Tamil/NotoSansTamil-VariableFont_wdth,wght.ttf
ஒரஒரபரத பமனவஷயம்என்னன்ன அத பர்க்கந மஅங்கஇரக்கமடயததத ன்
Image saved to /content/drive/MyDrive/Text_images/tamil_image_4501.png with font /content/drive/MyDrive/fonts/Noto_Sans_Tamil/NotoSansTamil-VariableFont_wdth,wght.ttf
நீ மரத்தவத்தற்கமகச் சறந்த பங்களப்பசய்தரக்கறய்
Image saved to /content/drive/MyDrive/Text_images/tamil_image_4502.png with font /content/drive/MyDrive/fonts/Noto_Sans_Tamil/NotoSansTamil-VariableFont_wdth,wght.ttf
நன் நீங்கள் சந்தஷமகஇரக்கறீர்கள்எனநம்ப கற ம்
Image saved to /content/drive/MyDrive/Text_images/tamil_image_4503.png with font /content/drive/MyDrive/fonts/Noto_Sans_Tamil/NotoSansTamil-VariableFont_wdth,wght.ttf
இந்த வகயனவர்த்தகங்கள்மற்றலம்கள்வக்க ரயவஅல்ல
Image saved to /content/drive/MyDriv

In [None]:
from datasets import Dataset, Image
from PIL import Image as PIL_Image
import os

In [None]:
def create_huggingface_dataset(df, image_dir, dataset_name="tamil_text_images"):
    """
    Creates a Hugging Face dataset from a pandas DataFrame and corresponding images.

    For every row, the image is expected at
    ``<image_dir>/tamil_image_<index>.png`` where ``<index>`` is the row's
    DataFrame index label. Rows whose image is missing or cannot be opened by
    PIL are reported and skipped. The resulting dataset has two columns:
    'image' (Hugging Face ``Image`` feature) and 'text'.

    Images are validated and closed immediately, and only their paths are
    handed to ``datasets``; this avoids keeping one open file handle per
    sample for the lifetime of the call.

    Args:
        df (pd.DataFrame): DataFrame containing the text data in a 'text' column.
        image_dir (str): Directory where the images are stored.
        dataset_name (str, optional): Currently unused; kept so existing
            callers keep working. Defaults to "tamil_text_images".

    Returns:
        datasets.Dataset: The created Hugging Face dataset, or None if no
            image could be loaded.
    """
    texts = []
    image_paths = []

    for index, tamil_text in zip(df.index, df['text']):
        image_path = os.path.join(image_dir, f"tamil_image_{index}.png")

        if not os.path.exists(image_path):
            print(f"Warning: Image not found at {image_path}. Skipping.")
            continue

        try:
            # Open only to check that PIL recognises the file; the context
            # manager releases the file handle straight away.
            with PIL_Image.open(image_path):
                pass
        except Exception as e:
            print(f"Error loading image at {image_path}: {e}. Skipping.")
            continue

        image_paths.append(image_path)
        texts.append(tamil_text)

    if not image_paths:
        print("Error: No images were successfully loaded.")
        return None

    # Build the dataset from file paths, then cast the string column to the
    # Hugging Face Image feature so samples decode to PIL images on access.
    # (`Image` here is datasets.Image, imported alongside Dataset.)
    dataset = Dataset.from_dict({'image': image_paths, 'text': texts})
    dataset = dataset.cast_column("image", Image())

    print(f"Dataset created with {len(dataset)} samples.")
    return dataset

In [None]:
# Must be the directory the PNGs were generated into (Drive root + "Text_images").
image_dir = "/content/drive/MyDrive/Text_images"

In [None]:
# Pair each row of df_filtered with its rendered image; returns None if no image was found.
dataset = create_huggingface_dataset(df_filtered, image_dir)

Dataset created with 7000 samples.


In [None]:
dataset

Dataset({
    features: ['image', 'text'],
    num_rows: 7000
})

In [None]:
def save_huggingface_dataset(dataset, save_dir, dataset_name="tamil_synthetic_ocr_f"):
    """
    Write a Hugging Face dataset to ``<save_dir>/<dataset_name>`` on disk.

    Any exception raised while building the path or saving is caught and
    printed rather than propagated; on success the destination is printed.

    Args:
        dataset (datasets.Dataset): The Hugging Face dataset to save.
        save_dir (str): Directory where the dataset should be saved.
        dataset_name (str, optional): Name of the sub-directory the dataset is
            written to. Defaults to "tamil_synthetic_ocr_f".
    """
    try:
        # Destination is the dataset name nested under the chosen directory.
        target_path = os.path.join(save_dir, dataset_name)
        dataset.save_to_disk(target_path)
        print(f"Dataset saved to {target_path}")
    except Exception as err:
        print(f"Error saving dataset: {err}")

In [None]:
# Persist the dataset only if it was actually built (create_huggingface_dataset
# returns None when no image could be loaded).
if dataset is not None:
    save_dir = "/content/drive/MyDrive/"  # Replace with the directory where you want to save the dataset
    save_huggingface_dataset(dataset, save_dir)

Saving the dataset (0/1 shards):   0%|          | 0/7000 [00:00<?, ? examples/s]

Dataset saved to /content/drive/MyDrive/tamil_synthetic_ocr_f


In [None]:
from datasets import load_dataset

# Load the dataset from the Hugging Face Hub.
# NOTE(review): this rebinds `dataset` - it previously held the locally built
# Dataset and now holds the DatasetDict returned by load_dataset.
dataset = load_dataset("Nevidu/tamil_synthetic_ocr")

Generating data split: 7000 examples [00:00, 29087.17 examples/s]


In [None]:
dataset

DatasetDict({
    data: Dataset({
        features: ['image', 'text'],
        num_rows: 7000
    })
})

In [None]:
# Access the first example
first_example = dataset['data'][0] # The loaded DatasetDict has a single split named 'data' (not 'train')

# Get the image and text
image = first_example['image']
text = first_example['text']

print(f"Text: {text}")
# image.show() # Uncomment to display the image

Text: பரச்சனகளதீர்த்தவடவம்என்ற நம்பஎங்கள்வண்கலத்தவடவமத்தம்
