**Set up**

In [None]:
!apt-get install tesseract-ocr-ara
!pip install pytesseract
!pip install pillow

In [None]:
import pytesseract
from PIL import Image
import requests
from io import BytesIO
from PIL import Image, ImageDraw, ImageFont,ImageFilter
import os
import textwrap
import pandas as pd

**Test OCR for Arabic text**

In [None]:
# Smoke test: download a sample image containing Arabic text and run it through
# Tesseract with the Arabic language model.
image_url = 'https://andreasmhallberg.github.io/images/2017-03-03/utopia.png'
# A timeout keeps the cell from hanging forever if the host does not answer.
response = requests.get(image_url, timeout=30)
# Fail here on an HTTP error instead of handing an error page to PIL.
response.raise_for_status()
img = Image.open(BytesIO(response.content))
text = pytesseract.image_to_string(img, lang='ara')
print(text)

**Load dataset**

In [None]:
# Load the summarization dataset from a CSV file located relative to the working directory.
# Later cells read its 'Text' and 'Summary' columns.
morrocan_summarization_dataset = pd.read_csv("./darija19k.csv")

**Generate images from textual input and run OCR on the created image**

In [None]:
def generate_image_with_text(text, save_dir, image_index, image_width=800, max_noise_level=100, blur_radius=1.6):
    """Render ``text`` on a white image, blur it, save it as PNG and OCR it.

    The text is wrapped at 70 characters per line, drawn in black with the
    Noto Naskh Arabic font (size 15), blurred with a Gaussian filter, written
    to ``<save_dir>/<image_index>.png`` and finally passed to Tesseract using
    the Arabic (``'ara'``) language model.

    Args:
        text: The string to render.
        save_dir: Directory the PNG is written to. It is created if missing.
        image_index: Value used as the file name (without extension).
        image_width: Width of the generated image in pixels.
        max_noise_level: Currently unused; kept so existing callers that pass
            it keep working.
        blur_radius: Radius handed to ``ImageFilter.GaussianBlur``.

    Returns:
        The raw string returned by ``pytesseract.image_to_string`` for the
        blurred image.
    """
    background_color = (255, 255, 255)
    text_color = (0, 0, 0)
    line_block_height = 70  # canvas height reserved per wrapped line
    line_step = 60          # vertical offset between consecutive draw positions

    font = ImageFont.truetype('./fonts/Noto_Naskh_Arabic/NotoNaskhArabic-VariableFont_wght.ttf', 15)
    lines = textwrap.fill(text, width=70).split('\n')

    # Allocate the canvas at its final size straight away instead of creating a
    # 1-pixel-high image and resizing it.
    img = Image.new('RGB', (image_width, line_block_height * len(lines)), background_color)
    draw = ImageDraw.Draw(img)
    for line_number, line in enumerate(lines):
        draw.text((0, line_number * line_step), line, font=font, fill=text_color)

    img_blurred = img.filter(ImageFilter.GaussianBlur(radius=blur_radius))

    # Make sure the target directory exists so the function also works when
    # called before the directory has been created elsewhere.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    img_blurred.save(os.path.join(save_dir, f"{image_index}.png"))

    return pytesseract.image_to_string(img_blurred, lang='ara')


**Tesseract output postprocessing**

In [None]:
def remove_special_characters(text: str) -> str:
    r"""Flatten multi-line OCR output into a single line of text.

    Line breaks (``\n``, including repeated ones such as ``\n\n``) and form
    feeds (``\x0c``) are treated as separators: the text is split on them,
    each piece is stripped of surrounding whitespace, empty pieces are
    dropped, and the remaining pieces are joined with a single space.

    Joining with a space, rather than deleting the separators outright,
    keeps the last word of one line from being glued to the first word of
    the next one.

    Args:
        text: Raw text, e.g. as returned by Tesseract.

    Returns:
        The text without ``\n`` or ``\x0c`` characters.
    """
    # Treat the form feed like a line break so both are handled by one split.
    pieces = (piece.strip() for piece in text.replace("\x0c", "\n").split("\n"))
    return " ".join(piece for piece in pieces if piece)


**Data Generation**

In [None]:
# Directory that receives the generated PNG images.
directory = "./generated_images"
# exist_ok=True makes this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(directory, exist_ok=True)

In [None]:
def generation_script(images_save_dir, summary_dataframe):
    """Render every text of the dataset as an image, OCR it, and collect the results.

    For each row, the value of the ``'Text'`` column is passed to
    ``generate_image_with_text`` together with the row's index label (used as
    the image file name), and the OCR result is cleaned with
    ``remove_special_characters``.

    The index label is taken directly from the row being processed. Looking
    it up by searching the frame for the text is quadratic and, for texts
    that occur more than once, always yields the first occurrence, so later
    duplicates would overwrite that image instead of getting their own file.

    Args:
        images_save_dir: Directory the generated images are written to.
        summary_dataframe: DataFrame with ``'Text'`` and ``'Summary'`` columns.

    Returns:
        A new DataFrame with the columns ``'original_text'``,
        ``'tesseract_output'``, ``'summary'`` and ``'index'``.
    """
    texts = summary_dataframe['Text']
    cleaned_ocr = [
        remove_special_characters(generate_image_with_text(text, images_save_dir, row_index))
        for row_index, text in texts.items()
    ]
    new_df = pd.DataFrame({
        'original_text': texts,
        # Re-attach the source index so the column lines up with the others.
        'tesseract_output': pd.Series(cleaned_ocr, index=summary_dataframe.index),
        'summary': summary_dataframe['Summary'],
        'index': summary_dataframe.index
    })
    return new_df

In [None]:
# Run the pipeline on the loaded dataset: generation_script renders each 'Text' value
# as an image under `directory`, OCRs it, and returns the results as a new DataFrame.
generated_data_with_tesseract_ouput = generation_script(directory, morrocan_summarization_dataset)