In [12]:
import json
import platform
import shutil

import azure.cognitiveservices.speech as speechsdk
import cv2
import easyocr
import pytesseract as pt
import requests
import yaml
from gtts import gTTS
from PIL import Image, ImageEnhance


In [5]:
# Save a contrast-adjusted copy of the screenshot next to the original.
# Pillow's contrast factor: 1.0 returns the image unchanged, values below 1.0
# lower the contrast and values above 1.0 raise it.
# NOTE(review): 0.8 therefore slightly *reduces* contrast, although this cell
# was previously commented as increasing/doubling it - confirm the intended factor.
CONTRAST_FACTOR = 0.8

with Image.open('../text_images/pfkm-6.png') as img:
    enhancer = ImageEnhance.Contrast(img)
    img_edit = enhancer.enhance(CONTRAST_FACTOR)

    # Write the adjusted image under a separate name so the original is kept.
    img_edit.save('../text_images/pfkm-6-enh.png')

In [6]:
# Path of the image that the OCR cells below read.
# NOTE(review): this is the original screenshot, not the '-enh' copy written by
# the contrast cell - confirm that is intended.
image_file = '../text_images/pfkm-6.png'

### Tesseract

In [7]:
# Point pytesseract at the Tesseract executable.
# Prefer whatever `tesseract` is found on PATH: the Homebrew Cellar path below
# embeds a version number and stops existing after an upgrade, and a PATH
# lookup also covers systems other than macOS/Windows. The per-OS locations
# are kept only as a fallback when the PATH lookup finds nothing.
tesseract_path = shutil.which('tesseract')
if tesseract_path is not None:
    pt.pytesseract.tesseract_cmd = tesseract_path
elif platform.system() == 'Darwin':  # macOS
    pt.pytesseract.tesseract_cmd = r'/opt/homebrew/Cellar/tesseract/5.3.4/bin/tesseract'
elif platform.system() == 'Windows':
    pt.pytesseract.tesseract_cmd = r"C:/Program Files/Tesseract-OCR/tesseract.exe"
else:
    print('Operating system not supported')

In [8]:
# get text from image - pytesseract
# Three passes over the same image: plain text, box/confidence data as a dict,
# and orientation/script detection (OSD) as a dict.
# The context manager closes the underlying image file once all three OCR
# calls have finished instead of leaving the handle open for the kernel's lifetime.
with Image.open(image_file) as image:
    text = pt.image_to_string(image)
    data = pt.image_to_data(image, output_type=pt.Output.DICT)
    osd = pt.image_to_osd(image, output_type=pt.Output.DICT)
print(text)

Anevia: "I lay low in the temple for the next few years, kept my head down. I washed floors, fetched water,
listened to sermons. Funny thing, after a while I started liking Desna's teachings. But as soon as I was old
enough, I was outta there — I left Nidal and got as far away as I could."
1. "Where are you from?"
. "Quite a ragtag group you've got here — from nobles to street thieves!"
. "How did you meet Irabeth?"
. "How did you and Irabeth end up in Kenabres?"
. "What is it like, living with Irabeth?"
. "Thank you for your answ:

anv fF Wb




In [16]:
# Show the orientation/script detection dict returned by pt.image_to_osd.
print(osd)

{'page_num': 0, 'orientation': 0, 'rotate': 0, 'orientation_conf': 16.69, 'script': 'Latin', 'script_conf': 2.68}


In [35]:
# Draw a green rectangle around every Tesseract entry whose confidence is above
# the threshold, then show the annotated image in an OpenCV window.
MIN_CONFIDENCE = 60  # entries with confidence <= this value are not drawn

img = cv2.imread(image_file)
if img is None:
    # cv2.imread reports an unreadable or missing file by returning None
    # rather than raising, which would otherwise fail later inside cv2.rectangle.
    raise FileNotFoundError(f'Could not read image: {image_file}')

n_boxes = len(data['text'])
for i in range(n_boxes):
    # float() instead of int(): depending on the pytesseract/Tesseract versions
    # the confidence can arrive as an int, a float or a string such as '96.06',
    # and int('96.06') raises ValueError.
    if float(data['conf'][i]) > MIN_CONFIDENCE:
        (x, y, w, h) = (data['left'][i], data['top'][i], data['width'][i], data['height'][i])
        img = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)

cv2.imshow('img', img)
cv2.waitKey(0)  # wait until a key is pressed in the image window
# Close the window explicitly so it does not linger after the cell returns.
cv2.destroyAllWindows()

-1

### EasyOCR

In [12]:
# get text from image - easyocr
# Build the EasyOCR reader for English; the following cell reuses it.

reader = easyocr.Reader(['en'])


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [13]:
# Run EasyOCR on the same image; with detail=0 and paragraph=True the value
# displayed in the next cell is a plain list of text strings.
result = reader.readtext(image_file, detail=0, paragraph=True)

In [14]:
# Display the EasyOCR output.
result

['Anevia: "I\'m from Nidal. And I wouldn wish my homeland on my worst enemy: You\'ve heard of the place, Im sure. Ruled by monsters that aren\'t alive nor dead, and the official religion is the cult of Zon-Kuthon. grew up in a slum, like a weed between the cobblestones_ I didn\'t have a dad; but I had lots of aunts and uncles_ my mom cronies. No prize for guessing the kind of business she was involved in. They gave me set of lockpicks as soon as [ could hold a spoon, and while other kids were picking their noses, was picking pockets:',
 'sc',
 'OeaHot',
 'KERN',
 'CONTINUE',
 'ANEVIA',
 '138']

### OCRopus

### CRAFT text detector

### PaddleOCR

# Google TTS (gTTS) library

In [14]:
# Text to speech with gTTS: synthesize the OCR text in English and store it as MP3.
gtts_output_path = '../audio_output/gtts-pfkm-6.mp3'
tts = gTTS(text=text, lang='en')
tts.save(gtts_output_path)

# Azure AI Speech SDK TTS

In [9]:
# Flatten the OCR text into a single line before speech synthesis.
# split() + join collapses every run of whitespace (line breaks, blank lines,
# tabs, form feeds) into one space and trims both ends, so blank lines in the
# OCR output no longer turn into double or trailing spaces.
text = ' '.join(text.split())
print(text)

Anevia: "I lay low in the temple for the next few years, kept my head down. I washed floors, fetched water, listened to sermons. Funny thing, after a while I started liking Desna's teachings. But as soon as I was old enough, I was outta there — I left Nidal and got as far away as I could." 1. "Where are you from?" . "Quite a ragtag group you've got here — from nobles to street thieves!" . "How did you meet Irabeth?" . "How did you and Irabeth end up in Kenabres?" . "What is it like, living with Irabeth?" . "Thank you for your answ:  anv fF Wb  


In [10]:
# Read the secrets file; later cells take the Azure key and region from it.
with open('../configs/secrets.yaml') as secrets_file:
    secrets = yaml.safe_load(secrets_file)

In [11]:
# convert text to speech - Azure AI Speech

# Create a speech config from the subscription key and service region.
speech_key = secrets['azure']['subscription_key']
service_region = secrets['azure']['service_region']

speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
# Note: the voice setting will not overwrite the voice element in input SSML.
speech_config.speech_synthesis_voice_name = "en-GB-RyanNeural"

# Send the synthesized audio to a WAV file (not to the default speaker).
audio_config = speechsdk.audio.AudioOutputConfig(filename="../audio_output/ai-pfkm-6.wav")

speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

# Dedicated name so this cell does not overwrite the EasyOCR `result` list.
synthesis_result = speech_synthesizer.speak_text_async(text).get()
# Check result
if synthesis_result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    print(f"Speech synthesized for text [{text}]")
elif synthesis_result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = synthesis_result.cancellation_details
    print(f"Speech synthesis canceled: {cancellation_details.reason}")
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        print(f"Error details: {cancellation_details.error_details}")



Speech synthesized for text [Anevia: "I lay low in the temple for the next few years, kept my head down. I washed floors, fetched water, listened to sermons. Funny thing, after a while I started liking Desna's teachings. But as soon as I was old enough, I was outta there — I left Nidal and got as far away as I could." 1. "Where are you from?" . "Quite a ragtag group you've got here — from nobles to street thieves!" . "How did you meet Irabeth?" . "How did you and Irabeth end up in Kenabres?" . "What is it like, living with Irabeth?" . "Thank you for your answ:  anv fF Wb  ]


In [30]:
# API version: synthesize a fixed SSML snippet through the Azure Speech REST
# endpoints and save the returned audio as a WAV file.

# requests applies no timeout by default; without one a stalled connection
# would block the cell indefinitely.
REQUEST_TIMEOUT_S = 30

subscription_key = secrets['azure']['subscription_key']
service_region = secrets['azure']['service_region']
tts_service_url = f'https://{service_region}.tts.speech.microsoft.com/cognitiveservices/v1'

fetch_token_url = f'https://{service_region}.api.cognitive.microsoft.com/sts/v1.0/issueToken'

# Exchange the subscription key for a bearer token.
token_response = requests.post(
    fetch_token_url,
    headers={'Ocp-Apim-Subscription-Key': subscription_key},
    timeout=REQUEST_TIMEOUT_S,
)
# Stop here on a rejected key/region instead of sending the error body as a token.
token_response.raise_for_status()
access_token = token_response.text

# Headers for the synthesis request, authenticated with the token.
headers = {
    'Authorization': 'Bearer ' + access_token,
    'Content-Type': 'application/ssml+xml',
    'X-Microsoft-OutputFormat': 'riff-24khz-16bit-mono-pcm'
}

ssml_text = """
<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>
    <voice name='en-US-AriaNeural'>Hello friend. We are going on an adventure. Buckle up!</voice>
</speak>
"""

# Send the body as explicit UTF-8 bytes so that non-ASCII text placed in the
# SSML is encoded predictably rather than by the HTTP layer's default.
response = requests.post(
    tts_service_url,
    headers=headers,
    data=ssml_text.encode('utf-8'),
    timeout=REQUEST_TIMEOUT_S,
)

if response.status_code == 200:
    with open('../audio_output/output_audio.wav', 'wb') as audio_file:
        audio_file.write(response.content)
    print("Audio saved to 'output_audio.wav'.")
else:
    print(f"Error: {response.status_code} - {response.reason}")


Audio saved to 'output_audio.wav'.
