In [None]:
import datetime
import requests,io
import io

import numpy as np
from fpdf import FPDF
from gtts import gTTS
import easyocr, torch, os
import language_tool_python
from google.cloud import vision
from matplotlib import pyplot as plt
from google.cloud.vision_v1.types import Image
from google.oauth2.service_account import Credentials
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

In [1]:
def detect_handwritten_text(path):
    with io.open(path, 'rb') as image_file:
        content = image_file.read()

    image = Image(content=content)
    response = client.document_text_detection(image=image)
    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))
    
    page = response.full_text_annotation.pages[0]
    parapgraph_texts = ''
    for block in page.blocks:
        parapgraph_text = ''
        for paragraph in block.paragraphs:
            for word in paragraph.words:
                word_text = ''.join([
                    symbol.text for symbol in word.symbols
                ])
                parapgraph_text += word_text + ' '
        parapgraph_texts += parapgraph_text + ' '
    return parapgraph_texts

def detect_gantt_chart(path):
    with io.open(path, 'rb') as image_file:
        content = image_file.read()

    image = Image(content=content)
    response = client.document_text_detection(image=image)
    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))
    
    page = response.full_text_annotation.pages[0]
    parapgraph_texts = []
    for block in page.blocks:
        parapgraph_text = ''
        for paragraph in block.paragraphs:
            for word in paragraph.words:
                word_text = ''.join([
                    symbol.text for symbol in word.symbols
                ])
                parapgraph_text += word_text + ' '
        parapgraph_texts.append(parapgraph_text)

    deadlines = parapgraph_texts[-4:]
    tasks = parapgraph_texts[1:-5]

    task_dict = {}
    for i in range(len(tasks)):
        task_dict[tasks[i].strip()] = deadlines[i].strip()
    return task_dict

In [2]:
def zero_shot_text_classification(corrected_text):
    try:
        tokenized_products = tokenizer.encode_plus(
                                                    corrected_text, 
                                                    return_tensors='pt', 
                                                    max_length=len(corrected_text.split()), 
                                                    pad_to_max_length=True
                                                    )
        preditions = nli_model(**tokenized_products)
        logits = preditions.logits
        probs = torch.softmax(logits, dim=-1)
        probs = probs.detach().numpy()
        probs = np.array(probs, dtype=np.object)
        probs = probs[:, 1]
        output = candidates[np.argmax(probs)]
    except:
        predition = classifier(corrected_text, candidates, multi_label=True)
        output = predition['labels'][np.argmax(predition['scores'])]
    return output

def write_to_pdf(text, task_type, pdf=None):
    pdf_args = pdf
    if pdf is None:
        pdf = FPDF()
    pdf.add_page()

    if pdf_args is None:
        pdf.set_font("Arial", size = 25)
        pdf.cell(
                200, 10, 
                txt = ' '.join(task_type.split('_')),
                ln = 1, 
                align = 'C'
                )
    
    text = text.split()
    pdf.set_font("Arial", size = 12)

    max_characters_per_line = 100
    curr_line = ''
    for i in range(len(text)):
        if len(curr_line) + len(text[i]) > max_characters_per_line:
            pdf.cell(200, 10, txt = curr_line, ln = 1, align = 'L')
            curr_line = text[i] + ' '
        else:
            curr_line += text[i] + ' '


    return pdf

def extract_single_image_content(image_path):
    text = detect_handwritten_text(image_path)
    task_type = zero_shot_text_classification(text)
    write_to_pdf(text, task_type)
    return text, task_type

def component_01_pipeline(
                        hand_written_image_paths,
                        gantt_chart_image_path,
                        digidoc_path = "store/ocr/digidoc.pdf",
                        read_out_loud_path = "store/ocr/read_out_loud.mp3"
                        ):
    texts = []
    task_types = []

    for image_path in hand_written_image_paths:
        text, task_type = extract_single_image_content(image_path)
        texts.append(text)
        task_types.append(task_type)

    mode_task_type = max(set(task_types), key=task_types.count)
    for idx, text in enumerate(texts):
        if idx == 0:
            pdf = write_to_pdf(text, mode_task_type)
        else:
            pdf = write_to_pdf(text, mode_task_type, pdf=pdf)
        
    pdf.output(digidoc_path)

    gantt_chart_dict = detect_gantt_chart(gantt_chart_image_path)
    deadline = gantt_chart_dict[mode_task_type]

    today = datetime.date.today()
    deadline = datetime.datetime.strptime(deadline, '%d/%m/%Y').date()
    daysleft = deadline - today

    readable_text = 'The Task Type for provided handwritten images is ' + ' '.join(mode_task_type.split('_')) + '.\n'
    for idx, text in enumerate(texts):
        readable_text += 'Reading Page ' + str(idx + 1) + ':\n'
        readable_text += text + '\n\n'

    readable_text += 'The deadline for the task is ' + deadline.strftime('%d/%m/%Y') + '.\n'
    readable_text += 'You have ' + str(daysleft.days) + ' days left to complete the task.\n'
    read_obj = gTTS(
                    text=readable_text, 
                    lang='en', 
                    slow=False
                    )
    read_obj.save(read_out_loud_path)

    return deadline.strftime('%d/%m/%Y'), daysleft.days, digidoc_path, read_out_loud_path

In [4]:
hand_written_image_paths = [
                            'data/ocr/HandWrittenData/1.jpg',
                            'data/ocr/HandWrittenData/2.jpg',
                            'data/ocr/HandWrittenData/3.jpg'
                            ]
gantt_chart_image_path = 'data/ocr/Gantt.jpg'
component_01_pipeline(hand_written_image_paths, gantt_chart_image_path)

NameError: name 'io' is not defined