In [32]:
from PIL import Image
import pytesseract

def image_to_text(image_path):
    """Run Tesseract OCR on an image file and return the recognized text.

    Args:
        image_path: Location of the image, passed unchanged to
            ``PIL.Image.open``.

    Returns:
        The string returned by ``pytesseract.image_to_string``.
    """
    # Image.open leaves the underlying file open until the image is closed;
    # the context manager releases it as soon as OCR has finished.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)

# Cell driver: OCR the sample question image and print the raw text.
# `text` is assigned at module level; the extract_table call in a later cell
# reads it.
if __name__ == '__main__':
    text = image_to_text('TestQuestion.png')
    print(text)

A company has three manufacturing plants, and company officials want to determine
whether there is a difference in the average age of workers at the three locations. The
following data are the ages of five randomly selected workers at each plant.

Plant (Employee Ages)

1 2 3

29 32 25
27 33 24
30 31 24
2 34 25
28 30 26

Perform a one-way ANOVA to determine whether there is a significant difference in the
mean ages of the workers at the three plants. Use « = 0.01 and note that the sample sizes
are equal.



In [33]:
import pandas as pd
import re


def extract_table(text, start_keyword, end_keyword):
    """Split text into a whitespace-delimited table and the remaining text.

    Lines are scanned in order. A line containing ``start_keyword`` switches
    table mode on and is itself part of the table; a line containing
    ``end_keyword`` (and not ``start_keyword``) switches it off and stays with
    the non-table text. Blank lines are never added to the table.

    The first table line is split on whitespace and used as the column
    header; every following table line becomes one row of string tokens.

    Args:
        text: The full text to scan, with lines separated by ``'\\n'``.
        start_keyword: Substring marking the first (header) line of the table.
        end_keyword: Substring marking the first line after the table.

    Returns:
        A ``(table, non_table_text)`` tuple. ``table`` is a ``pandas.DataFrame``
        (empty when no table line was found) and ``non_table_text`` is the
        remaining lines joined with ``'\\n'``.

        When the header has a different number of tokens than the widest row,
        the header cannot label the columns; in that case the header tokens
        are kept as the first data row of a frame with default integer
        columns instead of raising ``ValueError``.
    """
    table_lines = []
    non_table_text = []
    in_table = False

    for line in text.split('\n'):
        if start_keyword in line:
            in_table = True
        elif end_keyword in line:
            in_table = False

        if in_table and line.strip():
            table_lines.append(line)
        else:
            non_table_text.append(line)

    remaining = '\n'.join(non_table_text)
    if not table_lines:
        return pd.DataFrame(), remaining

    header, *rows = (line.split() for line in table_lines)
    try:
        table = pd.DataFrame(rows, columns=header)
    except ValueError:
        # Header width does not match the data width (common with noisy OCR):
        # keep every token rather than failing, mirroring the fallback used
        # by process_image in this notebook.
        table = pd.DataFrame([header] + rows)

    return table, remaining


# Cell driver: split the OCR output held in the module-level `text` variable.
# The "Plant (Employee Ages)" title line is the start marker (and therefore
# becomes the header row); the "Perform a one-way ANOVA" line ends the table.
table, non_table_text = extract_table(
    text, "Plant (Employee Ages)", "Perform a one-way ANOVA")
print("Table:")
print(table)
print("\nNon-table text:")
print(non_table_text)

Table:
  Plant (Employee Ages)
0     1         2     3
1    29        32    25
2    27        33    24
3    30        31    24
4     2        34    25
5    28        30    26

Non-table text:
A company has three manufacturing plants, and company officials want to determine
whether there is a difference in the average age of workers at the three locations. The
following data are the ages of five randomly selected workers at each plant.




Perform a one-way ANOVA to determine whether there is a significant difference in the
mean ages of the workers at the three plants. Use « = 0.01 and note that the sample sizes
are equal.



In [12]:
import cv2
import pytesseract
import numpy as np
import pandas as pd


def select_table_region(image_path):
    """Let the user drag a rectangle over the image and return it as an ROI.

    The image is previewed in an OpenCV window scaled to 1000x500. Press and
    release the left mouse button to mark two opposite corners, press ``r``
    to discard the current selection, and press ``c`` to confirm it.

    Args:
        image_path: Path of the image file, passed to ``cv2.imread``.

    Returns:
        ``[(left, top), (right, bottom)]`` in pixel coordinates of the image
        as loaded from disk (not of the resized preview), so the result can
        be used directly to slice ``cv2.imread(image_path)``.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for the path.
        ValueError: If ``c`` is pressed before a full rectangle was drawn.
    """
    display_w, display_h = 1000, 500  # size of the preview window contents

    original = cv2.imread(image_path)
    if original is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    orig_h, orig_w = original.shape[:2]

    image = cv2.resize(original, (display_w, display_h))
    clone = image.copy()
    table_roi = []

    def select_roi(event, x, y, flags, param):
        nonlocal table_roi, clone
        if event == cv2.EVENT_LBUTTONDOWN:
            table_roi = [(x, y)]
        elif event == cv2.EVENT_LBUTTONUP and len(table_roi) == 1:
            # Only close a rectangle that was actually started in the window.
            table_roi.append((x, y))
            cv2.rectangle(clone, table_roi[0], table_roi[1], (0, 255, 0), 2)
            cv2.imshow("image", clone)

    cv2.namedWindow("image")
    cv2.setMouseCallback("image", select_roi)
    try:
        while True:
            cv2.imshow("image", clone)
            key = cv2.waitKey(1) & 0xFF
            if key == ord("r"):
                # Reset both the drawing and the stored selection so the
                # returned ROI always matches what is on screen.
                clone = image.copy()
                table_roi = []
            elif key == ord("c"):
                break
    finally:
        # Close the window even if the loop is interrupted.
        cv2.destroyAllWindows()

    if len(table_roi) != 2:
        raise ValueError("No table region was selected before confirming")

    # Clamp to the preview, order the corners, then map preview coordinates
    # back to the coordinates of the unresized image.
    xs = sorted(min(max(point[0], 0), display_w) for point in table_roi)
    ys = sorted(min(max(point[1], 0), display_h) for point in table_roi)
    scale_x = orig_w / display_w
    scale_y = orig_h / display_h
    return [
        (int(round(xs[0] * scale_x)), int(round(ys[0] * scale_y))),
        (int(round(xs[1] * scale_x)), int(round(ys[1] * scale_y))),
    ]


def extract_text_from_image(image):
    """OCR ``image`` with Tesseract and return the recognized text.

    The image is handed to ``pytesseract.image_to_string`` together with the
    fixed option string ``--oem 3 --psm 6`` (engine mode 3, page segmentation
    mode 6; see the Tesseract command-line documentation for their meaning).
    """
    return pytesseract.image_to_string(image, config='--oem 3 --psm 6')


def process_image(image_path, table_roi):
    """OCR the table region and the rest of an image separately.

    Args:
        image_path: Path of the image file, passed to ``cv2.imread``.
        table_roi: Two ``(x, y)`` corner points of the table rectangle in
            pixel coordinates of the loaded image. The corners may be given
            in any order.

    Returns:
        ``(remaining_text, table_df)``. ``remaining_text`` is the OCR result
        of the image with the table rectangle zeroed out via a mask.
        ``table_df`` is a ``pandas.DataFrame`` built from the OCR text of the
        cropped rectangle: the first non-blank line supplies the column
        names and each later line is one row of whitespace-separated tokens.
        If that construction raises ``ValueError`` (header and data widths
        differ), all lines including the first are returned as data rows with
        default columns. If OCR finds no text in the rectangle, ``table_df``
        is empty.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for the path.
        ValueError: If ``table_roi`` does not contain two points.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    if len(table_roi) < 2:
        raise ValueError("table_roi must contain two (x, y) corner points")

    # Order the corners so a drag in any direction gives a non-inverted
    # slice, and keep them non-negative so slicing cannot wrap around.
    (xa, ya), (xb, yb) = table_roi[0], table_roi[1]
    left, right = sorted((max(0, xa), max(0, xb)))
    top, bottom = sorted((max(0, ya), max(0, yb)))
    table_image = image[top:bottom, left:right]

    # Mask is 255 everywhere except the table rectangle, which is filled
    # with 0 so bitwise_and blanks the table out of the text image.
    mask = np.ones(image.shape[:2], dtype="uint8") * 255
    cv2.rectangle(mask, (left, top), (right, bottom), 0, -1)
    text_image = cv2.bitwise_and(image, image, mask=mask)

    remaining_text = extract_text_from_image(text_image)

    table_text = extract_text_from_image(table_image)
    rows = [row.split() for row in table_text.split('\n') if row.strip()]
    if not rows:
        # Nothing recognized in the rectangle: rows[0] would raise IndexError.
        return remaining_text, pd.DataFrame()

    try:
        table_df = pd.DataFrame(rows[1:], columns=rows[0])
    except ValueError as ve:
        print("Error in DataFrame creation:", ve)
        table_df = pd.DataFrame(rows)

    return remaining_text, table_df


# Cell driver: pick the table rectangle interactively, then OCR the table and
# the surrounding text separately. select_table_region opens an OpenCV window
# and loops until the "c" key is pressed in it.
image_path = './TestQuestion.jpg'
table_roi = select_table_region(image_path)
remaining_text, table_df = process_image(image_path, table_roi)
print("Extracted Text:", remaining_text)
print("Extracted Table:")
print(table_df)

Extracted Text: PROBLEM ! cS as
 eilowing table gives the strength of concrete made with sand containing alterent
€ concrete was made into four cylinders which were
s for this five population given in the following table.
i es Strength (x) : :
| 4 ne ra SS a a Nee IE
ee i 1650 1580
| 0.05 420() 4 ae —- maa
eet SNS 1550 1445 a Ae a eee
| 3 ce oo fe 1645. 1545500
ee ae baodat aE 1625 1450 1510 eee
ee 0.5 fo Ge ea ga aps a 1590 2 ee
fee emi Es 1445
ees nee eee
ee ee A520 3

Extracted Table:
           SIE   BS        ean   men    asin   ohn    oe   age
0  percentages   of     coals.  Each  sample     ¢  None  None
1       tasted  for  strength.  None    None  None  None  None
2           Sg   yi         te    OB      ae   Cal   sm.   0/.


In [1]:
import cv2
import pytesseract
import numpy as np
import pandas as pd


def select_table_region(image_path):
    """Let the user drag a rectangle over the image and return it as an ROI.

    The image is shown at its native size in an OpenCV window. Press and
    release the left mouse button to mark two opposite corners, press ``r``
    to discard the current selection, and press ``c`` to confirm it.

    Args:
        image_path: Path of the image file, passed to ``cv2.imread``.

    Returns:
        ``[(left, top), (right, bottom)]`` in pixel coordinates of the image,
        with the corners ordered regardless of the drag direction.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for the path.
        ValueError: If ``c`` is pressed before a full rectangle was drawn.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    height, width = image.shape[:2]
    clone = image.copy()
    table_roi = []

    def select_roi(event, x, y, flags, param):
        nonlocal table_roi, clone
        if event == cv2.EVENT_LBUTTONDOWN:
            table_roi = [(x, y)]
        elif event == cv2.EVENT_LBUTTONUP and len(table_roi) == 1:
            # Only close a rectangle that was actually started in the window.
            table_roi.append((x, y))
            cv2.rectangle(clone, table_roi[0], table_roi[1], (0, 255, 0), 2)
            cv2.imshow("image", clone)

    cv2.namedWindow("image")
    cv2.setMouseCallback("image", select_roi)
    try:
        while True:
            cv2.imshow("image", clone)
            key = cv2.waitKey(1) & 0xFF
            if key == ord("r"):
                # Reset both the drawing and the stored selection so the
                # returned ROI always matches what is on screen.
                clone = image.copy()
                table_roi = []
            elif key == ord("c"):
                break
    finally:
        # Close the window even if the loop is interrupted.
        cv2.destroyAllWindows()

    if len(table_roi) != 2:
        raise ValueError("No table region was selected before confirming")

    # Clamp to the image and order the corners so the ROI can be used
    # directly as slice bounds.
    xs = sorted(min(max(point[0], 0), width) for point in table_roi)
    ys = sorted(min(max(point[1], 0), height) for point in table_roi)
    return [(xs[0], ys[0]), (xs[1], ys[1])]


def extract_text_from_image(image):
    """Return the text Tesseract recognizes in ``image``.

    Thin wrapper around ``pytesseract.image_to_string`` that always passes the
    option string ``--oem 3 --psm 6`` (engine mode 3, page segmentation mode
    6; see the Tesseract command-line documentation for their meaning).
    """
    return pytesseract.image_to_string(image, config='--oem 3 --psm 6')


def preprocess_image(image):
    """Outline large contours on ``image`` and return the same array.

    Steps: convert to grayscale, apply an inverse binary threshold at 150,
    dilate once with a 3x3 cross-shaped kernel, and collect the external
    contours. Every contour whose bounding box is at least 40 px wide and
    at least 40 px tall gets a 2 px rectangle in colour ``(255, 0, 255)``
    drawn directly onto ``image``; smaller contours are ignored.

    Args:
        image: Colour image array (converted with ``cv2.COLOR_BGR2GRAY``,
            so BGR channel order is expected). It is modified in place.

    Returns:
        The input ``image`` object, with the rectangles drawn on it.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    binary = cv2.threshold(grayscale, 150, 255, cv2.THRESH_BINARY_INV)[1]
    cross_kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
    grown = cv2.dilate(binary, cross_kernel, iterations=1)
    outlines, _hierarchy = cv2.findContours(
        grown, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

    # Lazy generator: each bounding box is computed right before it is used.
    boxes = (cv2.boundingRect(outline) for outline in outlines)
    for left, top, box_w, box_h in boxes:
        if box_h >= 40 and box_w >= 40:
            cv2.rectangle(image, (left, top),
                          (left + box_w, top + box_h), (255, 0, 255), 2)
    return image


def process_image(image_path, table_roi):
    """OCR the table region and the rest of an image separately.

    The loaded image is first passed through ``preprocess_image`` and the
    returned image is what gets cropped, masked and OCR'd.

    Args:
        image_path: Path of the image file, passed to ``cv2.imread``.
        table_roi: Two ``(x, y)`` corner points of the table rectangle in
            pixel coordinates of the loaded image. The corners may be given
            in any order.

    Returns:
        ``(remaining_text, table_df)``. ``remaining_text`` is the OCR result
        of the image with the table rectangle zeroed out via a mask.
        ``table_df`` is a ``pandas.DataFrame`` built from the OCR text of the
        cropped rectangle: the first non-blank line supplies the column
        names and each later line is one row of whitespace-separated tokens.
        If that construction raises ``ValueError`` (header and data widths
        differ), all lines including the first are returned as data rows with
        default columns. If OCR finds no text in the rectangle, ``table_df``
        is empty.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for the path.
        ValueError: If ``table_roi`` does not contain two points.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    if len(table_roi) < 2:
        raise ValueError("table_roi must contain two (x, y) corner points")
    image = preprocess_image(image)

    # Order the corners so a drag in any direction gives a non-inverted
    # slice, and keep them non-negative so slicing cannot wrap around.
    (xa, ya), (xb, yb) = table_roi[0], table_roi[1]
    left, right = sorted((max(0, xa), max(0, xb)))
    top, bottom = sorted((max(0, ya), max(0, yb)))
    table_image = image[top:bottom, left:right]

    # Mask is 255 everywhere except the table rectangle, which is filled
    # with 0 so bitwise_and blanks the table out of the text image.
    mask = np.ones(image.shape[:2], dtype="uint8") * 255
    cv2.rectangle(mask, (left, top), (right, bottom), 0, -1)
    text_image = cv2.bitwise_and(image, image, mask=mask)

    remaining_text = extract_text_from_image(text_image)

    table_text = extract_text_from_image(table_image)
    rows = [row.split() for row in table_text.split('\n') if row.strip()]
    if not rows:
        # Nothing recognized in the rectangle: rows[0] would raise IndexError.
        return remaining_text, pd.DataFrame()

    try:
        table_df = pd.DataFrame(rows[1:], columns=rows[0])
    except ValueError as ve:
        print("Error in DataFrame creation:", ve)
        table_df = pd.DataFrame(rows)

    return remaining_text, table_df


# Cell driver: pick the table rectangle interactively, then OCR the table and
# the surrounding text separately. select_table_region opens an OpenCV window
# and loops until the "c" key is pressed in it, so this cell blocks until then.
image_path = './TestQuestion.png'
table_roi = select_table_region(image_path)
remaining_text, table_df = process_image(image_path, table_roi)
print("Extracted Text:", remaining_text)
print("Extracted Table:")
print(table_df)

KeyboardInterrupt: 