# ImageMagick command (renders each PDF page to a 300 DPI JPEG on a white background):

```
magick -density 300 input.pdf -background white -alpha remove -alpha off output_%03d.jpg
```

In [1]:
import cv2 as cv
import numpy as np
import os
from PIL import Image
import pandas as pd

# Question Extraction

In [30]:
def searchImg(haystack_img, needle_img, threshold):
    """Find where ``needle_img`` occurs inside ``haystack_img``.

    Template matching (``cv.TM_CCOEFF_NORMED``) usually fires on several
    neighbouring pixels for one visual occurrence, so consecutive hits whose
    rows are no more than one needle height apart are collapsed into the
    first hit of that run.

    Args:
        haystack_img: Image to search in.
        needle_img: Template to look for.
        threshold: Minimum normalized correlation score for a hit.

    Returns:
        An ``(n, 2)`` integer array of ``(x, y)`` top-left corners ordered
        from top to bottom, or ``None`` when no score reaches ``threshold``.
    """
    result = cv.matchTemplate(haystack_img, needle_img, cv.TM_CCOEFF_NORMED)

    # np.where scans row-major, so ``rows`` comes back in ascending order.
    rows, cols = np.where(result >= threshold)

    # No matches were found
    if rows.size == 0:
        return None

    # A jump of more than one needle height between consecutive hit rows
    # marks the start of a new occurrence; index 0 always starts the first.
    height = needle_img.shape[0]
    cluster_starts = np.where(np.diff(rows) > height)[0] + 1
    selected_indices = np.concatenate(([0], cluster_starts))

    return np.column_stack((cols[selected_indices], rows[selected_indices]))

def getQuestionRegions(haystack, locations_black_square, locations_question):
    """Pair every black-square location with the point that ends its region.

    Each region starts at a black square and ends at the first entry of
    ``locations_question`` that lies strictly below it (greater ``y``).
    A square with no question below it runs to the bottom-right corner of
    the page instead, so every square yields exactly one region.

    Args:
        haystack: Page image; only its height and width are read.
        locations_black_square: Sequence of ``(x, y)`` square positions.
        locations_question: Sequence of ``(x, y)`` question positions,
            scanned in the given order.

    Returns:
        A list of ``[start, end]`` pairs, one per black square, in the same
        order as ``locations_black_square``.
    """
    haystack_height, haystack_width = haystack.shape[:2]
    page_end = np.array([haystack_width, haystack_height])
    regions = []

    for loc_square in locations_black_square:
        for loc_quest in locations_question:
            if loc_quest[1] > loc_square[1]:
                regions.append([loc_square, loc_quest])
                break
        else:
            # Nothing below this square: the region extends to the page end.
            regions.append([loc_square, page_end])

    return regions

def trimWhitespace(image):
    """Crop a BGR image to the bounding box of its non-white content.

    Pixels whose grayscale value is at or below 240 count as content. The
    crop is the single rectangle enclosing all external contours of that
    content, not just the largest one.

    Args:
        image: BGR image to crop.

    Returns:
        The cropped view of ``image``. An image with no content at all is
        returned unchanged, so callers always receive an array.
    """
    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)

    # Inverted threshold: content becomes 255, near-white background 0.
    _, content_mask = cv.threshold(gray, 240, 255, cv.THRESH_BINARY_INV)

    contours, _ = cv.findContours(content_mask, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)

    if not contours:
        # Entirely white: there is nothing to trim.
        return image

    # Merge all contour points so the box covers every piece of content.
    x, y, w, h = cv.boundingRect(np.concatenate(contours))
    return image[y:y+h, x:x+w]

def extractRegions(regions, haystack_img, needle_width, needle_height):
    """Cut each region out of the page and trim its surrounding whitespace.

    For a region ``[start, end]`` the crop spans:
      * rows from ``4/5`` of a needle height above ``start`` (clamped to the
        top of the page) down to half a needle height above ``end``;
      * columns from ``start``'s x to three quarters of the page width
        further right (NumPy slicing clips this at the page edge).

    Args:
        regions: Iterable of ``[start, end]`` pairs of ``(x, y)`` points.
        haystack_img: Page image the regions refer to.
        needle_width: Unused; kept so existing callers keep working.
        needle_height: Height of the marker template, used for the vertical
            offsets described above.

    Returns:
        A list with one trimmed crop per region, in input order.
    """
    haystack_width = haystack_img.shape[1]

    extracted_regions = []

    for reg_start, reg_end in regions:
        left = reg_start[0]
        # A negative start row would wrap around to the bottom of the page
        # in a NumPy slice, so clamp it to the first row.
        top = max(0, reg_start[1] - 4*needle_height//5)
        right = left + 3*haystack_width//4
        bottom = reg_end[1] - needle_height//2

        # Extract the region of interest (ROI) from the original image
        extracted_regions.append(trimWhitespace(haystack_img[top:bottom, left:right]))

    return extracted_regions

def getWhiteIntervals(img):
    """Return the runs of fully white rows that are followed by content.

    A row is white when none of its pixels has a grayscale value at or
    below 240. Each run is reported as ``[first_row, last_row]`` (inclusive).
    A white run that reaches the bottom edge of the image is never closed
    and therefore not reported.
    """
    gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)

    # Inverted threshold: content pixels become 255, white background 0.
    _, content_mask = cv.threshold(gray, 240, 255, cv.THRESH_BINARY_INV)

    runs = []
    run_start = None

    for row_idx, row in enumerate(content_mask):
        row_is_white = not row.any()

        if run_start is None:
            if row_is_white:
                run_start = row_idx
        elif not row_is_white:
            # First content row after a white run closes that run.
            runs.append([run_start, row_idx - 1])
            run_start = None

    return runs

def splitLines(img):
    """Split an image into horizontal bands separated by white rows.

    The white-row runs found by ``getWhiteIntervals`` act as separators.
    When more than five bands result, only the first band and the five
    tallest ones start a new piece; every other band is merged into the
    piece before it. Each piece is whitespace-trimmed before being returned.
    """
    img_height, img_width = img.shape[0], img.shape[1]

    # Sentinel gaps so the first band starts at row 0 and the last band
    # ends at the bottom of the image.
    gaps = [[-1, 0]] + getWhiteIntervals(img) + [[img_height, -1]]

    bands = [[upper[1], lower[0]] for upper, lower in zip(gaps, gaps[1:])]

    # Get only the 5 greatest intervals
    if len(bands) > 5:
        band_heights = np.array([abs(band[0] - band[1]) for band in bands])
        tallest = np.argsort(band_heights)[-5:][::-1]

        merged = []
        for idx, band in enumerate(bands):
            if idx == 0 or idx in tallest:
                merged.append(band)
            else:
                # Absorb this band into the previous piece.
                merged[-1][1] = band[1]
        bands = merged

    return [trimWhitespace(img[band[0]:band[1], 0:img_width]) for band in bands]

import numpy as np
import cv2 as cv

def getVerticalWhiteIntervals(img, threshold=1):
    """Return the runs of fully white columns at least ``threshold`` wide.

    A column is white when none of its pixels has a grayscale value at or
    below 240. Each run is reported as ``[first_col, last_col]`` (inclusive),
    including a run that reaches the right edge of the image.
    """
    gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)

    # Inverted threshold: content pixels become 255, white background 0.
    _, content_mask = cv.threshold(gray, 240, 255, cv.THRESH_BINARY_INV)

    column_has_content = content_mask.any(axis=0)
    total_cols = content_mask.shape[1]

    runs = []
    run_start = None

    for col_idx in range(total_cols):
        if column_has_content[col_idx]:
            if run_start is not None:
                # Run covers run_start .. col_idx - 1, i.e. col_idx - run_start columns.
                if col_idx - run_start >= threshold:
                    runs.append([run_start, col_idx - 1])
                run_start = None
        elif run_start is None:
            run_start = col_idx

    # A run still open here extends to the last column.
    if run_start is not None and total_cols - run_start >= threshold:
        runs.append([run_start, total_cols - 1])

    return runs

def splitVerticalLines(img, threshold=1):
    """Split an image into vertical strips separated by white columns.

    White-column runs of at least ``threshold`` columns act as separators.
    When more than five strips result, only the first strip and the five
    widest ones start a new piece; every other strip is merged into the
    piece before it. Each piece is whitespace-trimmed before being returned.
    """
    _, img_width = img.shape[:2]

    # Sentinel gaps so the first strip starts at column 0 and the last one
    # ends at the right edge.
    gaps = getVerticalWhiteIntervals(img, threshold=threshold)
    gaps = [[-1, 0], *gaps, [img_width, -1]]

    strips = [[left_gap[1], right_gap[0]] for left_gap, right_gap in zip(gaps, gaps[1:])]

    # Keep only the 5 widest intervals
    if len(strips) > 5:
        widths = [abs(right - left) for left, right in strips]
        widest = set(np.argsort(widths)[-5:].tolist())

        merged = []
        for idx, strip in enumerate(strips):
            if idx != 0 and idx not in widest:
                # Absorb this strip into the previous piece.
                merged[-1][1] = strip[1]
            else:
                merged.append(strip)
        strips = merged

    return [trimWhitespace(img[:, left:right]) for left, right in strips]

def splitGridLines(img, threshold=100):
    """Split an image into grid cells: columns first, then rows per column.

    ``threshold`` is the minimum width of a white gap that separates two
    columns. Cells are returned column by column, top to bottom within
    each column.
    """
    cells = []
    for column_img in splitVerticalLines(img, threshold):
        cells.extend(splitLines(column_img))
    return cells

def processQuestions(img, needle_width, pad=15):
    """Pad the bottom of a 3-channel image with white and cut off its left edge.

    ``pad`` white rows are appended below ``img``; the leftmost
    ``int(1.1 * needle_width)`` columns are then dropped from the result.
    """
    columns_to_drop = int(1.1*needle_width)
    bottom_padding = np.full((pad, img.shape[1], 3), 255, dtype=np.uint8)
    padded = np.concatenate((img, bottom_padding), axis=0)
    return padded[:, columns_to_drop:]

# def rewritePage(haystack, regions, quest_items, height_step=50):
#     haystack_width = haystack.shape[1]
#     positions = []
#     for region, quest in zip(regions, quest_items):
#         haystack[region[0][1]-height_step:region[1][1], region[0][0]:haystack_width] = (255, 255, 255)
        
#         positions_list = [i for i in range(len(quest))]
#         paired_list = list(zip(quest, positions_list))
#         np.random.shuffle(paired_list)
#         shuffle_quest, positions_list = zip(*paired_list)
#         positions.append(list(positions_list))

#         y_offset = region[0][1]
#         for c, quest_ in enumerate(shuffle_quest):
#             h, w = quest_.shape[:2]

#             haystack[y_offset:y_offset+h, region[0][0]:region[0][0]+w] = quest_
#             y_offset += h

#     return haystack, positions

def rewritePage(haystack, regions, quest_items, height_step=50, padding=20, margin=300):
    """Blank out each region on the page and paste its items back shuffled.

    For every ``(region, items)`` pair the area from ``height_step`` rows
    above the region start down to the region end, and from the region's
    left edge to the right edge of the page, is painted white. The items
    are then pasted in random order, left to right, wrapping to a new row
    whenever the next item would cross ``haystack_width - margin``.

    Args:
        haystack: Page image; modified in place.
        regions: Sequence of ``[start, end]`` pairs of ``(x, y)`` points.
        quest_items: One sequence of item images per region.
        height_step: Rows above the region start that are also cleared and
            where pasting begins.
        padding: Gap in pixels between pasted items and between rows.
        margin: Width kept free at the right edge of the page.

    Returns:
        ``(haystack, positions)`` where ``positions[i][k]`` is the original
        index of the item pasted in slot ``k`` of region ``i``. A region with
        no items contributes an empty list.
    """
    haystack_height, haystack_width = haystack.shape[:2]
    positions = []

    for region, quest in zip(regions, quest_items):
        x_start = region[0][0]
        # Clamp to the first row: a negative row index would wrap around to
        # the bottom of the page.
        y_start = max(0, region[0][1] - height_step)

        # Clear the region to white
        haystack[y_start:region[1][1], x_start:haystack_width] = (255, 255, 255)

        # Shuffle the original indices; the items are pasted in that order.
        order = list(range(len(quest)))
        np.random.shuffle(order)
        positions.append(order)

        current_x = x_start
        current_y = y_start
        max_line_height = 0

        for original_index in order:
            item = quest[original_index]
            h, w = item.shape[:2]

            # Check if item fits in the current row
            if current_x + w > (haystack_width - margin):
                # Wrap to next line
                current_x = x_start
                current_y += max_line_height + padding
                max_line_height = 0

            # Check if item fits vertically on the haystack
            if current_y + h > haystack_height:
                print("Warning: Not enough space to paste more items in the current region.")
                break

            # Paste the item
            haystack[current_y:current_y+h, current_x:current_x+w] = item

            # Advance the cursor and remember the tallest item of this row.
            current_x += w + padding
            max_line_height = max(max_line_height, h)

    return haystack, positions


In [33]:
# Templates searched for on every page: the black-square marker and the
# "Questao" marker (presumably the heading of each question - confirm
# against the template images).
needle_img = cv.imread("data/Black_square_new.jpg", cv.IMREAD_COLOR)
needle_height = needle_img.shape[0]
needle_width = needle_img.shape[1]

questao_img = cv.imread("data/2024-Questao.jpg", cv.IMREAD_UNCHANGED)
questao_img = cv.cvtColor(questao_img, cv.COLOR_BGRA2BGR)

# Count the rendered pages instead of hard-coding their number; only files
# following the output_NNN.jpg naming are counted so stray files are ignored.
pages_dir = "data/splitted_pages-2024-1/"
total_pages = sum(
    1
    for file_name in os.listdir(pages_dir)
    if file_name.startswith("output_") and file_name.endswith(".jpg")
)
new_pages = []
new_positions = []

for n_page in range(total_pages):
    page_path = f"{pages_dir}output_{n_page:03}.jpg"
    haystack_img = cv.imread(page_path, cv.IMREAD_COLOR)
    print(page_path)
    if haystack_img is None:
        # cv.imread signals an unreadable file by returning None.
        raise FileNotFoundError(f"Could not read page image: {page_path}")
    haystack_width = haystack_img.shape[1]
    haystack_height = haystack_img.shape[0]

    # searchImg returns None if no match was found; such pages are kept
    # untouched and recorded with a None placeholder.
    locations_black_square = searchImg(haystack_img, needle_img, 0.7)
    if locations_black_square is None:
        new_pages.append(haystack_img)
        new_positions.append(None)
        continue

    locations_questao = searchImg(haystack_img, questao_img, 0.7)
    if locations_questao is None:
        new_pages.append(haystack_img)
        new_positions.append(None)
        continue

    # Getting the regions of interest
    regions = getQuestionRegions(haystack_img, locations_black_square, locations_questao)

    # Getting the actual images from those regions
    regions_extracted = extractRegions(regions, haystack_img, needle_width, needle_height)

    # Getting each item splitted from the questions found
    quest_items = [splitGridLines(region_img) for region_img in regions_extracted]

    # Pad every item and cut off its left edge (see processQuestions)
    trimmed_quest_items = [
        [processQuestions(item_img, needle_width) for item_img in region_items]
        for region_items in quest_items
    ]

    # Rewriting a copy of the page with the items in random order
    new_haystack = haystack_img.copy()
    page, positions = rewritePage(new_haystack, regions, trimmed_quest_items)

    new_pages.append(page)
    new_positions.append(positions)

data/splitted_pages-2024-1/output_000.jpg
data/splitted_pages-2024-1/output_001.jpg
data/splitted_pages-2024-1/output_002.jpg
data/splitted_pages-2024-1/output_003.jpg
data/splitted_pages-2024-1/output_004.jpg
data/splitted_pages-2024-1/output_005.jpg
data/splitted_pages-2024-1/output_006.jpg
data/splitted_pages-2024-1/output_007.jpg
data/splitted_pages-2024-1/output_008.jpg
data/splitted_pages-2024-1/output_009.jpg
data/splitted_pages-2024-1/output_010.jpg
data/splitted_pages-2024-1/output_011.jpg
data/splitted_pages-2024-1/output_012.jpg
data/splitted_pages-2024-1/output_013.jpg
data/splitted_pages-2024-1/output_014.jpg
data/splitted_pages-2024-1/output_015.jpg
data/splitted_pages-2024-1/output_016.jpg
data/splitted_pages-2024-1/output_017.jpg
data/splitted_pages-2024-1/output_018.jpg
data/splitted_pages-2024-1/output_019.jpg
data/splitted_pages-2024-1/output_020.jpg
data/splitted_pages-2024-1/output_021.jpg
data/splitted_pages-2024-1/output_022.jpg


In [None]:
# Debug aid: dump the second processed page to disk for visual inspection.
cv.imwrite("test.jpg", new_pages[1])

True

In [34]:
# Convert every processed page from OpenCV's BGR order to RGB for PIL, then
# save all of them as one multi-page PDF (first page plus appended pages).
pil_images = []
for page_bgr in new_pages:
    page_rgb = cv.cvtColor(page_bgr, cv.COLOR_BGR2RGB)
    pil_images.append(Image.fromarray(page_rgb))

first_page = pil_images[0]
first_page.save("output.pdf", save_all=True, append_images=pil_images[1:])

In [374]:
# Flatten the per-page shuffle records into one list with one entry per
# question; pages that were left untouched keep a single None placeholder.
flattened_list = []
for sublist in new_positions:
    if sublist is None:
        flattened_list.append(None)
        continue

    flattened_list.extend(sublist)

In [375]:
# NOTE(review): the sixth entry repeats "E" - confirm whether "F" was meant
# or whether a sixth slot is deliberately reported as "E".
letters_list = ["A", "B", "C", "D", "E", "E"]

answer_sheet_even = {}
answer_sheet_odd = {}
quest_counter = 1

# For every shuffled question, look up the slot that now holds the item
# that was originally first and record that slot's letter. Questions are
# numbered from 1 and split into an odd-numbered and an even-numbered sheet.
for shuffled_order in flattened_list:
    if shuffled_order is None:
        continue

    target_sheet = answer_sheet_odd if quest_counter % 2 else answer_sheet_even
    target_sheet[quest_counter] = letters_list[shuffled_order.index(0)]

    quest_counter += 1

In [376]:
# Build a two-column table from the answer sheets: "Answers-1" holds the
# letters of the odd-numbered questions and "Answers-2" those of the
# even-numbered ones. reset_index(drop=True) discards the question numbers,
# so both columns are aligned by position.
df = pd.DataFrame()
df["Answers-1"] = pd.DataFrame([answer_sheet_odd]).T.reset_index(drop=True)
df["Answers-2"] = pd.DataFrame([answer_sheet_even]).T.reset_index(drop=True)
# Emit the table as LaTeX source.
print(df.to_latex())

\begin{tabular}{lrr}
\toprule
 & Answers-1 & Answers-2 \\
\midrule
\bottomrule
\end{tabular}



In [377]:
# Debug aid: dump the first processed page to disk for visual inspection.
cv.imwrite("test.jpg", new_pages[0])

True

In [None]:
# NOTE(review): no image argument is passed here, unlike the other
# cv.imwrite cells in this file, which pass an entry of new_pages as the
# second argument - confirm which page was meant to be written.
cv.imwrite("test1.jpg")

True