## Import packages and objects

In [6]:
import os
from skimage import io
import cv2
from skimage.filters import threshold_otsu, sobel
from skimage import img_as_ubyte
from skimage.color import rgb2gray
import numpy as np
import matplotlib as plt
from PIL import Image
from pytesseract import pytesseract
import re
import json

# Location of scraped images (input folder read by the ratio filter and by OCR)
image_directory = "C:\\Users\\KevinsAcer\\OneDrive - North Carolina State University\\CrosswordProject\\Images\\Connections"

# Location of grayscaled images (output folder written during OCR pre-processing)
connections_gray_dir = "C:\\Users\\KevinsAcer\\OneDrive - North Carolina State University\\CrosswordProject\\Images\\Connections_gray"

# Location of the Tesseract executable (required for PyTesseract call to work)
tesseract_directory = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"

# Make sure both folders exist. The grayscale folder is written to later on,
# so a missing folder there would make every grayscale save fail.
os.makedirs(image_directory, exist_ok=True)
os.makedirs(connections_gray_dir, exist_ok=True)


## Identify the images that are solutions

The data mining pulled 2-3 images per day, only 1 of which is the final answer. A visual check suggested that all the final answer images had a width-to-height ratio greater than 2, while the other images all had a width-to-height ratio of less than 2.

The following code block calculates the width-to-height ratio of all 1,235 images and, if their ratio is > 2, adds the name of the file to a list called "answers".

This resulted in 419 final answer images out of 428 days in the date range. I visually inspected ~20 images from the answers list and they were all images of actual answers. 

In [2]:
## Filter for the images of solutions
### There is too much overlap in image widths for a simple filter on width to work,
### so filter on the ratio of width:height > 2 instead

# Create empty list of answers
answers = []

# Filter the images by ratio of width-to-height.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# contents and order of `answers` reproducible between runs.
for file in sorted(os.listdir(image_directory)):
    file_path = os.path.join(image_directory, file)
    # Skip sub-directories: they cannot be read as images and would
    # otherwise abort the whole scan.
    if not os.path.isfile(file_path):
        continue
    image = io.imread(file_path)
    # shape is (rows, columns[, channels]) -> height first, then width
    height, width = image.shape[:2]
    ratio = width / height
    if ratio > 2:
        answers.append(file)

## Function for looping through images to extract categories and answers into a nested dictionary

Initial testing of text extraction suggested that grayscale conversion and thresholding were necessary to capture all the categories. This is likely due to the 4 different background colors of the 4 categories (i.e. purple, blue, green, yellow).

Based on the planned text analysis, I elected to build a nested dictionary as follows:
- key for each date (e.g. "2023-06-18")
- value that is a sub-dictionary
    * the keys of the sub-dictionary are the categories of answers for each date (e.g. "municipalities")
    * the values of the sub-dictionary are lists containing the answers (e.g. ["city", "county", "town", "village"])

In [8]:
# Containers filled by the processing loop:
#   result_dict -> parsed categories/answers keyed by date
#   error_list  -> names of images that produced no usable result
result_dict, error_list = {}, []

# Point the pytesseract module at the local tesseract.exe
pytesseract.tesseract_cmd = tesseract_directory

# Helper: turn raw OCR text into {category: [answers]}
def _parse_ocr_text(text):
    """Split OCR output into a ``{category: [answer, ...]}`` dictionary.

    A line made up only of upper-case letters and whitespace starts a new
    category. Any other non-empty line that follows a category is split on
    commas and its pieces are added to that category's answers. Everything
    is lower-cased; lines seen before the first category are ignored.
    """
    parsed_data = {}
    current_category = None
    for line in text.split("\n"):
        line = line.strip()
        if not line:  # Skip empty lines
            continue
        if re.match(r"^[A-Z\s]+$", line):  # Matches uppercase letters and spaces only
            current_category = line.lower()  # Convert to lowercase
            parsed_data[current_category] = []
        elif current_category:
            words = [word.strip().lower() for word in line.split(",")]
            # A trailing or doubled comma yields empty strings; those are
            # not answers, so drop them.
            parsed_data[current_category].extend(word for word in words if word)
    return parsed_data


# Function to process each image
def process_image(image_name):
    """Run OCR on one solution image and parse its categories and answers.

    The image is loaded from ``image_directory``, converted to grayscale
    (a copy is saved in ``connections_gray_dir``), binarised with a fixed
    threshold and handed to Tesseract. The recognised text is then parsed
    into categories and answers.

    Parameters
    ----------
    image_name : str
        File name of the image inside ``image_directory``.

    Returns
    -------
    dict or None
        ``{category: [answer, ...]}`` with all text lower-cased. The dict is
        empty when no category line was recognised. ``None`` is returned
        when any step raises; the error message is printed.
    """
    try:
        # Step 1: Load the image
        path_temp = os.path.join(image_directory, image_name)

        # Step 2: Convert to grayscale and save a copy.
        # The output folder is created on demand so the save cannot fail
        # just because the folder is missing; the context manager closes
        # the source file handle once the grayscale copy is written.
        os.makedirs(connections_gray_dir, exist_ok=True)
        gray_image_path = os.path.join(connections_gray_dir, f"{image_name}_gray.png")
        with Image.open(path_temp) as temp_image:
            temp_image.convert('L').save(gray_image_path)

        # Step 3: Load the grayscale image with OpenCV and apply thresholding
        temp_image2 = cv2.imread(gray_image_path, cv2.IMREAD_GRAYSCALE)
        if temp_image2 is None:
            # cv2.imread reports failure by returning None instead of raising
            raise OSError(f"OpenCV could not read {gray_image_path}")
        _, temp_image2Thres = cv2.threshold(temp_image2, 50, 255, cv2.THRESH_BINARY)

        # Step 4: Extract text using PyTesseract with thresholded image
        tess_settings = '--psm 1 --oem 1'
        text_tempThresh = pytesseract.image_to_string(temp_image2Thres, config=tess_settings)

        # Step 5: Parse the extracted text into categories and words
        return _parse_ocr_text(text_tempThresh)

    except Exception as e:
        # Best effort: report the failure and let the caller record the image
        print(f"Error processing {image_name}: {e}")
        return None

# Run OCR over every solution image and file the outcome
for image_name in answers:
    # File names start with the date, followed by an underscore
    date = image_name.partition("_")[0]
    parsed_data = process_image(image_name)
    if not parsed_data:
        # None (processing failed) or an empty dict (nothing recognised)
        error_list.append(image_name)
        continue
    result_dict[date] = parsed_data

# Show what was collected and which images need a manual look
print("Result Dictionary:", result_dict, sep="\n")
print("\nError List:", error_list, sep="\n")

Result Dictionary:

Error List:
['2023-10-25_3.png', '2024-01-13_3.png', '2024-05-17_2.png', '2024-06-24_3.png', '2024-07-08_3.png', '2024-08-19_3.png', '2024-08-24_3.png', '2024-10-16_3.png']


### Save result_dict as JSON to a text file for later analysis

In [None]:

# Serialise the nested dictionary as JSON text into the working directory
with open('result_dictionary.txt', mode='w') as convert_file:
    print(json.dumps(result_dict), end='', file=convert_file)