In [1]:
import os
from urllib.parse import quote_plus

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymongo
from dotenv import load_dotenv
from IPython.display import HTML
from tqdm import tqdm
from weasyprint import CSS
# Aliased so the renderer does not shadow (or get shadowed by) IPython.display.HTML.
from weasyprint import HTML as WeasyHTML

In [2]:
# Load variables from the .env file one directory above the working directory;
# the ATLAS_* settings read with os.getenv further down are expected to come from it.
load_dotenv("../.env")

True

In [None]:
# Connection settings for the Atlas cluster, taken from the environment.
atlas_user_prd = os.getenv("ATLAS_USER")
atlas_password_prd = os.getenv("ATLAS_PASSWORD")
atlas_url_prd = os.getenv("ATLAS_URL")
atlas_cluster_prd = os.getenv("ATLAS_CLUSTER")

# Fail with a clear message instead of building a URI that contains the text "None".
required_settings = {
    "ATLAS_USER": atlas_user_prd,
    "ATLAS_PASSWORD": atlas_password_prd,
    "ATLAS_URL": atlas_url_prd,
}
missing_settings = [name for name, value in required_settings.items() if not value]
if missing_settings:
    raise RuntimeError(f"Missing environment variables: {', '.join(missing_settings)}")

# User name and password must be percent-escaped: characters such as '@', ':' or '/'
# would otherwise be parsed as URI delimiters and corrupt the connection string.
atlas_url = (
    f"mongodb+srv://{quote_plus(atlas_user_prd)}:{quote_plus(atlas_password_prd)}"
    f"@{atlas_url_prd}/?retryWrites=true&w=majority&appName={atlas_cluster_prd}"
)
atlas_client = pymongo.MongoClient(atlas_url)

# Round-trip to the server so connection problems surface in this cell.
atlas_client.admin.command("ping")

In [None]:
# Handle to the "question-manager-v3" database; later cells read its collections via atlas_db[<name>].
atlas_db = atlas_client["question-manager-v3"]

In [None]:
def get_all_references(atlas_db, bu_name):
    """Return the distinct ``_id`` values selected for one business unit.

    Documents of the collection ``atlas_db[bu_name]`` are kept when their
    ``fromBusinessUnitInitials`` is ``bu_name`` or ``None``. For every
    business unit other than "ARCO", documents whose ``businessUnitInitials``
    is "ARCO" are additionally excluded.

    Args:
        atlas_db: Mapping-like database object; ``atlas_db[bu_name]`` must
            provide an ``aggregate(pipeline)`` method.
        bu_name: Collection name, also used as the business-unit filter value.

    Returns:
        list: De-duplicated ``_id`` values, in no particular order.
    """
    criteria = {}
    if bu_name != "ARCO":
        criteria["businessUnitInitials"] = {"$nin": ["ARCO"]}
    criteria["fromBusinessUnitInitials"] = {"$in": [bu_name, None]}

    pipeline = [
        {"$match": criteria},
        {"$group": {"_id": {"reference": "$_id"}}},
    ]

    unique_ids = {row["_id"]["reference"] for row in atlas_db[bu_name].aggregate(pipeline)}
    return list(unique_ids)


def pixels_should_be_conserved(pixels, threshold=0, max_black_ratio=0.999) -> bool:
    """Tell whether a line of pixels is worth keeping, i.e. not (almost) all black.

    Args:
        pixels: Array with at least two dimensions, one pixel per entry along
            axis 0 and its channel values along axis 1.
        threshold: A pixel counts as black when every channel is <= this value.
        max_black_ratio: Largest tolerated share of black pixels.

    Returns:
        False for an empty line or when the share of black pixels exceeds
        ``max_black_ratio``; True otherwise.
    """
    is_black = np.all(pixels <= threshold, axis=1)
    total = len(pixels)
    if total == 0:
        return False

    return is_black.sum() / total <= max_black_ratio
    
def edge_removal(image):
    """Drop every row and column that `pixels_should_be_conserved` rejects.

    Rows and columns are both judged on the untouched input image, so removing
    a row never changes the verdict for a column.

    Args:
        image: 3-D array indexed as (row, column, channel).

    Returns:
        The image restricted to the preserved rows and columns.
    """
    height, width, _channels = image.shape

    rows_to_keep = []
    for row_idx in range(height):
        if pixels_should_be_conserved(image[row_idx, :, :]):
            rows_to_keep.append(row_idx)

    cols_to_keep = []
    for col_idx in range(width):
        if pixels_should_be_conserved(image[:, col_idx, :]):
            cols_to_keep.append(col_idx)

    without_rows = image[rows_to_keep, :, :]
    return without_rows[:, cols_to_keep, :]

def cut_white(image):
    """Return the index of the lowest row that is not pure white.

    Only channel 0 is inspected: a row is considered white when every value of
    that channel equals 255. Callers can slice ``image[:index + margin]`` to
    trim the trailing white band while keeping some margin.

    Args:
        image: 3-D array indexed as (row, column, channel).

    Returns:
        int: Row index of the last non-white row, or -1 when no such row
        exists (all rows white, or the image is empty).
    """
    if image.size == 0:
        return -1

    # Smallest channel-0 value per row; anything below 255 means the row has content.
    row_minimums = image[:, :, 0].min(axis=1)
    non_white_rows = np.flatnonzero(row_minimums != 255)
    if non_white_rows.size == 0:
        return -1

    return int(non_white_rows[-1])


def print_question(reference):
    """Show a rendered question image inline.

    Args:
        reference: File stem of the PNG inside ``data/questions_to_image``.
            NOTE(review): the rendering cell of this notebook saves files as
            ``{schoolSubject}_{reference}.png``, so the subject prefix
            presumably has to be part of the value passed here — confirm.

    Returns:
        None. Nothing is displayed when the file is missing or unreadable.
    """
    image = cv2.imread(f"data/questions_to_image/{reference}.png")
    # cv2.imread reports failure by returning None. Testing the array's truth
    # value directly raises ValueError for any image with more than one element.
    if image is not None:
        # OpenCV loads channels as BGR while matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        plt.show()

    return None

In [None]:
# Fields requested from MongoDB for every question; "_id" is explicitly left out.
projected_fields = [
    "questionId",
    "reference",
    "businessUnitInitials",
    "visibility",
    "text",
    "options",
    "commentary",
    "resolution",
    "tip",
    "segment",
    "segmentValue",
    "schoolSubject",
    "schoolSubjectValue",
    "dimension",
    "dimensionValue",
    "subject",
    "subjectValue",
    "subSubject",
    "subSubjectValue",
    "examBoard",
    "examBoardValue",
    "difficultyLevel",
    "type",
]
projection = {"_id": 0, **dict.fromkeys(projected_fields, 1)}


# Download the selected documents in batches so each "$in" filter stays bounded.
mongo_data = []
for bu_name in ["SAS"]:
    print(f"Collection Name: {bu_name}")

    atlas_col = atlas_db[bu_name]
    bu_refs = get_all_references(atlas_db, bu_name)
    total_ids = len(bu_refs)
    chunk = 1000
    for start in tqdm(range(0, total_ids, chunk)):
        id_batch = bu_refs[start:start + chunk]
        mongo_data.extend(atlas_col.find({"_id": {"$in": id_batch}}, projection))

data = pd.DataFrame(mongo_data)

In [None]:
# Keep segment-4 questions of the three selected subjects whose statement contains an image tag.
is_stem = data["schoolSubjectValue"].isin(["Matemática","Química","Física"])
# na=False: a missing `text` would otherwise yield NaN in the mask, and boolean
# indexing with a mask that contains NaN raises ValueError.
has_image = data["text"].str.contains("img src", na=False)
in_segment = data["segment"] == 4

# Filter first so the per-row transformation below only touches rows that are kept.
stem_df = data[is_stem & has_image & in_segment].copy()
stem_df = stem_df.replace(np.nan, None)

# Reduce every option to its text; rows without options get None.
stem_df["options"] = stem_df["options"].apply(lambda options:[option["text"] for option in options] if options else None)
stem_df = stem_df.reset_index(drop=True)

In [None]:
# Two disjoint (by reference) random samples of 200 questions each, both seeded for reproducibility.
sample_columns = ["reference","text","options","schoolSubjectValue"]

sample_df = stem_df[sample_columns].sample(n=200, replace=False, random_state=0)

# The second sample is drawn only from questions whose reference is not in the first one.
already_sampled = stem_df["reference"].isin(sample_df["reference"])
sample_df2 = stem_df.loc[~already_sampled, sample_columns].sample(n=200, replace=False, random_state=0)

In [None]:
# Render every sampled question (statement plus lettered options) to a cropped PNG.

# cv2.imwrite does not create directories; without the folder it just returns False.
os.makedirs("data/questions_to_image", exist_ok=True)

# White page background; an optional extra rule would be: img { width: 500px;}
css = CSS(string='body {background: white;} ')
value_ans = ["A)","B)","C)","D)","E)","F)","G)","H)"]
for reference, text, options, schoolSubject in tqdm(sample_df.values):
    if options is not None and len(options) != 0:
        labelled_options = []
        for value, option in enumerate(options):
            # Insert the letter right after the option's first opening tag.
            p_split = option.split(">",1)
            if len(p_split) > 1:
                labelled_options.append(p_split[0] + f"> {value_ans[value]} " + p_split[1])
            else:
                labelled_options.append(option)
        html = text + "".join(labelled_options)
    else:
        html = text

    # WeasyPrint's HTML class is required here: IPython.display.HTML accepts no
    # `string=` argument and cannot write images.
    # NOTE(review): write_png is only available in WeasyPrint releases before 53 — confirm the pinned version.
    document = WeasyHTML(string=html)
    document.write_png("data/output.png", stylesheets=[css], resolution=128)

    question_image = cv2.imread("data/output.png", 1)
    question_image = edge_removal(question_image)
    # Trim the white band below the content, leaving a 20-row margin.
    idx_to_cut = cut_white(question_image)
    question_image = question_image[:idx_to_cut+20,:,:]

    cv2.imwrite(f"data/questions_to_image/{schoolSubject}_{reference}.png", question_image)