# Filter images

Flag the files in the metadata dataset that should be excluded from the datasets built from it.

## Config

In [29]:
import logging
import os
from collections import Counter
from dataclasses import dataclass, field
from typing import Set, Dict, Callable, Any
from urllib.parse import quote

import numpy as np
import pandas as pd
import pytesseract
import sqlalchemy as sqa

# Notebook-wide logging: getLogger() with no name returns the root logger.
logger = logging.getLogger()
# NOTE(review): the root logger has no parent, so disabling propagation is
# presumably a no-op here; kept as in the original.
logger.propagate = False
# Emit INFO and above.
logger.setLevel(logging.INFO)

In [24]:
# Database parameters
# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the notebook. The fallbacks
# are the original local-development values.
db_container = os.environ.get("METADATA_DB_HOST", "metadata_db")
db_user = os.environ.get("METADATA_DB_USER", "pguser")
db_password = os.environ.get("METADATA_DB_PASSWORD", "pgpassword")
db_port = int(os.environ.get("METADATA_DB_PORT", "5432"))  # kept as an int, as before
db_database = os.environ.get("METADATA_DB_NAME", "metadata")
db_prefix = "postgresql"       # SQLAlchemy dialect prefix of the connection URL
metadata_tbl = "base_images"   # table holding one row of metadata per image

In [25]:
# Build the connection URL. The user name and password are percent-encoded
# (safe='' also encodes "/") so that characters such as "@", ":" or "/" in a
# credential cannot be mistaken for URL delimiters. For purely alphanumeric
# credentials the resulting string is unchanged.
db_con_str = (
    f"{db_prefix}://{quote(db_user, safe='')}:{quote(db_password, safe='')}"
    f"@{db_container}:{db_port}/{db_database}"
)
db_engine = sqa.create_engine(db_con_str)

In [26]:
# Read the whole metadata table into a DataFrame indexed by image name.
with db_engine.connect() as con:
    df_img = pd.read_sql_table(
        table_name=metadata_tbl,
        con=con,
        index_col="image_name",
    )

# Show the first rows as the cell output.
df_img.head()

Unnamed: 0_level_0,file_name,download_loc,final_loc,full_path,search_term,source,read,orig_width,orig_height,width,height,label,label_str,download_name,filtered,filter_reason
image_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
poison_ivy_plant_1095,poison_ivy_plant_1095.jpg,simple_images/poison ivy plant/poison ivy plan...,../datasets/pipeline_v1/downloaded_images/posi...,/home/code/datasets/pipeline_v1/downloaded_ima...,poison ivy plant,Google Images,True,600,900,500,750,1,positive,poison ivy plant_1095.jpg,False,
poison_ivy_plant_1556,poison_ivy_plant_1556.jpg,simple_images/poison ivy plant/poison ivy plan...,../datasets/pipeline_v1/downloaded_images/posi...,/home/code/datasets/pipeline_v1/downloaded_ima...,poison ivy plant,Google Images,True,713,376,713,376,1,positive,poison ivy plant_1556.jpg,False,
poison_ivy_plant_1294,poison_ivy_plant_1294.jpg,simple_images/poison ivy plant/poison ivy plan...,../datasets/pipeline_v1/downloaded_images/posi...,/home/code/datasets/pipeline_v1/downloaded_ima...,poison ivy plant,Google Images,True,600,450,600,450,1,positive,poison ivy plant_1294.jpg,False,
poison_ivy_plant_964,poison_ivy_plant_964.jpg,simple_images/poison ivy plant/poison ivy plan...,../datasets/pipeline_v1/downloaded_images/posi...,/home/code/datasets/pipeline_v1/downloaded_ima...,poison ivy plant,Google Images,True,750,450,750,450,1,positive,poison ivy plant_964.jpg,False,
poison_ivy_plant_1526,poison_ivy_plant_1526.jpg,simple_images/poison ivy plant/poison ivy plan...,../datasets/pipeline_v1/downloaded_images/posi...,/home/code/datasets/pipeline_v1/downloaded_ima...,poison ivy plant,Google Images,True,822,462,822,462,1,positive,poison ivy plant_1526.jpg,False,


## Define filters

In [27]:
@dataclass
class Image_Filter:
    """
    Description of one image filter: a named pass/fail function, the set of
    images it applies to, and extra keyword parameters for that function.
    Instances are mutable (the dataclass is not frozen).
    """
    # Display name of the filter
    name: str
    # Function which takes an image (as an ndarray) and returns whether it passes the filter or not
    filter_fn: Callable[..., bool]
    # Names of the images the filter should be applied to
    applicable_img_names: Set[str] = field(default_factory=set)
    # Extra keyword arguments for filter_fn (see the disabled sketch below)
    filter_params: Dict[str, Any] = field(default_factory=dict)

    # Disabled sketch of how a filter would be invoked:
    # def apply_filter(self, img: np.ndarray) -> bool:
    #     return(self.filter_fn(img, **self.filter_params))

In [28]:
# Define filter configs
# Names (index values) of all images whose source is Google Images
google_images = set(df_img.index[(df_img['source'] == 'Google Images').to_numpy()])


def has_text_filter_fn(img: np.ndarray) -> bool:
    """
    Returns True (passes filter) when the text pytesseract detects in the image
    (English language model) is the empty string after stripping whitespace,
    i.e. when no text was found.
    """
    detected_text = pytesseract.image_to_string(img, lang='eng')
    stripped_text = detected_text.strip()
    return stripped_text == ''

def max_color_appearence(img: np.ndarray) -> float:
    """
    Return the fraction of pixels which make up the most common color.

    Detects if there is one color appearing too much (i.e. if there is a solid
    background instead of a more natural looking photo).

    Args:
        img: Image array of shape (rows, cols, channels), or (rows, cols) for
            a single-channel image. A pixel's color is its whole channel
            vector, so two pixels match only if every channel matches.

    Returns:
        Number of pixels having the most common color divided by the total
        number of pixels, as a Python float in (0, 1].

    Raises:
        ValueError: If `img` is not 2-D or 3-D, or contains no pixels.
    """
    if img.ndim not in (2, 3):
        raise ValueError(
            f"expected a 2-D or 3-D image array, got {img.ndim} dimension(s)"
        )
    if img.size == 0:
        raise ValueError("cannot compute color frequencies of an empty image")

    n_pixels = img.shape[0] * img.shape[1]
    # One row per pixel; a 2-D image becomes a single-column array.
    pixels = img.reshape(n_pixels, -1)
    # Count identical rows (colors) in native code rather than hashing one
    # tuple per pixel in a Python-level loop.
    _, counts = np.unique(pixels, axis=0, return_counts=True)
    return float(counts.max() / n_pixels)
    
    
def max_color_appearence_filter_fn(img: np.ndarray, threshold_frac: float = 0.1) -> bool:
    """
    Returns True (passes filter) if the most common color makes up at most
    `threshold_frac` of the image pixels.
    A failure might be, for example, an image with a solid colored background.
    """
    dominant_color_frac = max_color_appearence(img)
    return dominant_color_frac <= threshold_frac



# Create filter objects
# Both filters are applied only to the images in `google_images`.
has_text_filter = Image_Filter(
    name='Contains text',
    filter_fn=has_text_filter_fn,
    applicable_img_names=google_images,
)

max_color_filter = Image_Filter(
    name='Max color appearence',
    filter_fn=max_color_appearence_filter_fn,
    applicable_img_names=google_images,
    filter_params={'threshold_frac': 0.1},
)
