In [7]:
import os
import ast
import shutil
import pandas as pd
from tqdm import tqdm
from utils import BUCKET
from pathlib import Path
import concurrent.futures
import matplotlib.pyplot as plt

# Root folder that gets crawled. The functions below build paths as
# SRC_DIR / <category> / <gallery> / 'gallery_info.txt'.
SRC_DIR = Path('/Volumes/external_drive')

In [8]:
def parse_gallery_info(filepath: Path):
    """Build a metadata dict for the gallery that owns ``filepath``.

    Args:
        filepath: Path to a gallery's info text file. The gallery folder is
            ``filepath.parent`` and the category folder is its parent.

    Returns:
        A dict with:
          * ``gallery_category`` - name of the category folder,
          * ``gallery_name`` - name of the gallery folder,
          * ``filenames`` - entries of the gallery folder, excluding hidden
            entries (leading "."), ``.txt`` files and the ``cropped`` entry,
          * one lower-cased key per ``key: value`` line of the info file, with
            the substring " Pics" removed from the value.

    Raises:
        OSError: if the gallery folder cannot be listed or the file cannot be
            read.
    """
    gallery_dir = filepath.parent

    filenames = [
        entry
        for entry in os.listdir(gallery_dir)
        if not entry.startswith(".")
        and not entry.endswith(".txt")
        and entry != 'cropped'
    ]

    # Derive the names from the path's own parents rather than from fixed
    # positions in the split path, so the result does not depend on how deep
    # the root folder sits in the filesystem.
    info = {
        'gallery_category': gallery_dir.parent.name,
        'gallery_name': gallery_dir.name,
        'filenames': filenames,
    }

    with open(filepath, 'r') as file:
        for line in file:
            if not line.strip():
                # Blank lines would otherwise produce a meaningless '' key.
                continue

            # Split on the FIRST colon only, so values that contain colons
            # themselves (URLs, times) are kept whole.
            key, separator, value = line.partition(":")
            if not separator:
                # No colon at all: keep the historical behaviour of using the
                # whole line as both key and value.
                value = line

            info[key.strip().lower()] = value.strip().replace(" Pics", "")

    return info

In [9]:
def process_gallery(category_name: str, gallery: str):
    """Parse one gallery's ``gallery_info.txt`` if the file exists.

    Args:
        category_name: Name of the category folder under ``SRC_DIR``.
        gallery: Name of the gallery folder inside that category.

    Returns:
        The dict produced by ``parse_gallery_info``, or ``None`` when the
        gallery has no ``gallery_info.txt``.
    """
    info_file = SRC_DIR.joinpath(category_name, gallery, 'gallery_info.txt')
    if not info_file.exists():
        return None
    return parse_gallery_info(info_file)

In [10]:
def process_category(category_name: str):
    """Collect gallery info dicts for every gallery folder of one category.

    Each sub-directory of ``SRC_DIR / category_name`` is handed to
    ``process_gallery`` on a 16-worker thread pool; results are gathered in
    completion order behind a tqdm progress bar.

    Args:
        category_name: Name of the category folder under ``SRC_DIR``.

    Returns:
        A list of the truthy results returned by ``process_gallery``.
    """
    category_dir = SRC_DIR / category_name

    gallery_names = []
    for entry in os.listdir(category_dir):
        if (category_dir / entry).is_dir():
            gallery_names.append(entry)

    collected = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as pool:
        pending = [
            pool.submit(process_gallery, category_name, name)
            for name in gallery_names
        ]
        progress = tqdm(
            concurrent.futures.as_completed(pending),
            total=len(pending),
            desc=f"Processing {category_name}",
            unit="gallery",
        )
        for finished in progress:
            parsed = finished.result()
            # Galleries without an info file yield None and are dropped here.
            if parsed:
                collected.append(parsed)

    return collected

In [11]:
def crawl_images_folder():
    """Crawl every category folder under ``SRC_DIR`` and gather gallery infos.

    Non-hidden sub-directories of ``SRC_DIR`` are treated as categories and
    each one is processed by ``process_category`` on a thread pool.

    Returns:
        A flat list with the gallery info dicts of all categories, in the
        order the category tasks complete.
    """
    category_names = []
    for entry in os.listdir(SRC_DIR):
        if entry.startswith("."):
            continue
        if (SRC_DIR / entry).is_dir():
            category_names.append(entry)

    gathered = []
    with concurrent.futures.ThreadPoolExecutor() as pool:
        pending = [pool.submit(process_category, name) for name in category_names]
        progress = tqdm(
            concurrent.futures.as_completed(pending),
            total=len(pending),
            desc="Processing categories",
            unit="category",
        )
        for finished in progress:
            gathered.extend(finished.result())

    return gathered

In [None]:
# Build `df` (one row per gallery) and `_to_delete` (duplicate galleries).
#
# FORCE_CRAWL replaces the unconditional `raise FileNotFoundError` that used
# to sit at the top of a try block: it kept the cached-CSV branch unreachable.
# Leave it True to always re-crawl SRC_DIR (the previous effective behaviour);
# set it to False to reuse the cached CSV when it exists.
FORCE_CRAWL = True
GALLERIES_CSV = Path('datasets/galleries_dataset.csv')

if not FORCE_CRAWL and GALLERIES_CSV.exists():
    df = pd.read_csv(GALLERIES_CSV)

    # Drop rows with missing labels first, then turn the stringified lists
    # back into Python lists. Only after parsing can "is the list empty?" be
    # answered: comparing the raw CSV string with [] is always "different".
    df = df[df['categories'].notnull() & df['categories_suggestions'].notnull()]
    for list_column in ('filenames', 'categories', 'categories_suggestions'):
        df[list_column] = df[list_column].apply(ast.literal_eval)
    df = df[df['categories'].apply(lambda x: x != [])]
    df = df[df['categories_suggestions'].apply(lambda x: x != [])]

    # Nothing is flagged for deletion on this path, but the name must exist
    # because the cells below read it.
    _to_delete = df.iloc[0:0]
else:
    all_galleries_info = crawl_images_folder()
    crawled = pd.DataFrame(all_galleries_info).drop(columns=['stats', 'tags list'])

    # Keep the first occurrence of each gallery name; every later occurrence
    # is a duplicate that the deletion cell removes from disk.
    non_dupes = crawled.drop_duplicates(subset=['gallery_name'])
    _to_delete = crawled[~crawled.index.isin(non_dupes.index)]

    # Continue with the de-duplicated frame only, so later cells never try to
    # open images of galleries whose folders have just been deleted.
    df = non_dupes

df.shape, _to_delete.shape

In [None]:
# Remove every gallery listed in `_to_delete`: first the local folder, then
# the matching object in BUCKET. Rows whose local folder is already gone are
# skipped entirely (including the bucket step).
if len(_to_delete) > 0:
    deletion_progress = tqdm(
        _to_delete.iterrows(),
        total=len(_to_delete),
        desc="Deleting galleries",
        unit="gallery",
    )
    for _, duplicate in deletion_progress:
        category = duplicate['gallery_category']
        gallery_name = duplicate['gallery_name']
        gallery_path = SRC_DIR / category / gallery_name

        if gallery_path.exists():
            print(f"Deleting {gallery_path}")
            shutil.rmtree(gallery_path)
            print("Deleted!")

            # NOTE(review): this looks up one object named exactly
            # "pics/<category>/<gallery>". If BUCKET stores one object per
            # image under that prefix, nothing will match here - confirm.
            remote_blob = BUCKET.blob(f"pics/{category}/{gallery_name}")
            if remote_blob.exists():
                remote_blob.delete()

# High-resolution filtering DataFrame

In [None]:
from PIL import Image


def process_filenames_to_mini_df(row):
    """Expand one gallery row into one row per readable image.

    Every filename in ``row['filenames']`` is opened with PIL on a thread
    pool. For each image that opens, the output holds a copy of the gallery
    row plus ``filename``, ``width`` and ``height`` columns. Images that
    cannot be opened are reported and removed from disk, and produce no row.

    Args:
        row: A gallery record (e.g. a ``pandas.Series``) providing
            ``gallery_category``, ``gallery_name`` and ``filenames`` and
            supporting ``to_dict()``.

    Returns:
        A ``pandas.DataFrame`` with one row per readable image, in the same
        order as ``row['filenames']``. Empty when no image could be read.
    """
    category = row['gallery_category']
    gallery_name = row['gallery_name']
    filenames = row['filenames']

    def _process_image(category: str, gallery_name: str, filename: str):
        """Return the record for one image, or None if it is unreadable."""
        file_path = os.path.join(SRC_DIR, category, gallery_name, filename)
        try:
            with Image.open(file_path) as img:
                # PIL reports size as (width, height), in that order.
                width, height = img.size
            new_row = row.to_dict()
            new_row['filename'] = filename
            new_row['width'] = width
            new_row['height'] = height
            return new_row
        except Exception as e:
            print(f"Error processing {file_path}. \nError {e}")
            # Unreadable files are purged from disk. The removal can itself
            # fail (file already gone, or the entry is a directory); that
            # must not abort the whole run.
            try:
                os.remove(file_path)
            except OSError as remove_error:
                print(f"Could not remove {file_path}. \nError {remove_error}")
            return None

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(_process_image, category, gallery_name, filename)
            for filename in filenames
        ]
        # Read results in submission order so the output is deterministic
        # rather than dependent on which thread finishes first.
        records = [future.result() for future in futures]

    return pd.DataFrame([record for record in records if record is not None])

In [None]:
# Display the gallery row at position 1 of `df`.
df.iloc[1]

In [None]:
# Run the per-image expansion on the row at position 1 and display the result.
process_filenames_to_mini_df(df.iloc[1])

In [None]:
# NOTE(review): `futures` is not read by any cell visible in this notebook.
futures = []

# One per-image DataFrame for every gallery row of `df`.
mini_dfs = [
    process_filenames_to_mini_df(gallery_row)
    for _, gallery_row in tqdm(
        df.iterrows(),
        total=len(df),
        desc="Processing galleries",
        unit="gallery",
    )
]

In [None]:
# Stack the per-gallery frames, compute the pixel count of each image and
# keep only images with more than 500 * 500 pixels.
enhanced_df = pd.concat(mini_dfs).assign(
    resolution=lambda images: images['width'] * images['height']
)
is_high_res = enhanced_df['resolution'] > 500 * 500
enhanced_df = enhanced_df[is_high_res]
enhanced_df.shape

In [None]:
# Scatter plot of image width against height for the retained images.
fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(
    enhanced_df['width'],
    enhanced_df['height'],
    alpha=0.5,
    edgecolors='w',
    linewidth=0.5,
)
ax.set_title('Distribution of Image Sizes')
ax.set_xlabel('Width')
ax.set_ylabel('Height')
ax.grid(True)
plt.show()

In [None]:
# enhanced_df.drop(['filenames', 'channel', 'tags', 'resolution'], axis=1, inplace=True)

# Keep only rows where both label columns are present and are not an empty
# list. The filters are applied one after the other.
for label_column in ('categories', 'categories_suggestions'):
    has_labels = (
        enhanced_df[label_column].notnull()
        & enhanced_df[label_column].apply(lambda x: x != [])
    )
    enhanced_df = enhanced_df[has_labels]
enhanced_df.shape


In [None]:
# Write the filtered per-image dataset to CSV, without the index column.
enhanced_df.to_csv('datasets/images_high_res_dataset.csv', index=False)