<a href="https://colab.research.google.com/github/IshuSinghSE/notebook/blob/master/Bloomsplash_Content_Processor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Install the notebook's dependencies.
# transformers and numpy are pinned to exact versions; torch, pillow, pandas,
# google-generativeai, jax and jaxlib are left unpinned.
# NOTE(review): no --force-reinstall flag is passed, so pip only replaces a package
# when the installed version does not satisfy the requirement listed here.
!pip install transformers==4.28.0 torch pillow pandas google-generativeai numpy==1.23.5 jax jaxlib

Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl.metadata (109 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/110.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.0/110.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.

In [1]:
# --- Step 0: Install Dependencies ---
# Only transformers is pinned here; every other package is unpinned.
# NOTE(review): this cell overlaps with the earlier install cell (which additionally
# pins numpy and installs jax/jaxlib) — confirm whether both cells are still needed.
!pip install transformers==4.28.0 torch pillow pandas google-generativeai



In [4]:
from google.colab import drive
import time

# Force re-mount to ensure file system is synchronized.
# The unmount runs first so that the mount() call below starts from a clean state;
# the "Unmounted" message is printed unconditionally after the call returns.
drive.flush_and_unmount()
print("Unmounted Google Drive.")
# Mount Drive at /content/drive — the workflow's base path
# ("/content/drive/My Drive/bloomsplash") lives under this mount point.
drive.mount('/content/drive')
print("Remounted Google Drive.")
# Add a short delay to allow the file system to stabilize
time.sleep(5)

Unmounted Google Drive.
Mounted at /content/drive
Remounted Google Drive.


In [7]:
# --- All Imports ---
import os
import shutil
from datetime import datetime
import pandas as pd
from PIL import Image
import google.generativeai as genai
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
import time
import re
from io import BytesIO
from google.colab import userdata

# --- Main Functions ---

def setup_paths_and_dirs(base_path):
    """Build the workflow's path table and make sure its working folders exist.

    Args:
        base_path: Root folder that holds the image folders and the CSV files.

    Returns:
        dict: Maps 'base', 'new', 'complete' and 'backups' to folder paths and
        'content_csv', 'enrich_csv' and 'master_enrich_csv' to file paths, all
        located directly under ``base_path``. Only the three folders are
        created; the CSV files are not touched.
    """
    def under_base(leaf):
        return os.path.join(base_path, leaf)

    paths = {
        'base': base_path,
        'new': under_base('new'),
        'complete': under_base('complete'),
        'backups': under_base('backups'),
        'content_csv': under_base('content.csv'),
        'enrich_csv': under_base('enrich.csv'),
        'master_enrich_csv': under_base('master_enrich.csv'),
    }

    # Create whichever working folder is missing and report each creation.
    for folder_key in ('new', 'complete', 'backups'):
        folder = paths[folder_key]
        if os.path.exists(folder):
            continue
        os.makedirs(folder)
        print(f"Created directory: {folder}")

    return paths

def backup_data(csv_path, backups_dir):
    """Copy a non-empty CSV file into the backups folder under a timestamped name.

    The copy is named ``<stem>_<YYYY-MM-DD_HH-MM-SS>.csv``. Missing or
    zero-byte files are skipped. Any error raised while copying is reported
    on stdout instead of being propagated.

    Args:
        csv_path: Path of the CSV file to back up.
        backups_dir: Folder that receives the backup copy.
    """
    csv_name = os.path.basename(csv_path)

    has_content = os.path.exists(csv_path) and os.path.getsize(csv_path) > 0
    if not has_content:
        print(f"File '{csv_name}' does not exist or is empty. No backup needed.")
        return

    print(f"File '{csv_name}' found and is not empty. Proceeding with backup.")
    try:
        stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        stem, _ = os.path.splitext(csv_name)
        target = os.path.join(backups_dir, f"{stem}_{stamp}.csv")
        shutil.copy(csv_path, target)
        print(f"Backup of '{csv_name}' created at: {target}")
    except Exception as err:
        # Best effort: a failed backup is reported but does not stop the workflow.
        print(f"Could not create backup for {csv_name}. Error: {err}")


def initialize_master_csv(master_csv_path):
    """Make sure the master CSV exists and starts with the expected header row.

    A header-only file is written when the master CSV is missing *or* is a
    zero-byte file. A zero-byte file cannot be parsed by ``pd.read_csv`` (it
    raises ``EmptyDataError``), so leaving it in place would break any later
    step that reads the master file. A file that already has content is left
    untouched.

    Args:
        master_csv_path: Path of the master enrichment CSV.
    """
    master_name = os.path.basename(master_csv_path)
    file_exists = os.path.exists(master_csv_path)

    if file_exists and os.path.getsize(master_csv_path) > 0:
        print(f"'{master_name}' already exists. Preserving existing file.")
        return

    if file_exists:
        # Zero-byte file: nothing to preserve, and it is unreadable as CSV.
        print(f"'{master_name}' exists but is empty. Writing headers.")
    else:
        print(f"'{master_name}' not found. Creating it.")

    header_only = pd.DataFrame(columns=['filename', 'caption', 'title', 'description', 'tags', 'category'])
    header_only.to_csv(master_csv_path, index=False)
    print(f"Created 'master_enrich.csv' at: {master_csv_path}")


def clear_temp_csvs(content_csv_path, enrich_csv_path):
    """Reset both per-batch CSV files so that each holds only its header row.

    Args:
        content_csv_path: Caption CSV; rewritten with columns
            filename, caption.
        enrich_csv_path: Enrichment CSV; rewritten with columns
            filename, caption, title, description, tags, category.
    """
    print("\n--- Clearing temporary CSV files for the new batch... ---")

    resets = (
        (content_csv_path, ['filename', 'caption']),
        (enrich_csv_path, ['filename', 'caption', 'title', 'description', 'tags', 'category']),
    )
    for target_path, header in resets:
        # Writing a frame with no rows overwrites the file with just the header.
        empty_frame = pd.DataFrame(columns=header)
        empty_frame.to_csv(target_path, index=False)
        print(f"Cleared '{os.path.basename(target_path)}'")


def _extract_field(block, label, default="N/A"):
    """Return the text that follows ``<label>:`` in a response block.

    The value runs up to the next newline or the end of the block, so a field
    on the last line (no trailing newline) is still captured and a field never
    swallows the lines that come after it.
    """
    match = re.search(rf"{re.escape(label)}:\s*(.*?)(?:\n|\Z)", block, re.I | re.S)
    return match.group(1).strip() if match else default


def _parse_enrichment_response(generated_text, caption_by_stem):
    """Turn the model's plain-text answer into a list of enrichment records.

    Blocks without a ``Filename:`` line, or naming a file that is not part of
    the current batch, are ignored.
    """
    records = []
    for block in re.split(r'\n(?=Filename:)', generated_text.strip()):
        if not block.strip():
            continue
        filename = _extract_field(block, "Filename", default=None)
        if filename is None or filename not in caption_by_stem:
            continue
        records.append({
            'filename': filename,
            'caption': caption_by_stem[filename],
            'title': _extract_field(block, "Title"),
            'description': _extract_field(block, "Description"),
            'tags': _extract_field(block, "Tags"),
            'category': _extract_field(block, "Category"),
        })
    return records


def process_images(paths, api_key, batch_size=100, rate_limit_delay=20):
    """Caption, enrich and archive every image in the 'new' folder that is not yet in the master CSV.

    Steps: find unprocessed images, caption them with BLIP, ask Gemini for
    title/description/tags/category in batches, append the results to the
    master CSV, then move the successfully enriched images to 'complete'.
    Images whose captioning or enrichment failed stay in 'new' so the next
    run retries them.

    Args:
        paths: Path table with the keys 'new', 'complete', 'content_csv',
            'enrich_csv' and 'master_enrich_csv'.
        api_key: Gemini API key passed to ``genai.configure``.
        batch_size: Number of captions sent to Gemini per request.
        rate_limit_delay: Seconds to wait between consecutive Gemini requests.

    Returns:
        None. Results are written to the CSV files and reported on stdout.
    """

    # --- 1. Identify New Images by comparing with the master CSV ---
    print("\n--- Identifying new images... ---")
    # Map each image stem to its real file name. Only image files are
    # considered, so a non-image sharing a stem (e.g. 'a.txt' next to 'a.jpg')
    # can never be picked up, and each stem is handled exactly once.
    stem_to_file = {}
    for name in sorted(os.listdir(paths['new'])):
        if not name.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue
        stem = os.path.splitext(name)[0]
        if stem in stem_to_file:
            print(f"  - Skipping '{name}': '{stem_to_file[stem]}' already uses the name '{stem}'.")
            continue
        stem_to_file[stem] = name
    image_files = sorted(stem_to_file)

    try:
        # Read filenames as text so stems such as '001' are not turned into 1.
        master_enriched_df = pd.read_csv(paths['master_enrich_csv'], dtype={'filename': str})
        processed_files = set(master_enriched_df['filename'].astype(str))
    except (FileNotFoundError, pd.errors.EmptyDataError):
        processed_files = set()
        print("master_enrich.csv is empty or not found.")

    files_to_process = [f for f in image_files if f not in processed_files]

    if not files_to_process:
        print("No new images to process. Workflow complete.")
        return

    print(f"Found {len(files_to_process)} new images to process: {files_to_process}")

    # --- 2. Clear temporary CSVs for the new batch ---
    clear_temp_csvs(paths['content_csv'], paths['enrich_csv'])

    # --- 3. Setup Models ---
    print("\n--- Setting up models... ---")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

    genai.configure(api_key=api_key)
    gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')
    print("--- Models are ready. ---")

    # --- 4. Generate BLIP Captions and save to temporary content.csv ---
    print("\n--- Generating BLIP captions... ---")
    new_captions = []
    for filename in files_to_process:
        original_filename = stem_to_file[filename]
        image_path = os.path.join(paths['new'], original_filename)
        try:
            # Close the file handle as soon as the pixels are loaded.
            with Image.open(image_path) as opened_image:
                raw_image = opened_image.convert("RGB")
            inputs = blip_processor(images=raw_image, return_tensors="pt").to(device)
            out = blip_model.generate(**inputs, max_new_tokens=50)
            caption = blip_processor.decode(out[0], skip_special_tokens=True)
            new_captions.append({'filename': filename, 'caption': caption})
            print(f"  - Generated caption for: {original_filename}")
        except Exception as e:
            print(f"  - Error processing {original_filename} with BLIP: {e}")

    if new_captions:
        new_captions_df = pd.DataFrame(new_captions)
        new_captions_df.to_csv(paths['content_csv'], index=False)
        print("Successfully saved new captions to temporary 'content.csv'.")

    # --- 5. Enrich with Gemini in Batches and save to temporary enrich.csv ---
    print("\n--- Enriching content with Gemini... ---")
    caption_by_stem = {item['filename']: item['caption'] for item in new_captions}

    newly_enriched_data = []
    batch_starts = range(0, len(new_captions), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch = new_captions[start:start + batch_size]
        print(f"--- Processing batch {batch_number}... ---")

        # Pause between requests (also after a failed one), never after the last.
        if batch_number > 1:
            time.sleep(rate_limit_delay)

        prompt_parts = [
            "Generate a professional, two-word title, a plain text description, a comma-separated list of tags, and a category for each of the following image captions.",
            "The output for each should be clearly structured with 'Filename:', 'Title:', 'Description:', 'Tags:', and 'Category:' headings, and must be in plain text without any markdown formatting."
        ]
        prompt_parts.extend(
            f"Filename: {item['filename']}\nCaption: {item['caption']}" for item in batch
        )
        prompt = "\n".join(prompt_parts)

        try:
            response = gemini_model.generate_content(prompt)
            generated_text = response.text
        except Exception as e:
            print(f"An error occurred with the Gemini API: {e}")
            continue

        newly_enriched_data.extend(_parse_enrichment_response(generated_text, caption_by_stem))

    if newly_enriched_data:
        new_enrich_df = pd.DataFrame(newly_enriched_data)
        new_enrich_df.to_csv(paths['enrich_csv'], index=False)
        print("Successfully saved newly enriched data to temporary 'enrich.csv'.")

        # --- 6. Append new data to the master CSV ---
        print("\n--- Appending new data to master_enrich.csv... ---")
        try:
            master_df = pd.read_csv(paths['master_enrich_csv'], dtype={'filename': str})
            updated_master_df = pd.concat([master_df, new_enrich_df], ignore_index=True)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No usable master yet: the new batch becomes the master.
            updated_master_df = new_enrich_df
        updated_master_df = updated_master_df.drop_duplicates(subset=['filename'], keep='last')
        updated_master_df.to_csv(paths['master_enrich_csv'], index=False)
        print("Successfully appended new data to 'master_enrich.csv'.")

    # --- 7. Move Processed Files ---
    # Only images that were recorded in the master CSV are archived; the rest
    # stay in 'new' so that they are picked up again on the next run.
    print("\n--- Moving processed files... ---")
    enriched_stems = {record['filename'] for record in newly_enriched_data}
    left_for_retry = 0
    for filename in files_to_process:
        original_filename = stem_to_file[filename]
        if filename not in enriched_stems:
            left_for_retry += 1
            print(f"  - Leaving {original_filename} in 'new': it was not enriched and will be retried on the next run.")
            continue

        source = os.path.join(paths['new'], original_filename)
        destination = os.path.join(paths['complete'], original_filename)
        try:
            shutil.move(source, destination)
            print(f"  - Moved {original_filename}")
        except OSError as e:
            print(f"  - Could not move {original_filename}: {e}")

    if left_for_retry:
        print(f"\n--- Workflow finished; {left_for_retry} image(s) left in 'new' for retry. ---")
    else:
        print("\n--- Workflow finished successfully! ---")


# --- Main Execution Block ---
if __name__ == "__main__":
    # Settings: the API key is read from Colab's secret store and the project
    # root lives on the mounted Google Drive.
    GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
    DRIVE_BASE_PATH = "/content/drive/My Drive/bloomsplash"

    # Resolve every path and create the working folders if needed.
    all_paths = setup_paths_and_dirs(DRIVE_BASE_PATH)

    # Look for images first so that nothing is backed up or processed when
    # the 'new' folder has no work to offer.
    files_in_new_folder = [
        entry
        for entry in os.listdir(all_paths['new'])
        if entry.lower().endswith(('.jpg', '.jpeg', '.png'))
    ]

    if files_in_new_folder:
        print(f"Found {len(files_in_new_folder)} image(s) in the 'new' folder. Starting workflow.")

        # Snapshot the three CSV files before this run rewrites them.
        backup_data(all_paths['content_csv'], all_paths['backups'])
        backup_data(all_paths['enrich_csv'], all_paths['backups'])
        backup_data(all_paths['master_enrich_csv'], all_paths['backups'])

        # Create the master CSV on first use; an existing one is kept as is.
        initialize_master_csv(all_paths['master_enrich_csv'])

        # Caption, enrich and archive the new images.
        process_images(all_paths, GEMINI_API_KEY)
    else:
        print("No image files found in the 'new' folder. Workflow will not run.")

No image files found in the 'new' folder. Workflow will not run.
