In [2]:
import markdown
import re
import pandas as pd

# Function to convert headings to scene types based on the provided structure
def section_to_scene_type(section):
    """Translate a numbered section heading into its scene-type code.

    Args:
        section: Heading text such as ``"1.3 Short Urban Scene descriptions"``;
            surrounding whitespace is ignored.

    Returns:
        ``'nat'`` for headings starting with 1.1 or 1.2, ``'urban'`` for 1.3,
        ``'flat'`` for 1.4, and ``None`` for anything else.
    """
    heading = section.strip()

    # Ordered (prefixes, scene code) pairs; the first matching entry wins.
    prefix_table = (
        (('1.1', '1.2'), 'nat'),   # Natural scenes
        (('1.3',), 'urban'),       # Urban scenes
        (('1.4',), 'flat'),        # Flat scenes
    )
    for prefixes, scene_code in prefix_table:
        if heading.startswith(prefixes):
            return scene_code
    return None

# Convert markdown to Excel based on section types
def convert_md_to_excel_with_outline(markdown_file, excel_file):
    """Turn a numbered markdown outline of image prompts into an Excel sheet.

    The markdown text is split on headings of the form ``1.<digit> <title>``.
    The outline this was written for is:

    * 1.1 Detailed Natural Scene descriptions
    * 1.2 Short Natural Scene descriptions
    * 1.3 Short Urban Scene descriptions
    * 1.4 Short Flat Scene descriptions

    Each heading is classified with ``section_to_scene_type``; sections it does
    not recognise are skipped. Inside a kept section every ``<number>. <text>``
    item is treated as one prompt, and one row is written per prompt per model.

    Args:
        markdown_file: Path of the markdown file to read (decoded as UTF-8).
        excel_file: Path of the Excel workbook to create or overwrite.

    Returns:
        None. The workbook is written to ``excel_file`` with the columns
        ``model``, ``file_name``, ``prompt`` and ``nat/flat``, and a
        confirmation line is printed.
    """
    # Decode explicitly as UTF-8: without it, open() falls back to the
    # platform's locale encoding, which can corrupt or reject non-ASCII prompts.
    with open(markdown_file, 'r', encoding='utf-8') as file:
        md_content = file.read()

    # Because the pattern has a capturing group, re.split keeps the headings:
    # [preamble, heading_1, body_1, heading_2, body_2, ...].
    sections = re.split(r'(1\.\d\s[^\n]+)', md_content)

    # Model name -> device code used as the file-name prefix.
    model_device_mapping = {
        'dalle2': 'G02',
        'dalle3': 'G03',
        'gpt3.5-turbo-0125': 'G04',
        'gpt4-turbo': 'G05'
    }

    rows = []
    # The counter runs across all sections, so prompt numbers stay unique
    # even when the scene type changes.
    prompt_counter = 1

    # Odd indices are headings, the following even index is that heading's body.
    for heading, body in zip(sections[1::2], sections[2::2]):
        scene_type = section_to_scene_type(heading.strip())
        if not scene_type:
            continue

        # NOTE(review): this pattern is not anchored to the start of a line, so
        # a non-item line containing "<digits>. <text>" is also picked up as a
        # prompt - confirm the markdown never has such lines.
        prompts = re.findall(r'\d+\.\s+(.+)', body)

        for prompt in prompts:
            for model_name, device_code in model_device_mapping.items():
                # File name format: GXX_I_<scene type>_NNNN
                file_name = f"{device_code}_I_{scene_type}_{prompt_counter:04d}"
                rows.append({
                    'model': model_name,
                    'file_name': file_name,
                    'prompt': prompt,
                    'nat/flat': scene_type
                })
            prompt_counter += 1

    # Build the frame once from the collected records and write it out.
    df = pd.DataFrame(rows)
    df.to_excel(excel_file, index=False)
    print(f"Excel file '{excel_file}' created successfully.")

# Input outline and output workbook; both paths are resolved against the
# current working directory.
markdown_file = '../datasets/image_descriptions.md'
excel_file = '../datasets/image_prompts.xlsx'

# Build the prompt workbook from the markdown outline.
convert_md_to_excel_with_outline(markdown_file=markdown_file, excel_file=excel_file)


Excel file '../datasets/image_prompts.xlsx' created successfully.


## Prompt reduction

In [1]:
import openai
import pandas as pd
import tiktoken  # Tokenizer library for counting tokens
from tqdm import tqdm
import os
from dotenv import load_dotenv


# Read variables from a .env file into the process environment
load_dotenv()

# Take the OpenAI API key from the OPENAI_API_KEY environment variable
# (None when the variable is not set)
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Tokenizer for the "gpt-4" model, shared by the token-counting helpers below;
# change the model name here if a different tokenizer is needed
tokenizer = tiktoken.encoding_for_model("gpt-4")

# Function to truncate prompt to 300 characters for DALL·E-2
def truncate_to_300(prompt):
    """Return at most the first 300 characters of ``prompt``.

    Prompts that are already 300 characters or shorter come back unchanged.
    """
    char_limit = 300
    shortened = prompt[:char_limit]
    return shortened

# Function to count tokens and truncate prompt if it exceeds the max token limit (3000 for DALL·E-3)
def truncate_to_3000_tokens(prompt, max_tokens=3000):
    """Cut ``prompt`` down to at most ``max_tokens`` tokens.

    Tokens are counted with the module-level ``tokenizer``.

    Args:
        prompt: Text to check.
        max_tokens: Maximum number of tokens to keep (default 3000).

    Returns:
        The original ``prompt`` when it fits within ``max_tokens``; otherwise
        the text decoded from its first ``max_tokens`` tokens.
    """
    token_ids = tokenizer.encode(prompt)

    # Within budget: hand back the untouched input rather than a re-decoded copy.
    if len(token_ids) <= max_tokens:
        return prompt

    return tokenizer.decode(token_ids[:max_tokens])

# Function to use GPT-4 to compress the prompt while retaining meaning
def compress_prompt_with_gpt4(prompt):
    """Ask the "gpt-4" chat model for a shorter version of ``prompt``.

    Args:
        prompt: The text to be shortened.

    Returns:
        The content of the first choice in the model's reply. If anything
        raises along the way, the error is printed and the original ``prompt``
        is returned unchanged (best-effort behaviour).
    """
    # NOTE(review): openai.ChatCompletion looks like the legacy (pre-1.0)
    # client interface - confirm it matches the installed openai version.
    try:
        chat_messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Please reduce the following prompt to 3000 tokens while keeping its meaning: {prompt}"}
        ]
        completion = openai.ChatCompletion.create(
            model="gpt-4",
            messages=chat_messages
        )
        first_choice = completion['choices'][0]
        return first_choice['message']['content']
    except Exception as err:
        print(f"Error using GPT-4 for prompt reduction: {err}")
        # Fall back to the unmodified prompt so the caller can still proceed.
        return prompt

# Main function to process the Excel file, reduce prompt sizes, and save in new columns
def update_excel_with_reduced_prompts(excel_file):
    """Add shortened prompt columns to the workbook and save it in place.

    Two columns are added (or overwritten) next to the existing ``prompt``
    column:

    * ``prompt_300`` - the prompt cut to its first 300 characters.
    * ``prompt_3k`` - the prompt itself when it is at most 3000 tokens long;
      otherwise the GPT-4 compressed version, hard-truncated to 3000 tokens.

    The reduced text is computed once per distinct prompt, so rows that share
    the same prompt always receive the same ``prompt_3k`` value and GPT-4 is
    called at most once per distinct prompt.

    Args:
        excel_file: Path of the workbook to read and then overwrite. It must
            contain a ``prompt`` column.

    Returns:
        None. The updated sheet is written back to ``excel_file`` and a
        confirmation line is printed.
    """
    df = pd.read_excel(excel_file)

    prompt_300 = []
    prompt_3k = []

    # Distinct prompt text -> its reduced form. Without this, repeated prompts
    # would be re-tokenized and re-sent to GPT-4, which costs extra calls and
    # can return different text for the same input.
    reduced_by_prompt = {}

    for prompt in tqdm(df['prompt'], desc="Processing prompts"):
        # Character-based cut for the 300-character column
        prompt_300.append(truncate_to_300(prompt))

        if prompt not in reduced_by_prompt:
            if len(tokenizer.encode(prompt)) > 3000:
                # Too long: compress first, then truncate in case the
                # compressed text (or the error fallback) is still over budget.
                compressed_prompt = compress_prompt_with_gpt4(prompt)
                reduced_by_prompt[prompt] = truncate_to_3000_tokens(compressed_prompt)
            else:
                # Already within 3000 tokens, so truncation would be a no-op.
                reduced_by_prompt[prompt] = prompt
        prompt_3k.append(reduced_by_prompt[prompt])

    # Attach the new columns and overwrite the original workbook.
    df['prompt_300'] = prompt_300
    df['prompt_3k'] = prompt_3k

    df.to_excel(excel_file, index=False)
    print(f"Updated prompts saved to {excel_file}")

# Workbook produced by the conversion step; resolved against the current
# working directory.
excel_file = '../datasets/image_prompts.xlsx'

# Add the reduced prompt columns and write the workbook back in place.
update_excel_with_reduced_prompts(excel_file=excel_file)


Processing prompts: 100%|██████████| 396/396 [00:00<00:00, 20842.31it/s]

Updated prompts saved to ../datasets/image_prompts.xlsx



