**Setup**

In [None]:
from google.colab import drive
# Mount Google Drive; later cells read and write files under this path.
mount_point = '/content/drive'
drive.mount(mount_point)

In [None]:
!pip install datasets

# **NO NEED TO RUN AGAIN UNLESS WE WANT TO GENERATE ADDITIONAL 50K PROMPTS!!!**

In [None]:
from datasets import load_dataset

# Hub repository that holds the data, and the single .tar shard read from it
dataset_name = "jackyhate/text-to-image-2M"
target_tar_name = "data_000001.tar"

# hf:// location of that shard inside the repository
tar_file_path = f"hf://datasets/{dataset_name}/data_512_2M/{target_tar_name}"

# Open only that shard as the "train" split, in streaming mode
dataset = load_dataset(
    dataset_name,
    data_files={"train": [tar_file_path]},
    split="train",
    streaming=True,
)

# Function to process the dataset and extract prompts
def extract_prompts(dataset, num_samples=None):
    """Collect the caption prompts from an iterable of dataset examples.

    Args:
        dataset: Iterable of examples; each example must support
            ``example['json']['prompt']``.
        num_samples: Maximum number of prompts to collect. ``None`` (the
            default) reads every example. Zero or a negative value yields
            an empty list.

    Returns:
        list: The prompts, in iteration order.
    """
    from itertools import islice

    # islice stops after `limit` items without pulling an extra example from a streaming dataset
    limit = None if num_samples is None else max(num_samples, 0)
    return [example['json']['prompt'] for example in islice(dataset, limit)]

# Extract every prompt from the shard, then show how many there are and a
# small sample (printing the whole list would flood the cell output).
prompts = extract_prompts(dataset)
print(len(prompts))
print(prompts[:10])


In [None]:
import pandas as pd

# Destination workbook on the mounted Drive
file_path = "/content/drive/MyDrive/text-2-image-2M-prompts.xlsx"

# One-column table ("Prompt") built from the extracted prompt list
df = pd.DataFrame(data=prompts, columns=["Prompt"])

# Write the table without the row index
df.to_excel(excel_writer=file_path, index=False)

print(f"Excel file saved at: {file_path}")


# **Generate simplified prompts**

**OpenAI Setup**

In [None]:
!pip install openai

In [None]:
# Few-shot instruction template for caption simplification.
# The "{context}" placeholder is filled in with str.format(context=<caption>)
# before the text is passed to generate(). The template asks the model to
# answer "NO" for unsuitable captions and to wrap its answer in a fenced
# "Step 1: <simplified>" block; the cleaning step later in the notebook
# relies on that wrapper when it extracts the simplified caption.
prompt = """I have a caption and I want to make it slightly simpler (w.r.t background and colors) and specific, and turn it a caption that describes the same thing
If the caption is not appropriate to be converted, respond with “NO”. For example:
1. "A black smartphone case with a textured surface is attached to a black phone mount. The mount has a circular base with a screw hole and a rectangular bracket with two screws. The case is positioned at an angle, showing the top and side of the case." → "A smartphone case is attached to a phone mount"
2. "A collection of colorful, sparkly Easter eggs is displayed on a white surface. The eggs come in various colors such as pink, yellow, blue, green, and purple, and are adorned with glitter. They are arranged in a way that showcases their vibrant colors and sparkles." → "A collection of Easter eggs is displayed on a surface"
3. "A black and white drawing of a cartoon panda bear with a red bow tie and a red and white striped hat, holding a gift box with a bow on it. The panda bear is standing with its arms raised and a joyful expression." → "A panda bear with a bow tie and a striped hat, holding a gift box"
4. "A black and white photograph of a young boy with short hair, wearing a white shirt, leaning on a stone ledge with his hand on his chin. In the background, there is a large, ornate building with multiple spires and a dome. The image has a vintage feel, with a focus on the contrast between the boy's youthful appearance and the historical architecture" → "A young boy with short hair, wearing a shirt, leaning on a ledge"
5. "A cartoon character of Tom and Jerry is depicted with Tom on the left and Jerry on the right. Tom is a large grey cat with a wide smile and a red background behind him. Jerry is a small brown mouse with a wide smile and a red background behind him. Both characters have exaggerated features and are set against a red background with the words 'Tom and Jerry' written in white at the bottom" → "A cartoon character of Tom and Jerry"
6. "A dog rolling in the snow at sunset" → "A dog rolling in the snow"
7. "pink photo of Tokyo" → "buildings in Tokyo"
8. "Anti-fracking protest rocks NY governor's state of the state address" → "group of people protesting in front of a house"
9. "st peter's square: St Peters Square in Rome Italy" → "a square with the colosseum in the background"
10. "A Queen Elizabeth II Prince Andrew On The Balcony Of Buckingham Palace After Trooping The Colour Ceremony. 1962." → "A Queen on a balcony"
11. "Two little Chihuahua puppies for sale" → "Two little Chihuahua puppies"
12. "Evaluate and Adjust Your Safeguards Evaluate and adjust safeguards and practices in light of results of: System testing and monitoring. Material chang" → "NO"
13. "how-to-watch-the-kitten-bowl" → "A kitten drinking out of a bowl"
14. "Voters head to polls for municipal elections across Ontario" → "A person walking and there is a sign that says vote here"
15. "44 Romantic Barn Wedding Lights Ideas" → "A man and woman dancing at a wedding"
16. "martini drink isolated on white flat vector image" → "A martini drink on white background"
17. "Selway High-Back Executive Chair" → "A high-back chair"
18. "Nickelodeon Paw Patrol'Calling All Pups' Soft Potty Seat" → "NO"
19. "Young people skateboarding on city streets" → "Young skateboarders with a building in the background"
20. "Alternate view of the Sterling Silver Chihuahua Bead Charm by The Black Bow Jewelry Co." → "NO"
21. "A woman cutting a large white sheet cake" → "A woman cutting a large cake."



do this for the following caption: "{context}".

Format your response as
```
Step 1: <simplified>
```
"""

In [None]:
import openai

import os
from getpass import getpass

# Keep the API key out of the notebook: take it from the OPENAI_API_KEY
# environment variable, or ask for it interactively when that is not set.
token = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
client = openai.OpenAI(api_key=token)

def generate(
    content,
    *,
    model="gpt-4o-mini",
    temperature=1,
    max_tokens=1000,
    image_url="https://i.postimg.cc/QCnfP8xh/image-1.jpg",
):
    """Send one user message to the chat completions API and return the reply text.

    Args:
        content: Text of the user message.
        model: Name of the chat model to call.
        temperature: Sampling temperature passed to the API.
        max_tokens: Reply length limit passed to the API.
        image_url: URL of an image attached to the message next to the text.
            Pass ``None`` to send a text-only message.
            NOTE(review): the caption-simplification template never refers
            to an image, so the default attachment looks like a leftover;
            confirm whether it is needed before dropping it.

    Returns:
        The ``content`` of the first choice in the API response.
    """
    # A user message is a list of typed parts: the text first, then the optional image
    parts = [{"type": "text", "text": content}]
    if image_url is not None:
        parts.append({"type": "image_url", "image_url": {"url": image_url}})
    messages = [{"role": "user", "content": parts}]

    print("running generate")
    response = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=messages,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content

**Generate simplified prompts and save them to Excel**

In [None]:
import pandas as pd
import time
import random

import os

# Half-open row range [START, END) handled by this run. The same numbers name
# the output file, so the slice and the file name cannot drift apart.
START, END = 5600, 6300
MAX_ATTEMPTS = 2          # one initial call plus one retry
RETRY_DELAY_SECONDS = 10  # pause before retrying a failed call


def _generate_with_retry(text, attempts=MAX_ATTEMPTS, delay=RETRY_DELAY_SECONDS):
    """Call generate(text), pausing and retrying when it raises.

    The final failure is re-raised so a persistent error is not hidden. Only
    Exception is caught, so interrupting the kernel stops the run immediately.
    """
    for attempt in range(1, attempts + 1):
        try:
            return generate(text)
        except Exception:
            if attempt == attempts:
                raise
            time.sleep(delay)


# Read the Excel file into a DataFrame
file_path = "/content/drive/MyDrive/text-2-image-2M-prompts.xlsx"
df = pd.read_excel(file_path)

# Ensure the "Prompt" column exists
if "Prompt" in df.columns:
    simplified_file_path = f"/content/drive/MyDrive/text-2-image-2M-prompts/simplified/{START}-{END}.xlsx"
    # Create the output folder up front so the save cannot fail after all the API calls are done
    os.makedirs(os.path.dirname(simplified_file_path), exist_ok=True)

    simplified_prompts = []

    # Iterate over the prompts at positions START .. END-1
    for _prompt in df["Prompt"].iloc[START:END]:
        # Fill the template with the caption and ask the model for the simplified version
        simplified_prompt = _generate_with_retry(prompt.format(context=_prompt))

        # Print the original and simplified prompts
        print(f"Original: {_prompt}")
        print(f"Simplified: {simplified_prompt}")

        # Keep the pair for the output table
        simplified_prompts.append({"Original Prompt": _prompt, "Simplified Prompt": simplified_prompt})

    # Create a DataFrame from the collected pairs and save it to an Excel file
    simplified_df = pd.DataFrame(simplified_prompts)
    simplified_df.to_excel(simplified_file_path, index=False)

    print(f"Original and simplified prompts saved to: {simplified_file_path}")
else:
    print("The 'Prompt' column was not found in the Excel file.")


**Clean the resulting prompts and save them as a new column in the Excel file**

In [None]:
import pandas as pd

# Workbook written by the simplification step
file_path = '/content/drive/MyDrive/text-2-image-2M-prompts/simplified/5600-6300.xlsx'

# Load it into a DataFrame
df = pd.read_excel(io=file_path)

# Define the function to clean the simplified prompt
def clean_simplified_prompt(prompt):
    """Extract the bare simplified caption from a raw model reply.

    The reply is expected to look like a fenced block containing
    ``Step 1: <caption>``. The code fence (with or without a language tag),
    the ``Step 1:`` label, surrounding whitespace and one trailing period
    are removed. Replies that lack the fence or the label are cleaned as far
    as they match instead of being cut at fixed character offsets.

    Args:
        prompt: Raw reply text. Non-string values (for example NaN from an
            empty spreadsheet cell) are returned unchanged.

    Returns:
        The cleaned caption, or the input itself when it is not a string.
    """
    import re

    if not isinstance(prompt, str):
        return prompt

    text = prompt.strip()
    # Opening fence; a language tag counts only when it fills the rest of the fence line
    text = re.sub(r"^```(?:[A-Za-z]+\n)?\s*", "", text)
    # Closing fence
    text = re.sub(r"\s*```$", "", text).strip()
    # "Step 1:" label requested by the response format
    text = re.sub(r"^Step\s*1\s*:\s*", "", text, flags=re.IGNORECASE)

    # Remove the trailing period if it exists
    if text.endswith('.'):
        text = text[:-1]

    return text

# Clean every raw reply and store the result in a new column
cleaned_column = df['Simplified Prompt'].apply(clean_simplified_prompt)
df['Cleaned Simplified Prompt'] = cleaned_column

# Overwrite the workbook so it now carries the extra column
df.to_excel(excel_writer=file_path, index=False)

print(f"Cleaned simplified prompts have been added to {file_path}")
