# Prompt Optimization for Generative Art Models
### Kaustav Ghosh
### Department of Computer Science 
### University of Exeter 
### Exeter, UK
### kg498@exeter.ac.uk

## Data Aggregation

In [None]:
import json
import os
import pandas as pd
from collections import defaultdict
import glob
import re

In [None]:
# Collect the path of every file below the data directory; these are the JSON
# files containing the prompt data downloaded from Kaggle.
filepaths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('C:/Users/Kaustav Ghosh/OneDrive/Desktop/AI ART/Data')
    for name in names
]
print(f"Found {len(filepaths)} files.")

# Button labels the Midjourney bot attaches to its Discord messages; which
# group a message's buttons belong to tells us what kind of message it is.
COMPONENTS_FOR_INITIAL_AND_VARIATION = {
    'U1', 'U2', 'U3', 'U4', '⟳', 'V1', 'V2', 'V3', 'V4',
}
COMPONENTS_FOR_UPSCALE = {
    'Make Variations', 'Upscale to Max', 'Light Upscale Redo',
}

In [None]:
def get_message_type(message):
    """Classify a message by the labels of the UI components it displays.

    Component labels are visited in order and the first one found in either
    known label set decides the result:

    * a label in COMPONENTS_FOR_INITIAL_AND_VARIATION gives
      "INITIAL_OR_VARIATION", or "INCONCLUSIVE" when the message content
      contains "Upscaled" (a few such messages are really upscale requests
      and are set aside);
    * a label in COMPONENTS_FOR_UPSCALE gives "UPSCALE".

    A message with no matching label is a "TEXT_MESSAGE".
    """
    # Lazily flatten the two-level components structure into a label stream.
    labels = (
        button["label"]
        for row in message["components"]
        for button in row["components"]
    )
    for label in labels:
        if label in COMPONENTS_FOR_INITIAL_AND_VARIATION:
            looks_upscaled = "Upscaled" in message["content"]
            return "INCONCLUSIVE" if looks_upscaled else "INITIAL_OR_VARIATION"
        if label in COMPONENTS_FOR_UPSCALE:
            return "UPSCALE"
    return "TEXT_MESSAGE"

In [None]:
def get_prompt(message):
    """Return the prompt embedded in ``message["content"]``.

    The prompt is the text between the first pair of double stars (``**``).
    Newlines are turned into spaces before searching so a prompt spanning
    several lines is still matched. Returns ``None`` when the content has no
    such pair.
    """
    single_line = " ".join(message["content"].split("\n"))
    # Non-greedy capture of whatever sits between two consecutive stars.
    found = re.search(r"\*\*(.*?)\*\*", single_line)
    return found.group(1) if found else None

In [None]:
def remove_urls(prompt):
    """Strip image-prompt URLs from *prompt*, keeping only the text part.

    Prompts can mix text with image URLs written as ``<https...>``. Each such
    URL is removed together with one whitespace character that follows it,
    if any.

    Args:
        prompt: The raw prompt string.

    Returns:
        The prompt with every ``<https...>`` URL removed.
    """
    # The URL body must stop at the first '>', '<' or whitespace. A looser
    # class such as [^<] also matches '>' and spaces, so the greedy match would
    # run on through the prompt text and delete everything up to the last
    # whitespace. The closing '>' and the trailing whitespace are optional so
    # that truncated URLs and URLs at the very end of the prompt are removed.
    URL = r"<https[^<>\s]*>?\s?"
    return re.sub(URL, "", prompt)

In [None]:
# Go through every downloaded file and, for each bot message that carries
# image-action buttons, keep its text prompt together with a coarse label.
prompts = []
upscaled = []
df = pd.DataFrame()
for filepath in filepaths:
    # Decode explicitly as UTF-8: JSON text is normally UTF-8 and the button
    # labels include non-ASCII characters ('⟳'), which the platform default
    # encoding (e.g. cp1252 on Windows) may fail on or mis-decode.
    with open(filepath, "r", encoding="utf-8") as f:
        content = json.load(f)

    # The file is fully parsed at this point, so it no longer needs to be open.
    for single_message_list in content["messages"]:
        assert len(single_message_list) == 1
        message = single_message_list[0]
        message_type = get_message_type(message)

        if message_type not in ["INITIAL_OR_VARIATION", "UPSCALE"]:
            continue  # Ignore direct text messages (and inconclusive ones).

        prompt = get_prompt(message)
        if not prompt:
            continue  # Discard malformed messages.

        # The goal of this dataset is to learn *text prompts*, so remove any image prompts.
        text_prompt = remove_urls(prompt)
        # Only two message types reach this point; "UPSCALE" is stored as "UPSCALED".
        if message_type == "INITIAL_OR_VARIATION":
            upscaled.append("INITIAL_OR_VARIATION")
        else:
            upscaled.append("UPSCALED")
        prompts.append(text_prompt)

print(f"Extracted {len(prompts)} text prompts.")

# Store the extracted prompts and their labels in LAKE.csv.
df["Prompts"] = prompts
df["Type"] = upscaled
# index=False keeps the row index out of the file, so column 0 is the prompt
# and column 1 the label -- the layout the model cells read back with iloc.
df.to_csv("LAKE.csv", index=False)

# NOTE(review): the file is written relative to the working directory but read
# back from an absolute path -- confirm both refer to the same file.
df2 = pd.read_csv("C:/Users/Kaustav Ghosh/OneDrive/Desktop/AI ART/LAKE.csv")
# A Series has no .isalpha() method, so the string test is applied per value;
# non-string cells (e.g. NaN for an empty prompt) count as not alphabetic.
# NOTE(review): str.isalpha() is False for any prompt containing spaces or
# punctuation, so this keeps single-word prompts only -- confirm that is intended.
is_alpha = df2["Prompts"].map(lambda p: isinstance(p, str) and p.isalpha()).astype(bool)
rslt_df = df2[is_alpha]

## Data Cleaning
Most of the data cleaning was done manually by going through Lake.csv. This was honestly about 70% of the total workload, as there was a lot of data with a lot of errors.

## Machine learning
We used the cleaned Lake.csv to train three models (LSTM, RNN and GRU) to accurately predict the chance of a prompt being upscaled.

### LSTM

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


# Read the cleaned prompt/label table.
dataset = pd.read_csv("C:/Users/Kaustav Ghosh/OneDrive/Desktop/AI ART/LAKE.csv")

# Column 0 is used as the prompts and column 1 as the labels; 20% of the rows
# are held out for testing.
X = dataset.iloc[:, 0].values
y = dataset.iloc[:, 1].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the vocabulary on the training prompts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = 1 + len(tokenizer.word_index)

# Turn the prompts into integer sequences.
X_train_tokens, X_test_tokens = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad every sequence to the length of the longest training sequence.
max_sequence_length = max(map(len, X_train_tokens))
X_train_padded, X_test_padded = (
    pad_sequences(tokens, maxlen=max_sequence_length, padding='post')
    for tokens in (X_train_tokens, X_test_tokens)
)

# Map the string labels to integers, learning the mapping from the training labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# LSTM classifier: embedding -> LSTM -> single sigmoid output.
model = Sequential([
    Embedding(vocab_size, 100, input_length=max_sequence_length),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])

# Train, using the held-out split as validation data.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(
    X_train_padded,
    y_train_encoded,
    validation_data=(X_test_padded, y_test_encoded),
    epochs=10,
    batch_size=32,
)

def generate_optimized_prompt(prompt):
    """Classify *prompt* with the trained LSTM model and flag it if needed.

    The prompt is tokenized and padded with the module-level ``tokenizer`` and
    ``max_sequence_length``, scored by ``model``, and the score thresholded at
    0.5 is mapped back to a label by ``label_encoder``.

    Args:
        prompt: The user's prompt text.

    Returns:
        The prompt unchanged when the predicted label is "Already Optimized";
        otherwise the prompt followed by " Need Optimization".
    """
    prompt_tokens = tokenizer.texts_to_sequences([prompt])
    prompt_tokens_padded = pad_sequences(prompt_tokens, maxlen=max_sequence_length, padding='post')
    prediction = model.predict(prompt_tokens_padded)
    class_ids = [1 if score > 0.5 else 0 for score in prediction]
    prediction_label = label_encoder.inverse_transform(class_ids)[0]
    # NOTE(review): the aggregation step writes the labels "INITIAL_OR_VARIATION"
    # and "UPSCALED"; confirm the cleaned CSV really contains "Already Optimized",
    # otherwise every prompt is flagged.
    if prediction_label == "Already Optimized":
        return prompt
    # Separate the marker from the prompt with a space so it does not fuse with
    # the prompt's last word.
    return prompt + " Need Optimization"

# Demo: read one prompt from the user and show the LSTM model's verdict.
input_prompt = input("User prompt")
optimized_prompt = generate_optimized_prompt(input_prompt)
print(f"Optimized prompt: {optimized_prompt}")

### RNN

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Read the cleaned prompt/label table.
dataset = pd.read_csv("C:/Users/Kaustav Ghosh/OneDrive/Desktop/AI ART/LAKE.csv")

# Column 0 is used as the prompts and column 1 as the labels; 20% of the rows
# are held out for testing.
X = dataset.iloc[:, 0].values
y = dataset.iloc[:, 1].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the vocabulary on the training prompts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = 1 + len(tokenizer.word_index)

# Turn the prompts into integer sequences.
X_train_tokens, X_test_tokens = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad every sequence to the length of the longest training sequence.
max_sequence_length = max(map(len, X_train_tokens))
X_train_padded, X_test_padded = (
    pad_sequences(tokens, maxlen=max_sequence_length, padding='post')
    for tokens in (X_train_tokens, X_test_tokens)
)

# Map the string labels to integers, learning the mapping from the training labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Simple RNN classifier: embedding -> SimpleRNN -> single sigmoid output.
model = Sequential([
    Embedding(vocab_size, 100, input_length=max_sequence_length),
    SimpleRNN(128),
    Dense(1, activation='sigmoid'),
])

# Train, using the held-out split as validation data.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(
    X_train_padded,
    y_train_encoded,
    validation_data=(X_test_padded, y_test_encoded),
    epochs=10,
    batch_size=32,
)

def generate_optimized_prompt(prompt):
    """Classify *prompt* with the trained RNN model and flag it if needed.

    The prompt is tokenized and padded with the module-level ``tokenizer`` and
    ``max_sequence_length``, scored by ``model``, and the score thresholded at
    0.5 is mapped back to a label by ``label_encoder``.

    Args:
        prompt: The user's prompt text.

    Returns:
        The prompt unchanged when the predicted label is "Already Optimized";
        otherwise the prompt followed by " Need Optimization".
    """
    prompt_tokens = tokenizer.texts_to_sequences([prompt])
    prompt_tokens_padded = pad_sequences(prompt_tokens, maxlen=max_sequence_length, padding='post')
    prediction = model.predict(prompt_tokens_padded)
    class_ids = [1 if score > 0.5 else 0 for score in prediction]
    prediction_label = label_encoder.inverse_transform(class_ids)[0]
    # NOTE(review): the aggregation step writes the labels "INITIAL_OR_VARIATION"
    # and "UPSCALED"; confirm the cleaned CSV really contains "Already Optimized",
    # otherwise every prompt is flagged.
    if prediction_label == "Already Optimized":
        return prompt
    # Separate the marker from the prompt with a space so it does not fuse with
    # the prompt's last word.
    return prompt + " Need Optimization"

# Demo: read one prompt from the user and show the RNN model's verdict.
input_prompt = input("User prompt")
optimized_prompt = generate_optimized_prompt(input_prompt)
print(f"Optimized prompt: {optimized_prompt}")

### GRU

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Read the cleaned prompt/label table.
dataset = pd.read_csv("C:/Users/Kaustav Ghosh/OneDrive/Desktop/AI ART/LAKE.csv")

# Column 0 is used as the prompts and column 1 as the labels; 20% of the rows
# are held out for testing.
X = dataset.iloc[:, 0].values
y = dataset.iloc[:, 1].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the vocabulary on the training prompts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = 1 + len(tokenizer.word_index)

# Turn the prompts into integer sequences.
X_train_tokens, X_test_tokens = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad every sequence to the length of the longest training sequence.
max_sequence_length = max(map(len, X_train_tokens))
X_train_padded, X_test_padded = (
    pad_sequences(tokens, maxlen=max_sequence_length, padding='post')
    for tokens in (X_train_tokens, X_test_tokens)
)

# Map the string labels to integers, learning the mapping from the training labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# GRU classifier: embedding -> GRU -> single sigmoid output.
model = Sequential([
    Embedding(vocab_size, 100, input_length=max_sequence_length),
    GRU(128),
    Dense(1, activation='sigmoid'),
])

# Train, using the held-out split as validation data.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(
    X_train_padded,
    y_train_encoded,
    validation_data=(X_test_padded, y_test_encoded),
    epochs=10,
    batch_size=32,
)

def generate_optimized_prompt(prompt):
    """Classify *prompt* with the trained GRU model and flag it if needed.

    The prompt is tokenized and padded with the module-level ``tokenizer`` and
    ``max_sequence_length``, scored by ``model``, and the score thresholded at
    0.5 is mapped back to a label by ``label_encoder``.

    Args:
        prompt: The user's prompt text.

    Returns:
        The prompt unchanged when the predicted label is "Already Optimized";
        otherwise the prompt followed by " Need Optimization".
    """
    prompt_tokens = tokenizer.texts_to_sequences([prompt])
    prompt_tokens_padded = pad_sequences(prompt_tokens, maxlen=max_sequence_length, padding='post')
    prediction = model.predict(prompt_tokens_padded)
    class_ids = [1 if score > 0.5 else 0 for score in prediction]
    prediction_label = label_encoder.inverse_transform(class_ids)[0]
    # NOTE(review): the aggregation step writes the labels "INITIAL_OR_VARIATION"
    # and "UPSCALED"; confirm the cleaned CSV really contains "Already Optimized",
    # otherwise every prompt is flagged.
    if prediction_label == "Already Optimized":
        return prompt
    # Separate the marker from the prompt with a space so it does not fuse with
    # the prompt's last word.
    return prompt + " Need Optimization"

# Demo: read one prompt from the user and show the GRU model's verdict.
input_prompt = input("User prompt")
optimized_prompt = generate_optimized_prompt(input_prompt)
print(f"Optimized prompt: {optimized_prompt}")