# Load metadata from Diffusion DB

In [None]:
from urllib.request import urlretrieve
import pandas as pd

# Download the parquet table
table_url = f'https://huggingface.co/datasets/poloclub/diffusiondb/resolve/main/metadata.parquet'
urlretrieve(table_url, 'metadata.parquet')

# Read the table using Pandas
metadata_df = pd.read_parquet('metadata.parquet')

# Extract theme words of prompts: Attempts (deprecated)

## Attempt: T5 (deprecated)

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
from tqdm import tqdm

def generate_simple_prompt(complex_prompt):
    input_text = f"summarize: {complex_prompt}"
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs, max_length=5, min_length=1, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
tqdm.pandas(desc="Processing Prompts")

new_df = metadata_df.loc[:99, ['prompt']]
complex_prompt = new_df['prompt']

In [None]:
new_df['simple_prompt'] = complex_prompt.progress_apply(generate_simple_prompt)

Processing Prompts: 100%|██████████| 100/100 [01:14<00:00,  1.34it/s]


In [None]:
pd.set_option('display.max_colwidth', None)
print(new_df[['prompt', 'simple_prompt']].head())

                                                                                                                                                                                   prompt  \
0                                                                        a portrait of a female robot made from code, very intricate details, octane render, 8 k, trending on artstation    
1  a portrait of a female robot made from a cloud of images being very grateful to the creator, very intricate details, futuristic steampunk, octane render, 8 k, trending on artstation    
2                                                                                                                                           only memories remain, trending on artstation    
3                                                                                                                                                        dream swimming pool with nobody    
4                                                      

## Attempt: RAKE (deprecated)

In [None]:
!pip install rake_nltk

Collecting rake_nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake_nltk
Successfully installed rake_nltk-1.0.6


In [None]:
import pandas as pd
from rake_nltk import Rake
import nltk
nltk.download('stopwords')
nltk.download('punkt')

rake = Rake()

max_keywords = 3
min_keywords = 1

simple_prompts = []

new_df = metadata_df.loc[:99, ['prompt']]
for prompt in new_df["prompt"]:
    rake.extract_keywords_from_text(prompt)
    keywords = rake.get_ranked_phrases()

    selected_keywords = keywords[:max(max_keywords, min_keywords)]
    if len(selected_keywords) < min_keywords:
        selected_keywords.extend([selected_keywords[-1]] * (min_keywords - len(selected_keywords)))

    simple_prompt = ", ".join(selected_keywords)
    simple_prompts.append(simple_prompt)

new_df["simple_prompt"] = simple_prompts

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
pd.set_option('display.max_colwidth', None)
print(new_df[['prompt', 'simple_prompt']].head())

                                                                                                                                                                                   prompt  \
0                                                                        a portrait of a female robot made from code, very intricate details, octane render, 8 k, trending on artstation    
1  a portrait of a female robot made from a cloud of images being very grateful to the creator, very intricate details, futuristic steampunk, octane render, 8 k, trending on artstation    
2                                                                                                                                           only memories remain, trending on artstation    
3                                                                                                                                                        dream swimming pool with nobody    
4                                                      

## Attempt: Spacy (deprecated)

In [None]:
import spacy
import pandas as pd


nlp = spacy.load("en_core_web_sm")

new_df = metadata_df.loc[:99, ['prompt']]

def extract_nouns_with_adj(text):
    doc = nlp(text)
    noun_adj_pairs = []
    for token in doc:
        if token.pos_ == "NOUN":
            adj_modifiers = [child.text for child in token.children if child.pos_ == "ADJ"]
            if adj_modifiers:
                pair = " ".join(adj_modifiers + [token.text])
                noun_adj_pairs.append(pair)
            else:
                noun_adj_pairs.append(token.text)
    return ", ".join(noun_adj_pairs)

new_df["simple_prompt"] = new_df["prompt"].apply(extract_nouns_with_adj)

In [None]:
pd.set_option('display.max_colwidth', None)
print(new_df[['prompt', 'simple_prompt']].head())

                                                                                                                                                                                   prompt  \
0                                                                        a portrait of a female robot made from code, very intricate details, octane render, 8 k, trending on artstation    
1  a portrait of a female robot made from a cloud of images being very grateful to the creator, very intricate details, futuristic steampunk, octane render, 8 k, trending on artstation    
2                                                                                                                                           only memories remain, trending on artstation    
3                                                                                                                                                        dream swimming pool with nobody    
4                                                      

## Attempt: LDA (deprecated)

In [None]:
import pandas as pd
from gensim import corpora, models
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

new_df = metadata_df.loc[:999, ['prompt']]

stop_words = stopwords.words('english')
texts = [[word for word in prompt.lower().split() if word not in stop_words] for prompt in new_df['prompt']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda = models.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=5)
for topic in topics:
    print(topic)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


(0, '0.014*"high" + 0.010*"8" + 0.010*"realistic" + 0.009*"detailed," + 0.009*"concept"')
(1, '0.014*"beautiful" + 0.013*"detailed," + 0.013*"highly" + 0.012*"greg" + 0.010*"trending"')
(2, '0.013*"art" + 0.012*"3" + 0.012*"concept" + 0.012*"portrait" + 0.011*"detailed"')


## Attempt: YAKE+LDA (deprecated)

In [None]:
!pip install yake

Collecting yake
  Downloading yake-0.4.8-py2.py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting segtok (from yake)
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Collecting jellyfish (from yake)
  Downloading jellyfish-1.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: segtok, jellyfish, yake
Successfully installed jellyfish-1.0.3 segtok-1.5.11 yake-0.4.8


In [None]:
import pandas as pd
from yake import KeywordExtractor
from gensim import corpora, models

new_df = metadata_df.loc[:999, ['prompt']]

extractor = KeywordExtractor(n=3, stopwords=None)
keywords = [extractor.extract_keywords(prompt) for prompt in new_df["prompt"]]

texts = [[word for word, score in kw] for kw in keywords]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda = models.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=15)

topics = lda.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.008*"highly detailed" + 0.006*"greg rutkowski" + 0.006*"digital painting" + 0.006*"trending on artstation" + 0.005*"portrait"')
(1, '0.010*"concept art" + 0.010*"artstation" + 0.007*"portrait" + 0.006*"detailed" + 0.006*"painting"')
(2, '0.010*"highly detailed" + 0.007*"greg rutkowski" + 0.007*"beautiful painting" + 0.007*"simon stalenhag" + 0.007*"pascal blanche"')


## Attempt: RAKE+spacy V1 (deprecated)

In [None]:
!pip install rake_nltk

Collecting rake_nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake_nltk
Successfully installed rake_nltk-1.0.6


In [None]:
import pandas as pd
from rake_nltk import Rake
import nltk
import spacy

nltk.download('stopwords')
nltk.download('punkt')

rake = Rake()
nlp = spacy.load("en_core_web_sm")

max_keywords = 3
min_keywords = 1

def filter_nouns_and_adj(phrases):
    filtered_phrases = []
    for phrase in phrases:
        doc = nlp(phrase)
        filtered_tokens = [token.text for token in doc if token.pos_ in ("NOUN", "ADJ")]
        if filtered_tokens:
            filtered_phrases.append(" ".join(filtered_tokens))
    return filtered_phrases

simple_prompts = []
new_df = metadata_df.loc[:999, ['prompt']]

for prompt in new_df["prompt"]:
    rake.extract_keywords_from_text(prompt)
    keywords = rake.get_ranked_phrases()

    filtered_keywords = filter_nouns_and_adj(keywords)

    selected_keywords = filtered_keywords[:max(max_keywords, min_keywords)]
    if len(selected_keywords) < min_keywords and selected_keywords:
        selected_keywords.extend([selected_keywords[-1]] * (min_keywords - len(selected_keywords)))
    simple_prompt = ", ".join(selected_keywords)
    simple_prompts.append(simple_prompt)

new_df["simple_prompt"] = simple_prompts

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
pd.set_option('display.max_colwidth', None)
print(new_df[['prompt', 'simple_prompt']].head(10))

                                                                                                                                                                                   prompt  \
0                                                                        a portrait of a female robot made from code, very intricate details, octane render, 8 k, trending on artstation    
1  a portrait of a female robot made from a cloud of images being very grateful to the creator, very intricate details, futuristic steampunk, octane render, 8 k, trending on artstation    
2                                                                                                                                           only memories remain, trending on artstation    
3                                                                                                                                                        dream swimming pool with nobody    
4                                                      

## Attempt: RAKE+spacy V2 (deprecated)

In [None]:
import pandas as pd
from rake_nltk import Rake
import nltk
import spacy

# 下载 NLTK 数据
nltk.download('stopwords')
nltk.download('punkt')

rake = Rake()
nlp = spacy.load("en_core_web_sm")

max_keywords = 3
min_keywords = 1

def extract_entities(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents]
    return entities

def filter_nouns_and_adj(phrases):
    filtered_phrases = []
    for phrase in phrases:
        doc = nlp(phrase)
        filtered_tokens = [token.text for token in doc if token.pos_ in ("NOUN", "ADJ")]
        if filtered_tokens:
            filtered_phrases.append(" ".join(filtered_tokens))
    return filtered_phrases

simple_prompts = []
new_df = metadata_df.loc[:99, ['prompt']]

for prompt in new_df["prompt"]:
    rake.extract_keywords_from_text(prompt)
    keywords = rake.get_ranked_phrases()

    entities = extract_entities(prompt)
    keywords += entities

    selected_keywords = keywords[:max(max_keywords, min_keywords)]
    if len(selected_keywords) < min_keywords and selected_keywords:
        selected_keywords.extend([selected_keywords[-1]] * (min_keywords - len(selected_keywords)))

    simple_prompt = ", ".join(selected_keywords)
    simple_prompts.append(simple_prompt)

new_df["simple_prompt"] = simple_prompts

pd.set_option('display.max_colwidth', None)
print(new_df[['prompt', 'simple_prompt']].head(10))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                                                                                                                                                   prompt  \
0                                                                        a portrait of a female robot made from code, very intricate details, octane render, 8 k, trending on artstation    
1  a portrait of a female robot made from a cloud of images being very grateful to the creator, very intricate details, futuristic steampunk, octane render, 8 k, trending on artstation    
2                                                                                                                                           only memories remain, trending on artstation    
3                                                                                                                                                        dream swimming pool with nobody    
4                                                      

# Construct prompts-theme word pair dataset. RAKE+spacy

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install rake_nltk

Collecting rake_nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake_nltk
Successfully installed rake_nltk-1.0.6


In [None]:
import pandas as pd
from rake_nltk import Rake
import nltk
import spacy

nltk.download('stopwords')
nltk.download('punkt')

rake = Rake()
nlp = spacy.load("en_core_web_sm")

max_keywords = 3
min_keywords = 1

# Use Spacy to construct simple theme words: only entities and their modifiers are retained.
def filter_nouns_and_adj(phrases):
    filtered_phrases = []
    for phrase in phrases:
        doc = nlp(phrase)
        filtered_tokens = [token.text for token in doc if token.pos_ in ("NOUN", "ADJ")]
        if filtered_tokens:
            filtered_phrases.append(" ".join(filtered_tokens))
    return filtered_phrases

# Even simpler theme words: only entities are retained.
def filter_nouns(phrases):
    filtered_phrases = []
    for phrase in phrases:
        doc = nlp(phrase)
        filtered_tokens = [token.text for token in doc if token.pos_ in ("NOUN")]
        if filtered_tokens:
            filtered_phrases.append(" ".join(filtered_tokens))
    return filtered_phrases

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from tqdm import tqdm
from time import time

simple_prompts = []
simplest_prompts = []
# Here shows the processing of 300,000 original prompts. For efficiency, I opened multiple notebooks simultaneously. Actually 800,000 prompts are processed.
new_df = metadata_df.loc[:299999, ['prompt']]

for prompt in tqdm(new_df["prompt"]):
    # Use RAKE to extract ranked keywords
    rake.extract_keywords_from_text(prompt)
    keywords = rake.get_ranked_phrases()

    # Retain at most three and at least one keyword as theme words.
    # In our experiments, this appears to be the best result. It neither misses key entities nor is it overly redundant.
    # Use Spacy filtering to remove irrelevant words from the results.
    filtered_keywords = filter_nouns_and_adj(keywords)
    selected_keywords = filtered_keywords[:max(max_keywords, min_keywords)]
    if len(selected_keywords) < min_keywords and selected_keywords:
        selected_keywords.extend([selected_keywords[-1]] * (min_keywords - len(selected_keywords)))
    simple_prompt = selected_keywords
    simple_prompts.append(simple_prompt)

    # Similar processing, but with more concise theme words. Only entities are retained.
    filtered_noun_keywords = filter_nouns(keywords)
    selected_noun_keywords = filtered_noun_keywords[:max(max_keywords, min_keywords)]
    if len(selected_noun_keywords) < min_keywords and selected_noun_keywords:
        selected_noun_keywords.extend([selected_noun_keywords[-1]] * (min_keywords - len(selected_noun_keywords)))
    simplest_prompt = selected_noun_keywords
    simplest_prompts.append(simplest_prompt)

new_df["simple_prompt"] = simple_prompts
new_df["simplest_prompt"] = simplest_prompts

100%|██████████| 300000/300000 [8:04:03<00:00, 10.33it/s]


In [None]:
# Display the results of extracting.
pd.set_option('display.max_colwidth', None)
print(new_df[['prompt', 'simple_prompt', 'simplest_prompt']].head(10))

                                                                                                                                                                                   prompt  \
0                                                                        a portrait of a female robot made from code, very intricate details, octane render, 8 k, trending on artstation    
1  a portrait of a female robot made from a cloud of images being very grateful to the creator, very intricate details, futuristic steampunk, octane render, 8 k, trending on artstation    
2                                                                                                                                           only memories remain, trending on artstation    
3                                                                                                                                                        dream swimming pool with nobody    
4                                                      

In [None]:
# Save the results.
new_df.to_parquet('/content/drive/MyDrive/simplified_prompt_0_299999.parquet', index=False)

## construct single prompt to complex prompt dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

#
prompt_df_0_299999 = pd.read_parquet('/content/drive/MyDrive/simplified_prompt_0_299999.parquet')
prompt_df_300000_599999 = pd.read_parquet('/content/drive/MyDrive/simplified_prompt_300000_599999.parquet')
prompt_df_600000_799999 = pd.read_parquet('/content/drive/MyDrive/simplified_prompt_600000_799999.parquet')
prompt_df_0_599999 = pd.concat([prompt_df_0_299999, prompt_df_300000_599999], ignore_index=True)
prompt_df = pd.concat([prompt_df_0_599999, prompt_df_600000_799999], ignore_index=True)

In [None]:
print(len(prompt_df))

800000


In [None]:
import pandas as pd
from tqdm import tqdm

rows = []
for prompt, simple_prompt, simplest_prompt in zip(prompt_df["prompt"], prompt_df["simple_prompt"], prompt_df["simplest_prompt"]):
    for _simple_prompt in simple_prompt:
        rows.append({"prompt": prompt, "single_prompt": _simple_prompt})
    for _simplest_prompt in simplest_prompt:
        rows.append({"prompt": prompt, "single_prompt": _simplest_prompt})

single_prompt_df = pd.DataFrame(rows)
print(single_prompt_df.head(10))

                                              prompt      single_prompt
0  a portrait of a female robot made from code, v...       female robot
1  a portrait of a female robot made from code, v...             render
2  a portrait of a female robot made from code, v...  intricate details
3  a portrait of a female robot made from code, v...              robot
4  a portrait of a female robot made from code, v...             render
5  a portrait of a female robot made from code, v...            details
6  a portrait of a female robot made from a cloud...       female robot
7  a portrait of a female robot made from a cloud...             render
8  a portrait of a female robot made from a cloud...  intricate details
9  a portrait of a female robot made from a cloud...              robot


In [None]:
print(len(single_prompt_df))
print(len(single_prompt_df.drop_duplicates()))
single_prompt_df = single_prompt_df.drop_duplicates()

4240675
2503731


In [None]:
single_prompt_df.to_parquet("/content/drive/MyDrive/single_prompt_0_799999.parquet", index=False)

# Train the model: T5

## Attempt: GPT2 (deprecated)

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-a

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import Trainer, TrainingArguments, GPT2Tokenizer, GPT2LMHeadModel
from google.colab import drive
drive.mount('/content/drive')

single_prompt_df = pd.read_parquet('/content/drive/MyDrive/single_prompt_0_599999.parquet')
hf_dataset = Dataset.from_pandas(single_prompt_df)

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = GPT2LMHeadModel.from_pretrained('gpt2')

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
def preprocess_function(example):
    combined_text = f"{example['single_prompt']} -> {example['prompt']}"
    encoding = tokenizer(
        combined_text,
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    return encoding

tokenized_dataset = hf_dataset.map(preprocess_function, batched=False)

Map:   0%|          | 0/1907335 [00:00<?, ? examples/s]

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
tokenized_dataset.save_to_disk("/content/drive/MyDrive/tokenized_dataset")

Saving the dataset (0/4 shards):   0%|          | 0/1907335 [00:00<?, ? examples/s]

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["HF_TOKEN"] = "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"
!pip install accelerate -U
!pip install transformers[torch]
!pip install datasets



In [None]:
from datasets import load_from_disk

from google.colab import drive
drive.mount('/content/drive')

tokenized_dataset = load_from_disk('/content/drive/MyDrive/tokenized_dataset')
splits = tokenized_dataset.train_test_split(test_size=0.2)

train_dataset = splits['train']
eval_dataset = splits['test']

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

from huggingface_hub import login
login(token=os.environ["HF_TOKEN"])

from transformers import Trainer, TrainingArguments, GPT2Tokenizer, GPT2LMHeadModel

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=100,
    save_total_limit=2
)


model = GPT2LMHeadModel.from_pretrained('gpt2')

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

trainer.train()



IndexError: index out of range in self

## Tokenize

In [None]:
!pip install accelerate -U
!pip install transformers[torch]
!pip install datasets

import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

from datasets import load_from_disk

from google.colab import drive
drive.mount('/content/drive')

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m235.5/302.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from 

In [None]:
single_prompt_df = pd.read_parquet('/content/drive/MyDrive/single_prompt_0_799999.parquet')

tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Tokenize the inputs and targets to enable training using cross-entropy loss.
def preprocess_function(example):
    input_encoding = tokenizer(
        example['single_prompt'],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    target_encoding = tokenizer(
        example['prompt'],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    return {
        "input_ids": input_encoding["input_ids"],
        "attention_mask": input_encoding["attention_mask"],
        "labels": target_encoding["input_ids"]
    }

hf_dataset = Dataset.from_pandas(single_prompt_df)
tokenized_dataset = hf_dataset.map(preprocess_function, batched=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/2503731 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset.save_to_disk("/content/drive/MyDrive/tokenized_dataset_T5_0_799999")

Saving the dataset (0/10 shards):   0%|          | 0/2503731 [00:00<?, ? examples/s]

In [None]:
print(single_prompt_df.head(10))

                                              prompt      single_prompt
0  a portrait of a female robot made from code, v...       female robot
1  a portrait of a female robot made from code, v...             render
2  a portrait of a female robot made from code, v...  intricate details
3  a portrait of a female robot made from code, v...              robot
4  a portrait of a female robot made from code, v...            details
5  a portrait of a female robot made from a cloud...       female robot
6  a portrait of a female robot made from a cloud...             render
7  a portrait of a female robot made from a cloud...  intricate details
8  a portrait of a female robot made from a cloud...              robot
9  a portrait of a female robot made from a cloud...            details


## Train (deprecated)

In [None]:
!pip install accelerate -U
!pip install transformers[torch]
!pip install datasets
from datasets import load_from_disk

from google.colab import drive
drive.mount('/content/drive')

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m256.0/302.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from 

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
tokenized_dataset = load_from_disk("/content/drive/MyDrive/tokenized_dataset_T5_0_799999")
splits = tokenized_dataset.train_test_split(test_size=0.2)

train_dataset = splits['train']
eval_dataset = splits['test']

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/models',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    save_steps=1000,
    save_total_limit=100
)

model = T5ForConditionalGeneration.from_pretrained('t5-small')

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Step,Training Loss
500,2.302
1000,1.5486
1500,1.476
2000,1.4358
2500,1.4042
3000,1.3819
3500,1.3516
4000,1.3351
4500,1.3194
5000,1.3111


Step,Training Loss
500,2.302
1000,1.5486
1500,1.476
2000,1.4358
2500,1.4042
3000,1.3819
3500,1.3516
4000,1.3351
4500,1.3194
5000,1.3111


TrainOutput(global_step=93891, training_loss=1.1115122494440237, metrics={'train_runtime': 50357.2842, 'train_samples_per_second': 119.326, 'train_steps_per_second': 1.864, 'total_flos': 2.0331559715969434e+17, 'train_loss': 1.1115122494440237, 'epoch': 3.0})

In [None]:
trainer.save_model('/content/drive/MyDrive/t5-small_0_799999')

In [1]:
!pip install accelerate -U
!pip install transformers[torch]
!pip install datasets
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from google.colab import drive
drive.mount('/content/drive')

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m286.7/302.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from 

KeyboardInterrupt: 

In [9]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = T5ForConditionalGeneration.from_pretrained('/content/drive/MyDrive/t5-small_0_799999').to(device)
tokenizer = T5Tokenizer.from_pretrained('t5-small')

import torch
model.eval()


input_prompt = "robot"

inputs = tokenizer.encode(input_prompt, return_tensors='pt', max_length=128, truncation=True).to(device)

outputs = model.generate(inputs, max_length=50, num_return_sequences=1, no_repeat_ngram_size=2, early_stopping=True)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Input prompt: {input_prompt}")
print(f"Generated output: {generated_text}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Input prompt: robot
Generated output: a robot with horns and thorned eyes, symetrical, intricate, elegant, highly detailed, digital painting, artstation, concept art, smooth, sharp focus, illustration, by greg rut


## Train: lr=2e-5 (deprecated)

In [1]:
!pip install accelerate -U
!pip install transformers[torch]
!pip install datasets

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m297.0/302.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

from datasets import load_from_disk

from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer
from datasets import load_from_disk

tokenized_dataset = load_from_disk("/content/drive/MyDrive/tokenized_dataset_T5_0_799999")

In [None]:
splits = tokenized_dataset.train_test_split(test_size=0.01)

train_dataset = splits['train']
eval_dataset = splits['test']

In [14]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Load the T5 model and its tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Prepare the training arguments
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/results_t5_2e-5',  # Where to store the training outputs
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="steps",  # Evaluate at the end of each epoch
    eval_steps=500,
    learning_rate=2e-5,           # Learning rate
    per_device_train_batch_size=64,  # Batch size for training
    per_device_eval_batch_size=64,
    weight_decay=0.01,            # Weight decay for regularization
    save_steps=2000,
    save_total_limit=200,           # Only last 3 models are saved; older ones are deleted
    num_train_epochs=2            # Total number of training epochs
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splits['train'],
    eval_dataset=splits['test'],
    tokenizer=tokenizer  # Make sure the tokenizer is defined
)

# Train the model
trainer.train()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Step,Training Loss,Validation Loss
500,3.1406,1.65441
1000,1.7069,1.527086
1500,1.6088,1.47015
2000,1.5548,1.431539
2500,1.5214,1.402854
3000,1.4891,1.380711
3500,1.4652,1.362228
4000,1.4595,1.34611
4500,1.4317,1.331827
5000,1.422,1.319968


Step,Training Loss,Validation Loss
500,3.1406,1.65441
1000,1.7069,1.527086
1500,1.6088,1.47015
2000,1.5548,1.431539
2500,1.5214,1.402854
3000,1.4891,1.380711
3500,1.4652,1.362228
4000,1.4595,1.34611
4500,1.4317,1.331827
5000,1.422,1.319968


TrainOutput(global_step=77460, training_loss=1.2486952565235283, metrics={'train_runtime': 51296.2552, 'train_samples_per_second': 96.642, 'train_steps_per_second': 1.51, 'total_flos': 1.6773538795801805e+17, 'train_loss': 1.2486952565235283, 'epoch': 2.0})

In [15]:
trainer.save_model('/content/drive/MyDrive/t5-small_0_799999-2e-5')

In [None]:
import torch
model.eval()

model.to(device)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
input_prompt = "female robot"

inputs = tokenizer.encode(input_prompt, return_tensors='pt', max_length=128, truncation=True).to(device)

outputs = model.generate(inputs, max_length=50, num_return_sequences=1, no_repeat_ngram_size=2, early_stopping=True)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Input prompt: {input_prompt}")
print(f"Generated output: {generated_text}")

## Train: lr=1e-2

In [3]:
!pip install accelerate -U
!pip install transformers[torch]
!pip install datasets
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

from datasets import load_from_disk

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
tokenized_dataset = load_from_disk("/content/drive/MyDrive/tokenized_dataset_T5_0_799999")

In [8]:
# Split training and eval dataset. Use 5% of the original dataset as the validation set.
splits = tokenized_dataset.train_test_split(test_size=0.05)

train_dataset = splits['train']
eval_dataset = splits['test']

In [9]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Load the T5 model and its tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Prepare the training arguments
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/results_t5_1e-2',
    logging_strategy="steps",
    logging_steps=2000,
    evaluation_strategy="steps",    # Report training and eval losses every 200 steps
    eval_steps=2000,
    learning_rate=1e-2,        # Learning rate
    per_device_train_batch_size=64,  # Batch size for training
    per_device_eval_batch_size=64,
    weight_decay=1e-2,         # Weight decay for regularization
    save_steps=2000,
    save_total_limit=200,
    num_train_epochs=2         # Train 2 epochs
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splits['train'],
    eval_dataset=splits['test'],
    tokenizer=tokenizer
)

# Train the model
trainer.train()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Step,Training Loss,Validation Loss
2000,1.1888,1.002035
4000,1.0327,0.932736
6000,0.9875,0.890722
8000,0.9483,0.865042
10000,0.925,0.844578
12000,0.9005,0.817651
14000,0.8806,0.799933
16000,0.8635,0.783395
18000,0.8493,0.772674
20000,0.8382,0.759221


Step,Training Loss,Validation Loss
2000,1.1888,1.002035
4000,1.0327,0.932736
6000,0.9875,0.890722
8000,0.9483,0.865042
10000,0.925,0.844578
12000,0.9005,0.817651
14000,0.8806,0.799933
16000,0.8635,0.783395
18000,0.8493,0.772674
20000,0.8382,0.759221


TrainOutput(global_step=74330, training_loss=0.7642357080922254, metrics={'train_runtime': 50765.8467, 'train_samples_per_second': 93.706, 'train_steps_per_second': 1.464, 'total_flos': 1.609582149202084e+17, 'train_loss': 0.7642357080922254, 'epoch': 2.0})

In [10]:
trainer.save_model('/content/drive/MyDrive/t5-small_0_799999-1e-2')

# Examine the model's output.

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [None]:
import torch
model.eval()

model.to(device)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Define the input theme word
input_prompt = "sculpture"
inputs = tokenizer.encode(input_prompt, return_tensors='pt', max_length=128, truncation=True).to(device)
# Generate output prompt using the model. The different parameters of generate() affect the output results.
outputs = model.generate(inputs, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2, early_stopping=True)

# Decode the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Input prompt: {input_prompt}")
print(f"Generated output: {generated_text}")

Input prompt: sculpture
Generated output: a sculpture of emma watson, symetrical, intricate, elegant, highly detailed, digital painting, artstation, concept art, smooth, sharp focus, illustration, by greg rutkowski and alphonse mucha


In [None]:
input_prompt = "sculpture"

inputs = tokenizer.encode(input_prompt, return_tensors='pt', max_length=128, truncation=True).to(device)
# Use different set of parameters
outputs = model.generate(inputs, max_length=100, num_return_sequences=1, no_repeat_ngram_size=1, early_stopping=True)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Input prompt: {input_prompt}")
print(f"Generated output: {generated_text}")

Input prompt: sculpture
Generated output: a sculpture of an ancient god, by artgerm and alphonse mucho


In [None]:
input_prompt = "sculpture"

inputs = tokenizer.encode(input_prompt, return_tensors='pt', max_length=128, truncation=False).to(device)

beam_size = 4
temperature = 0.9
length_penalty = 1.0
outputs = model.generate(
    inputs,
    max_length=100,
    num_beams=beam_size,
    temperature=temperature,
    no_repeat_ngram_size=2,
    length_penalty=length_penalty,
    early_stopping=True
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Input prompt: {input_prompt}")
print(f"Generated output: {generated_text}")

Input prompt: sculpture
Generated output: a beautiful sculpture of emma watson in the style of artgerm and greg rutkowski and magali villeneuve


In [None]:
input_prompt = "sculpture"

inputs = tokenizer.encode(input_prompt, return_tensors='pt', max_length=128, truncation=False).to(device)

beam_size = 6
temperature = 0.9
length_penalty = 3.0
outputs = model.generate(
    inputs,
    max_length=100,
    num_beams=beam_size,
    temperature=temperature,
    no_repeat_ngram_size=2,
    length_penalty=length_penalty,
    early_stopping=True
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Input prompt: {input_prompt}")
print(f"Generated output: {generated_text}")

Input prompt: sculpture
Generated output: a highly detailed sculpture of emma watson in the style of artgerm and greg rutkowski and alphonse mucha
