In [1]:
#You need important_words.parquet (drive or downloads) to skip creating it
#You need train_dataset and val_dataset (blank and 2 from drive) to skip creating datasets for the two models
#The two models will be saved in results and results2 folders in drive
#both models trained for 3 epochs
#first model loss started at 3.5 ended at 1.74
#first model: 3'rd epoch	train_loss:1.741200	val_loss:1.816134
#second model first half data:
#epoch 1: train: 2.19 valid: 2.13
#epoch 3: train:2.132000	valid:2.102920
import polars as pl

splits = {'train': 'data/train-*.parquet', 'test': 'data/test-00000-of-00001-16503b0c26ed00c6.parquet', 'validation': 'data/validation-00000-of-00001-137b93e1e979d138.parquet'}
df = pl.read_parquet('hf://datasets/euclaise/writingprompts/' + splits['train'])


In [2]:
import pandas as pd
prompts = df['prompt']
stories = df['story']

In [None]:
#To create important_words using top 200 words - tf idf
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_important_words(stories, top_n=200):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=top_n)
    X = vectorizer.fit_transform(stories)
    important_words = []
    for i in range(X.shape[0]):
        indices = X[i].indices
        features = [vectorizer.get_feature_names_out()[j] for j in indices]
        important_words.append(" ".join(features))
    return important_words



important_words = extract_important_words(stories)


In [3]:
!pip install pyarrow



In [None]:
#saves important_words to a parquet file
dfimp = pd.DataFrame({
    'Prompt': prompts,
    'Important Words': important_words
})

# Save the DataFrame to a Parquet file
dfimp.to_parquet('important_words.parquet', engine='pyarrow')

# Print the DataFrame to verify
print(dfimp)

                                                   Prompt  \
0       [ WP ] You 've finally managed to discover the...   
1       [ WP ] The moon is actually a giant egg , and ...   
2       [ WP ] You find a rip in time walking through ...   
3       [ WP ] For years in your youth the same imagin...   
4       [ WP ] You glance at your watch 10:34 am , rou...   
...                                                   ...   
272595  [ WP ] You wake up , extremely thirsty and dre...   
272596  [ WP ] After many years , you finally decide t...   
272597  [ WP ] In a world where people can only be kil...   
272598  [ WP ] Use a lyric from a song , or even the w...   
272599  [ CW ] [ PM ] Write your hero into a corner , ...   

                                          Important Words  
0       times walked got used time feel black way able...  
1       got feel human heart ca hear right course eart...  
2       got time way years past felt long like ca just...  
3       times time way year

In [5]:
#this is after uploading important_words.
#don't run anything before this except the first and second cells
df2 = pl.read_parquet('/content/important_words.parquet')
important_words = df2['Important Words'].to_list()

In [None]:
#with test train - first time
#to tokenize prompts and important_words and train the first model
import torch
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from google.colab import drive
drive.mount('/content/drive')

output_dir = '/content/drive/My Drive/results'
# Function to tokenize texts

def tokenize_texts(tokenizer, texts, max_length=1024):
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length)

# Example input_texts and output_texts (replace these with your actual data)
input_texts = prompts
output_texts = important_words

# Train-test split
input_train, input_val, output_train, output_val = train_test_split(input_texts, output_texts, test_size=0.2)
print(type(input_train))

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
first_model = GPT2LMHeadModel.from_pretrained('gpt2')

# Tokenize the train and validation texts
train_input_encodings = tokenize_texts(tokenizer, input_train.to_list())
train_output_encodings = tokenize_texts(tokenizer, output_train)
val_input_encodings = tokenize_texts(tokenizer, input_val.to_list())
val_output_encodings = tokenize_texts(tokenizer, output_val)

# Prepare dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, input_encodings, output_encodings):
        self.input_encodings = input_encodings
        self.output_encodings = output_encodings

    def __len__(self):
        return len(self.input_encodings['input_ids'])

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.input_encodings['input_ids'][idx])
        attention_mask = torch.tensor(self.input_encodings['attention_mask'][idx])
        labels = torch.tensor(self.output_encodings['input_ids'][idx])
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

# Create train and validation datasets
train_dataset = CustomDataset(train_input_encodings, train_output_encodings)
val_dataset = CustomDataset(val_input_encodings, val_output_encodings)
torch.save(train_dataset, '/content/drive/My Drive/train_dataset.pt')
torch.save(val_dataset,'/content/drive/My Drive/val_dataset.pt')

# Data collator for dynamic padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
)

# Initialize the Trainer
trainer = Trainer(
    model=first_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Include the validation dataset
)

# Train the model
trainer.train()

# Evaluate the model
evaluation_results = trainer.evaluate()
print(evaluation_results)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<class 'polars.series.series.Series'>




Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,2.3339,2.162606


KeyboardInterrupt: 

In [10]:

#resume from checkpoint from drive
#resume training the first model after loading tokenized inputs and outputs
#from drive

from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from google.colab import drive
drive.mount('/content/drive')

output_dir = '/content/drive/My Drive/results'

first_model = GPT2LMHeadModel.from_pretrained(output_dir + '/checkpoint-130000')
print(output_dir + '/checkpoint-130000')

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Prepare dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, input_encodings, output_encodings):
        self.input_encodings = input_encodings
        self.output_encodings = output_encodings

    def __len__(self):
        return len(self.input_encodings['input_ids'])

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.input_encodings['input_ids'][idx])
        attention_mask = torch.tensor(self.input_encodings['attention_mask'][idx])
        labels = torch.tensor(self.output_encodings['input_ids'][idx])
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

train_dataset = torch.load('/content/drive/My Drive/train_dataset.pt')
#If this doesn't work, ask ChatGPT how to load/create the CustomDataset again from this saved thing
val_dataset = torch.load('/content/drive/My Drive/val_dataset.pt')

# Data collator for dynamic padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
)

trainer = Trainer(
    model=first_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train(resume_from_checkpoint=True)

# Evaluate the model
evaluation_results = trainer.evaluate()
print(evaluation_results)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/results/checkpoint-130000


  train_dataset = torch.load('/content/drive/My Drive/train_dataset.pt')
  val_dataset = torch.load('/content/drive/My Drive/val_dataset.pt')
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  checkpoint_rng_state = torch.load(rng_file)


Epoch,Training Loss,Validation Loss
3,1.7412,1.816134


{'eval_loss': 1.816133975982666, 'eval_runtime': 414.2991, 'eval_samples_per_second': 131.596, 'eval_steps_per_second': 16.449, 'epoch': 3.0}


Here start tasks pertaining to the second model

In [6]:
#to train second model - data preparation prerequisite
#data split into test-train and tokenizer initialized
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
# Example input_texts and output_texts (replace these with your actual data)
input_texts = important_words
output_texts = stories

# Train-test split
input_train, input_val, output_train, output_val = train_test_split(input_texts, output_texts, test_size=0.2)
print(type(output_train))

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

<class 'polars.series.series.Series'>


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



In [5]:
#data preparation - first time 2nd model part 1
import torch

from google.colab import drive
drive.mount('/content/drive')

output_dir = '/content/drive/My Drive/results2'
# Function to tokenize texts

def tokenize_texts(tokenizer, texts, max_length=1024):
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length)
data = output_train.to_list()
split_index = len(data)//2
dataset_part1 = data[:split_index]
dataset_part2 = data[split_index:]

# Tokenize the train and validation texts
train_output_encodings = tokenize_texts(tokenizer, dataset_part1)
torch.save(train_output_encodings,'/content/drive/My Drive/train_output_encodings1.pt')


Mounted at /content/drive


In [5]:
#data preparation - first time 2nd model part 1
import torch

from google.colab import drive
drive.mount('/content/drive')

output_dir = '/content/drive/My Drive/results2'
# Function to tokenize texts

def tokenize_texts(tokenizer, texts, max_length=1024):
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length)
data = output_train.to_list()
split_index = len(data)//2
dataset_part1 = data[:split_index]
dataset_part2 = data[split_index:]

# Tokenize the train and validation texts
train_output_encodings = tokenize_texts(tokenizer, dataset_part2)
torch.save(train_output_encodings,'/content/drive/My Drive/train_output_encodings2.pt')

Mounted at /content/drive


In [5]:
#data preparation - first time 2nd model part 1
import torch

from google.colab import drive
drive.mount('/content/drive')

output_dir = '/content/drive/My Drive/results2'
# Function to tokenize texts

def tokenize_texts(tokenizer, texts, max_length=1024):
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length)

data = input_train
split_index = len(data)//2
dataset_part1 = data[:split_index]
dataset_part2 = data[split_index:]

train_input_encodings = tokenize_texts(tokenizer, dataset_part1)
torch.save(train_input_encodings,'/content/drive/My Drive/train_input_encodings1.pt')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
#data preparation - first time 2nd model part 1
import torch

from google.colab import drive
drive.mount('/content/drive')

output_dir = '/content/drive/My Drive/results2'
# Function to tokenize texts

def tokenize_texts(tokenizer, texts, max_length=1024):
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length)

data = input_train
split_index = len(data)//2
dataset_part1 = data[:split_index]
dataset_part2 = data[split_index:]

train_input_encodings = tokenize_texts(tokenizer, dataset_part2)
torch.save(train_input_encodings,'/content/drive/My Drive/train_input_encodings2.pt')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
#data preparation - first time 2nd model part 1
from google.colab import drive
import torch
# Mount Google Drive to access saved files
drive.mount('/content/drive')
train_output_encodings = torch.load('/content/drive/MyDrive/train_output_encodings1.pt')
train_input_encodings = torch.load('/content/drive/MyDrive/train_input_encodings1.pt')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  train_output_encodings = torch.load('/content/drive/MyDrive/train_output_encodings1.pt')
  train_input_encodings = torch.load('/content/drive/MyDrive/train_input_encodings1.pt')


In [6]:
#data preparation - first time 2nd model part 1
# Prepare dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, input_encodings, output_encodings):
        self.input_encodings = input_encodings
        self.output_encodings = output_encodings

    def __len__(self):
        return len(self.input_encodings['input_ids'])

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.input_encodings['input_ids'][idx])
        attention_mask = torch.tensor(self.input_encodings['attention_mask'][idx])
        labels = torch.tensor(self.output_encodings['input_ids'][idx])
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

# Create train and validation datasets
train_dataset2 = CustomDataset(train_input_encodings, train_output_encodings)

torch.save(train_dataset2, '/content/drive/My Drive/train_dataset21.pt')


In [7]:
#data preparation - first time 2nd model part 2
from google.colab import drive
import torch
# Mount Google Drive to access saved files
drive.mount('/content/drive')
train_output_encodings = torch.load('/content/drive/MyDrive/train_output_encodings2.pt')
train_input_encodings = torch.load('/content/drive/MyDrive/train_input_encodings2.pt')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  train_output_encodings = torch.load('/content/drive/MyDrive/train_output_encodings2.pt')
  train_input_encodings = torch.load('/content/drive/MyDrive/train_input_encodings2.pt')


In [9]:
#data preparation - first time 2nd model part 2
# Prepare dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, input_encodings, output_encodings):
        self.input_encodings = input_encodings
        self.output_encodings = output_encodings

    def __len__(self):
        return len(self.input_encodings['input_ids'])

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.input_encodings['input_ids'][idx])
        attention_mask = torch.tensor(self.input_encodings['attention_mask'][idx])
        labels = torch.tensor(self.output_encodings['input_ids'][idx])
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

# Create train and validation datasets
train_dataset2 = CustomDataset(train_input_encodings, train_output_encodings)

torch.save(train_dataset2, '/content/drive/My Drive/train_dataset22.pt')

In [7]:
#data preparation - first time 2nd model part 3
import torch

from google.colab import drive
drive.mount('/content/drive')

output_dir = '/content/drive/My Drive/results2'
# Function to tokenize texts

def tokenize_texts(tokenizer, texts, max_length=1024):
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length)

val_input_encodings = tokenize_texts(tokenizer, input_val)
val_output_encodings = tokenize_texts(tokenizer, output_val.to_list())

# Prepare dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, input_encodings, output_encodings):
        self.input_encodings = input_encodings
        self.output_encodings = output_encodings

    def __len__(self):
        return len(self.input_encodings['input_ids'])

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.input_encodings['input_ids'][idx])
        attention_mask = torch.tensor(self.input_encodings['attention_mask'][idx])
        labels = torch.tensor(self.output_encodings['input_ids'][idx])
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

val_dataset2 = CustomDataset(val_input_encodings, val_output_encodings)
torch.save(val_dataset2,'/content/drive/My Drive/val_dataset2.pt')

Mounted at /content/drive


In [3]:
# from drive - data preparation
#resume from checkpoint from drive
#load tokenized data from drive

from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from google.colab import drive
drive.mount('/content/drive')

output_dir = '/content/drive/My Drive/results2'

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Prepare dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, input_encodings, output_encodings):
        self.input_encodings = input_encodings
        self.output_encodings = output_encodings

    def __len__(self):
        return len(self.input_encodings['input_ids'])

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.input_encodings['input_ids'][idx])
        attention_mask = torch.tensor(self.input_encodings['attention_mask'][idx])
        labels = torch.tensor(self.output_encodings['input_ids'][idx])
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

train_dataset = torch.load('/content/drive/My Drive/train_dataset22.pt')
#If this doesn't work, ask ChatGPT how to load/create the CustomDataset again from this saved thing
val_dataset = torch.load('/content/drive/My Drive/val_dataset2.pt')

# Data collator for dynamic padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

  train_dataset = torch.load('/content/drive/My Drive/train_dataset22.pt')
  val_dataset = torch.load('/content/drive/My Drive/val_dataset2.pt')


In [5]:
#training
#second_model = GPT2LMHeadModel.from_pretrained('gpt2')
output_dir = '/content/drive/My Drive/results2'

#from drive
second_model = GPT2LMHeadModel.from_pretrained(output_dir + '/checkpoint-70000')

# Data collator for dynamic padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
)

# Initialize the Trainer
trainer2 = Trainer(
    model=second_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Include the validation dataset
)

# Train the model
#trainer2.train()
#from drive
trainer2.train(resume_from_checkpoint=True)

# Evaluate the model
evaluation_results = trainer2.evaluate()
print(evaluation_results)

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)


Epoch,Training Loss,Validation Loss


{'eval_loss': 2.1029202938079834, 'eval_runtime': 692.1307, 'eval_samples_per_second': 78.771, 'eval_steps_per_second': 9.846, 'epoch': 3.0}


In [50]:
def generate_important_words(prompt,model,tokenizer):
    input_text = prompt
    encoding = tokenizer.encode_plus(
    input_text,
    return_tensors="pt",
    padding=True,
    truncation=True
    )
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # Generate text
    output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=200,  # Make sure it's longer than the input
    num_return_sequences=3,  # Generate 3 different sequences
    do_sample=True,  # Enable sampling to introduce variability
    top_k=50,  # Top-k sampling
    top_p=0.95,  # Top-p sampling
    temperature=0.8,  # Adjust temperature to control randomness
    repetition_penalty=1.2,  # Penalize repeated tokens
    no_repeat_ngram_size=3,  # Prevent repeating n-grams of size 3
    pad_token_id=tokenizer.eos_token_id  # Set pad_token_id to eos_token_id
    )
    generated_texts = []
    for seq in output:
      text = tokenizer.decode(seq, skip_special_tokens=True)
      # Remove unwanted patterns (e.g., //wp)
      cleaned_text = re.sub(r'//.*?wp|//.*?\n', '', text)  # Remove patterns like //wp, or //newlines
      cleaned_text = re.sub(r'[^\w\s,.!?;:]', '', cleaned_text)  # Remove any non-alphanumeric characters except common punctuation

      # Ensure to remove the exact input prompt from the generated output
      prompt_length = len(tokenizer.decode(input_ids[0], skip_special_tokens=True))
      cleaned_text = cleaned_text[prompt_length:].strip()  # Strip the prompt part from the beginning
      generated_texts.append(cleaned_text)
    return generated_texts


In [39]:
def generate_full_story(important_words,model,tokenizer):
    input_text = important_words
    encoding = tokenizer.encode_plus(
    input_text,
    return_tensors="pt",
    padding=True,
    truncation=True
    )
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # Generate text
    output = model.generate(
    input_ids,
    attention_mask=attention_mask,  # Provide the attention mask
    max_length=700,
    num_return_sequences=3,
    do_sample=True,  # Enable sampling to introduce variability
    top_k=50,  # Top-k sampling
    top_p=0.95,  # Top-p sampling
    temperature=0.9,  # Adjust temperature to control randomness
    repetition_penalty=5.3  # Penalize repeated tokens
    )
    full_story = tokenizer.decode(output[0], skip_special_tokens=True)
    return full_story


In [54]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from google.colab import drive
import torch
import re

drive.mount('/content/drive')

output_dir = '/content/drive/My Drive/results'

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

output_dir2 = '/content/drive/My Drive/results2'

#from drive
first_model = GPT2LMHeadModel.from_pretrained(output_dir + '/checkpoint-163560')
second_model = GPT2LMHeadModel.from_pretrained("gpt2")

imp = generate_important_words("In the heart of the ancient kingdom of Eldoria, a young mage named Lyra discovered a forgotten prophecy hidden within the pages of an old, dusty tome. It spoke of a celestial alignment that would awaken the Dragon of Eternity, a creature of immense power, capable of reshaping the world. As the stars began to align, Lyra embarked on a perilous journey to find the mythical Dragon’s Lair, deep within the Forbidden Mountains. Along the way, she encountered a rogue knight with a mysterious past, an enchanted forest full of secrets, and a shadowy figure determined to stop her at all costs.",first_model,tokenizer)
print(imp)
stor = generate_full_story(' '.join(imp), second_model,tokenizer)
print(stor)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['WP \n OT \n EU?\n IP\n\n FF  A dragon, written by Anonymous!\n CW\n TT  XforTales, by Anonymous  XNovas\n RF\n PI\n wyr ; XPosters here.\n CC : All hail for submissions! Thanks to all!!\n corsairnging', 'WP   EU .\n OT : How long is it?  Xpost from rfifthwall\n FF1  Youve been invited to write a story based around this!? Prompt : A dark, original prompt for ages 1 through 20\n CW : What happens on ragonist  s journey\n IP address\n What happens when you finish', 'WP \n CW \n EU! Write about a story where the first dragon has appeared\n OT?\n\n\n A story about how it was discovered!\n FF!\n  Now in the wild west, in the style...\n IP!. No, wait!\n TT!\n You are a fantasy reader!\n\n Ive just turned 16']
WP 
 OT 
 EU?
 IP

 FF  A dragon, written by Anonymous!
 CW
 TT  XforTales, by Anonymous  XNovas
 RF
 PI
 wyr ; XPosters here.
 CC : All hail for submissions! Thanks to all!!
 corsairnging WP   EU.
 OT : How long is it?  Xpost from rfifthwall
 FF1  Youve been invited to write a story b