# TEXT GENERATION USING GPT-2

In [1]:
# Install the transformers package if you haven't already
# !pip install transformersimport torch

In [2]:
import torch
import pandas as pd
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel, GPT2Tokenizer

from sklearn.model_selection import train_test_split

os.environ["WANDB_DISABLED"] = "true"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "gpt2-xl"

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)

## FINE-TUNE GPT-2 TO SPEAK IN SOMEONE'S STYLE!

In [None]:
if not os.path.exists("models"):
    !mkdir models

In [4]:
df = pd.read_csv(os.path.join("datasets", "final_df_raw_sentences.csv"))
print(f"Choose one of these speakers: {df['speaker'].unique()}")

Choose one of these speakers: ['ANDREW' 'ANDREWHUBERMAN' 'BENSHAPIRO' 'EDWARDSNOWDEN' 'GEORGEHOTZ'
 'GUIDOVANROSSUM' 'HANCOCK' 'JOEROGAN' 'JOEYDIAZ' 'KANYEWEST' 'KEVINHART'
 'LEXFRIDMAN' 'MARKZUCKERBERG' 'MARQUES' 'MATTHEWMCCOUNAGHEY' 'MIKETYSON'
 'MILEYCYRUS' 'MRBEAST' 'NEILDEGRASSE' 'POSTMALONE']


In [5]:
speaker = 'LEXFRIDMAN'

model_name = 'gpt2_' + speaker
model_dir = os.path.join("./models", model_name)

train_path = os.path.join(model_dir, 'train_data.txt')
test_path = os.path.join(model_dir, 'test_data.txt')

!mkdir {model_dir}

mkdir: cannot create directory ‘./models/gpt2_LEXFRIDMAN’: File exists


In [6]:
speaker_series = df[df['speaker'] == speaker]['sentences']

# Split the data into training and testing sets (80% train, 20% test)
train, test = train_test_split(speaker_series, test_size=0.2, random_state=11)

In [7]:
speaker_series = df[df['speaker'] == speaker]['sentences']

# Split the data into training and testing sets (80% train, 20% test)
train, test = train_test_split(speaker_series, test_size=0.2, random_state=11)

# Function to write sentences to a file, one per line
def write_to_file(data, filename):
    with open(filename, 'w', encoding='utf-8') as file:
        for sentence in data:
            file.write(sentence.strip() + '\n')

# Write the training and testing data to files
write_to_file(train, train_path)
write_to_file(test, test_path)

In [8]:
# Tokenize the text
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_path,
    block_size=128)

test_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=test_path,
    block_size=128)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False)

# Training settings
training_args = TrainingArguments(
    output_dir = model_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_steps=400,
    save_steps=800,
    warmup_steps=500,
    prediction_loss_only=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Start training
trainer.train()

# Save the fine-tuned model
model.save_pretrained(model_dir)


Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


***** Running training *****
  Num examples = 428
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 321
100%|██████████| 321/321 [00:45<00:00,  7.31it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 321/321 [00:45<00:00,  7.09it/s]
Configuration saved in ./models/gpt2_LEXFRIDMAN/config.json


{'train_runtime': 45.2523, 'train_samples_per_second': 28.374, 'train_steps_per_second': 7.094, 'train_loss': 3.6998165522780373, 'epoch': 3.0}


Model weights saved in ./models/gpt2_LEXFRIDMAN/pytorch_model.bin


## TEXT GENERATION AND MODEL TESTS

In [9]:
# Function to generate text
def generate_text(prompt, length=200):
    # Encode the prompt text
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    
    # Generate text
    output = model.generate(input_ids, max_length=length, num_return_sequences=1, no_repeat_ngram_size=2)
    
    # Decode and return the generated text
    return tokenizer.decode(output[0], skip_special_tokens=True)

Kanye West

In [24]:
path = os.path.join("models", "gpt2_KANYEWEST")
model.from_pretrained(path)

loading configuration file models/gpt2_KANYEWEST/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "use_cache": true,
  "vocab_size": 50257
}

loading

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [25]:
print("--------------------------------------------------------------------------------------------")

prompt = "I am the most"
generated_text = generate_text(prompt, 100)
print(generated_text)

print("--------------------------------------------------------------------------------------------")

prompt = "I think that technology is"
generated_text = generate_text(prompt, 100)
print(generated_text)

print("--------------------------------------------------------------------------------------------")

prompt = "My mission would be"
generated_text = generate_text(prompt, 50)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


--------------------------------------------------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


I am the most beautiful thing in the world.
And I think that's the greatest thing ever. And I'm not going to lie to you. I mean, I don't know if you're a fan of the show, but I love the podcast. So I'll just say, you know, it's a great thing to be able to do. But I just think it is a very, uh, a really, really good thing. It's like, well, the thing is
--------------------------------------------------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


I think that technology is a very powerful thing.
So you're saying that you think it's a good thing to have a human being. But you don't know what that means. I mean, I think the human race is going to be a lot more efficient than the AI. So I'm not going into that.
--------------------------------------------------------------------------------------------
My mission would be to help people.
So, I think the thing that's interesting about the way that is the nature of the AI is that you can manipulate the human mind. So, you're going to have to be a lot of fun


Andrew Huberman

In [16]:
# path = os.path.join("models", "gpt2_ANDREWHUBERMAN")
# model.from_pretrained(path)

In [None]:
print("--------------------------------------------------------------------------------------------")

prompt = "I am the most"
generated_text = generate_text(prompt, 100)
print(generated_text)

print("--------------------------------------------------------------------------------------------")

prompt = "I think that technology is"
generated_text = generate_text(prompt, 100)
print(generated_text)

print("--------------------------------------------------------------------------------------------")

prompt = "My mission would be"
generated_text = generate_text(prompt, 50)
print(generated_text)

Lex Fridman

In [None]:
path = os.path.join("models", "gpt2_LEXFRIDMAN")
model.from_pretrained(path).to(device)

In [23]:
print("--------------------------------------------------------------------------------------------")

prompt = "I am the most"
generated_text = generate_text(prompt, 100)
print(generated_text)

print("--------------------------------------------------------------------------------------------")

prompt = "I think that technology is"
generated_text = generate_text(prompt, 100)
print(generated_text)

print("--------------------------------------------------------------------------------------------")

prompt = "My mission would be"
generated_text = generate_text(prompt, 50)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


--------------------------------------------------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


I am the most powerful man in the world.
I'm the one who is the greatest. I'm not a big fan of the media. But I think that's a good thing. And I don't think it's the only thing that can be done. It's not like, you know, I can't believe it. You know what I mean?
So I just want to ask you, what is your favorite thing? I have to say, it feels like a lot of
--------------------------------------------------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


I think that technology is a good thing.
And I think it's a very powerful tool. I mean, I'm not saying that's not a bad thing, but I don't think there's any doubt that it is. And I would love to see it. But I do think the AI is going to be a big part of the future of civilization. So I just think you're going nuts.
--------------------------------------------------------------------------------------------
My mission would be to make sure that you're not just a dick, but a fucking dick.
And I think that's what's going on in the world right now. I mean, I'm not saying that I don't think it's
