# TEXT GENERATION USING GPT-2

In [None]:
# Install the transformers package if you haven't already
# !pip install transformers

In [None]:
import torch
import pandas as pd
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel, GPT2Tokenizer

from sklearn.model_selection import train_test_split

# Set the WANDB_DISABLED switch before any Trainer is created further down;
# presumably this keeps the Trainer from reporting to Weights & Biases.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Base checkpoint to start from.
model_name = "gpt2-xl"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the tokenizer and the language-model weights for that checkpoint,
# then move the model onto the selected device.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model = model.to(device)

## FINE-TUNE GPT-2 TO SPEAK IN SOMEONE'S STYLE!

In [None]:
# Make sure the top-level output folder exists. os.makedirs is used instead
# of the `!mkdir` shell magic so the cell is plain Python and behaves the same
# on every platform; exist_ok=True makes re-running the cell a no-op.
os.makedirs("models", exist_ok=True)

In [None]:
# Load the sentence-level dataset and list the speakers it contains.
csv_path = os.path.join("datasets", "final_df_raw_sentences.csv")
df = pd.read_csv(csv_path)
available_speakers = df['speaker'].unique()
print(f"Choose one of these speakers: {available_speakers}")

In [None]:
# Speaker whose sentences the model will be fine-tuned on.
speaker = 'LEXFRIDMAN'

# Everything produced for this speaker lives under ./models/gpt2_<SPEAKER>.
# NOTE: this rebinds model_name, which the model-loading cell set to the
# name of the base checkpoint.
model_name = 'gpt2_' + speaker
model_dir = os.path.join("./models", model_name)

# Plain-text files holding the train / test sentences, one per line.
train_path = os.path.join(model_dir, 'train_data.txt')
test_path = os.path.join(model_dir, 'test_data.txt')

# Create the folder (and any missing parent) from Python rather than with
# `!mkdir`: exist_ok=True means re-running the cell does not fail when the
# folder is already there, and no shell is involved.
os.makedirs(model_dir, exist_ok=True)

In [None]:
# Keep only the sentences spoken by the selected speaker.
is_speaker = df['speaker'] == speaker
speaker_series = df.loc[is_speaker, 'sentences']

# Hold out 20% of the sentences for testing; the fixed seed keeps the split
# reproducible. (The following cell starts by repeating this same selection
# and split.)
train, test = train_test_split(speaker_series, random_state=11, test_size=0.2)

In [None]:
# Sentences belonging to the selected speaker
speaker_rows = df[df['speaker'] == speaker]
speaker_series = speaker_rows['sentences']

# 80/20 train/test partition; random_state pins the shuffle so reruns agree
train, test = train_test_split(
    speaker_series,
    test_size=0.2,
    random_state=11,
)

def write_to_file(data, filename):
    """Write every sentence in *data* to *filename* as UTF-8, one per line.

    Surrounding whitespace is stripped from each sentence before it is
    written, and an existing file at *filename* is overwritten.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(text.strip() + '\n' for text in data)

# Dump both splits to their text files so the datasets can be built from disk
for split, destination in ((train, train_path), (test, test_path)):
    write_to_file(split, destination)

In [None]:
# Build the train and test datasets from the two text files, sharing one
# tokenizer and one block size between them.
dataset_options = {"tokenizer": tokenizer, "block_size": 128}
train_dataset = TextDataset(file_path=train_path, **dataset_options)
test_dataset = TextDataset(file_path=test_path, **dataset_options)

# mlm=False: batches are collated for causal (next-token) language modelling
# rather than masked-token prediction.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Training settings
#
# eval_steps only takes effect when the evaluation strategy is "steps"; with
# the library default ("no") the eval dataset handed to the Trainer is never
# used. The keyword was renamed from `evaluation_strategy` to `eval_strategy`
# in transformers 4.41, so pick whichever name the installed version defines.
strategy_kwarg = (
    "eval_strategy"
    if hasattr(TrainingArguments, "eval_strategy")
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=model_dir,            # checkpoints are written next to the data
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_steps=400,                  # evaluate every 400 optimizer steps
    save_steps=800,                  # checkpoint every 800 optimizer steps
    warmup_steps=500,
    prediction_loss_only=True,
    **{strategy_kwarg: "steps"},
)

# Wire the model, the training settings and both datasets into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

# Fine-tune, then write the resulting weights into the speaker's folder
trainer.train()
model.save_pretrained(model_dir)


## TEXT GENERATION AND MODEL TESTS

In [None]:
def generate_text(prompt, length=200):
    """Continue *prompt* with the currently loaded model and return the text.

    Args:
        prompt: Text the generation starts from.
        length: Upper bound, in tokens, on the generated sequence. It is
            passed to ``generate`` as ``max_length``, so the prompt's own
            tokens count towards it.

    Returns:
        The decoded sequence (prompt plus continuation) with special tokens
        removed.
    """
    # The model may still be in training mode (e.g. right after
    # trainer.train()); switch to eval mode so dropout is off while generating.
    model.eval()

    # Calling the tokenizer (instead of .encode) also yields the attention mask
    encoded = tokenizer(prompt, return_tensors='pt').to(device)

    output = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        # GPT-2 defines no pad token; name EOS explicitly rather than relying
        # on generate()'s implicit fallback.
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode the single returned sequence back into text
    return tokenizer.decode(output[0], skip_special_tokens=True)

Kanye West

In [None]:
path = os.path.join("models", "gpt2_KANYEWEST")
# from_pretrained is a classmethod that RETURNS a newly loaded model; it does
# not load weights into the object it is called on. Bind the result to `model`
# (and move it to the device) so generate_text actually uses this checkpoint.
model = GPT2LMHeadModel.from_pretrained(path).to(device)

In [None]:
separator = "--------------------------------------------------------------------------------------------"

# (prompt, max_length) pairs to run through the loaded model
for prompt, max_len in (
    ("I am the most", 100),
    ("I think that technology is", 100),
    ("My mission would be", 50),
):
    print(separator)
    generated_text = generate_text(prompt, max_len)
    print(generated_text)

Andrew Huberman

In [None]:
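# path = os.path.join("models", "gpt2_ANDREWHUBERMAN")
# model = GPT2LMHeadModel.from_pretrained(path).to(device)
# NOTE: while this cell stays commented out, the prompts below run on whichever model is already loaded.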

In [None]:
separator = "--------------------------------------------------------------------------------------------"

# Each entry is the prompt text followed by the max_length to generate
samples = [
    ("I am the most", 100),
    ("I think that technology is", 100),
    ("My mission would be", 50),
]

for prompt, max_len in samples:
    print(separator)
    generated_text = generate_text(prompt, max_len)
    print(generated_text)

Lex Fridman

In [None]:
path = os.path.join("models", "gpt2_LEXFRIDMAN")
# from_pretrained returns a new model instead of modifying the one it is
# called on, so the result must be assigned; otherwise the freshly loaded
# checkpoint is thrown away and `model` stays whatever it was before.
model = GPT2LMHeadModel.from_pretrained(path).to(device)

In [None]:
separator = "--------------------------------------------------------------------------------------------"

# Prompt text mapped to the max_length used for its generation
prompt_lengths = {
    "I am the most": 100,
    "I think that technology is": 100,
    "My mission would be": 50,
}

for prompt, max_len in prompt_lengths.items():
    print(separator)
    generated_text = generate_text(prompt, max_len)
    print(generated_text)