# TEXT GENERATION USING GPT-2

In [None]:
# Install the transformers package if you haven't already
# !pip install transformers
import torch

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

In [None]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint identifier shared by the tokenizer and the model weights.
model_name = "gpt2-xl"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the causal-LM weights first, then move them onto the chosen device.
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

Get the text from a speaker and create train and test files.

In [None]:
# Load the prepared dataset. The path is written with a forward slash so it
# resolves on Windows, Linux and macOS alike (a backslash is only treated as
# a directory separator on Windows).
df = pd.read_csv("datasets/treshablantes_masde3palabras_df_preparado.csv")

# Show the distinct values of the 'category' column so one can be picked as
# the speaker to fine-tune on.
print(f"Choose one of these speakers: {df['category'].unique()}")

In [None]:
# Speaker whose sentences are used for fine-tuning; compared against the
# values of df['category'].
speaker = 'ANDREWHUBERMAN'

# Select the 'title' column of the rows that belong to this speaker.
speaker_series = df.loc[df['category'] == speaker, 'title']

# Hold out 20% of the sentences for testing; the fixed random_state keeps the
# split reproducible between runs.
train, test = train_test_split(
    speaker_series,
    test_size=0.2,
    random_state=11,
)

# Function to write sentences to a file, one per line
def write_to_file(data, filename):
    """Write each sentence in *data* to *filename*, one per line.

    Leading and trailing whitespace is stripped from every sentence before
    it is written, and each one is terminated with a single newline.

    Entries that are not strings (for example ``None``, or the float NaN
    that pandas uses for empty CSV cells) are skipped instead of raising
    ``AttributeError`` on ``.strip()``.

    Args:
        data: Iterable of sentences (e.g. a list or a pandas Series).
        filename: Path of the output file; it is created or overwritten
            and written as UTF-8.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        # writelines consumes the generator lazily, so no intermediate list
        # of lines is built in memory.
        file.writelines(
            sentence.strip() + '\n'
            for sentence in data
            if isinstance(sentence, str)
        )

# Dump each split to its own plain-text file.
write_to_file(data=train, filename='train_data.txt')
write_to_file(data=test, filename='test_data.txt')

In [None]:

# Locations of the plain-text splits (point these at your own files if needed).
train_path = 'train_data.txt'
test_path = 'test_data.txt'

# Build one TextDataset per split (training split first), both using the
# shared tokenizer and block_size=128.
train_dataset, test_dataset = (
    TextDataset(tokenizer=tokenizer, file_path=split_path, block_size=128)
    for split_path in (train_path, test_path)
)

# Collator that batches examples for language modeling with the masked-LM
# option switched off (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Options for the fine-tuning run.
# NOTE(review): eval_steps presumably only takes effect when step-based
# evaluation is enabled, and no evaluation strategy is set here — confirm
# against the TrainingArguments documentation.
training_args = TrainingArguments(
    # Output location
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    # Schedule
    num_train_epochs=3,
    warmup_steps=500,
    # Per-device batch sizes
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluation / checkpoint cadence, in steps
    eval_steps=400,
    save_steps=800,
    prediction_loss_only=True,
)

# Wire the model, training options, collator and both dataset splits into a
# Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Run the fine-tuning loop.
trainer.train()

# Save the fine-tuned model together with its tokenizer. Writing both to the
# same directory makes the folder self-contained, so it can later be reloaded
# with from_pretrained("./gpt2-finetuned") without going back to the original
# checkpoint for the tokenizer files.
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")
