In [1]:
import pandas as pd

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [2]:
# Read the sentiment-analysis CSV export into a DataFrame.
df_sentiment = pd.read_csv(
    filepath_or_buffer='/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/Final_DF/sentiment_analysis_all.csv',
)

In [3]:
# Work on an independent (deep) copy so the loaded frame stays untouched.
df = df_sentiment.copy(deep=True)

In [5]:
# Keep only the speakers that appear in 2000 or more rows of `df`.
speaker_counts = df['Name'].value_counts()
# Boolean mask over the counts, applied to the index of speaker names.
selected_speakers = speaker_counts.index[(speaker_counts >= 2000).to_numpy()]
# Rows whose speaker is among the selected names.
filtered_df = df.loc[df['Name'].isin(selected_speakers)]

In [7]:
# Display the shape of the selected-speaker index, i.e. how many speakers
# passed the 2000-row filter.
selected_speakers.shape

(722,)

In [8]:
# Build one long string per speaker by joining all of that speaker's
# `Text_Spoken` entries with single spaces.
speaker_texts = (
    filtered_df
    .groupby('Name')['Text_Spoken']
    .apply(lambda spoken_parts: ' '.join(spoken_parts))
)

In [9]:
# Load the pre-trained GPT-2 tokenizer and language model that will be fine-tuned.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# The dataset pads every example with padding='max_length', which needs a pad
# token on the tokenizer. Set it here, right where the tokenizer is created,
# so the dataset can be indexed as soon as it exists; the end-of-sequence
# token is reused for padding.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# Let us fine-tune GPT-2 on our speakers

In [11]:
# Display the per-speaker concatenated texts (a Series indexed by speaker name).
speaker_texts

Name
Agnieszka Brugger           Herr Präsident! Meine Damen und Herren! In kei...
Albert Rupprecht(Weiden)    Sehr geehrte Frau Präsidentin! Liebe Kolleginn...
Albrecht Glaser             Herr Präsident! Meine sehr verehrten Damen und...
Alexander Bonde             Herr Präsident! Verehrte Damen und Herren! Lie...
Alexander Dobrindt          Herr Präsident! Sehr geehrte Damen und Herren!...
                                                  ...                        
Wolfgang Wiehle             Sehr geehrter Herr Präsident! Kolleginnen und ...
Wolfgang Wieland            Frau Staatsekretärin, grüne Parlamentarier sin...
Wolfgang Zöller             Grüß Gott, Herr Präsident! Liebe Kolleginnen u...
Yvonne Magwas               Sehr geehrter Herr Präsident! Liebe Kolleginne...
Özcan Mutlu                 Sehr geehrter Herr Präsident! Liebe Kolleginne...
Name: Text_Spoken, Length: 722, dtype: object

In [12]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, get_linear_schedule_with_warmup

In [21]:
class SpeakerDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, lazily on access.

    Args:
        texts: Sequence of strings. May be a pandas Series (with any index,
            e.g. speaker names) or a plain list/tuple.
        tokenizer: Object exposing ``encode_plus``; it is called on every
            ``__getitem__``, so tokenizer settings changed after construction
            (such as the pad token) take effect on later accesses.
        max_length: Length every example is truncated/padded to.
    """

    def __init__(self, texts, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = texts

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at *position* ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask``.
        """
        # Samplers and random_split hand out positions, not labels. On a
        # pandas Series `texts[idx]` is a label lookup (it only falls back to
        # positions for non-integer indexes, which is deprecated, and raises
        # KeyError for integer labels that do not start at 0), so use `.iloc`
        # whenever the container offers it.
        if hasattr(self.texts, 'iloc'):
            text = self.texts.iloc[idx]
        else:
            text = self.texts[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        # flatten() collapses each returned tensor to one dimension so the
        # DataLoader can stack the examples into a batch.
        return {'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten()}

In [22]:
# Dataset preparation: wrap the per-speaker texts and hold out 10% for validation.
dataset = SpeakerDataset(speaker_texts, tokenizer, max_length=512)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split so that re-running this cell (or a fresh kernel) always
# produces the same train/validation partition.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size], generator=split_generator
)

In [24]:
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import get_linear_schedule_with_warmup

In [28]:
# Number of passes over the training data. It has to exist before the
# scheduler is built because the scheduler needs the total step count; without
# this assignment the cell raises NameError on a fresh kernel. Keep it in sync
# with the `epochs` value used by the training loop.
epochs = 4
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# Set up optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
# One scheduler step is taken per batch, so the schedule spans batches * epochs steps.
total_training_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_training_steps
)

In [29]:
# Function for training one epoch
def train_epoch(model, data_loader, optimizer, device, scheduler=None):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Callable module accepting ``(input_ids, attention_mask=..., labels=...)``
            and returning an object with a ``.loss`` tensor.
        data_loader: Sized iterable of dicts with ``input_ids`` and ``attention_mask`` tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Optional learning-rate scheduler stepped once per batch. When
            omitted, a notebook-level variable named ``scheduler`` is used if one
            exists; otherwise no scheduler is stepped.

    Returns:
        float: Sum of per-batch losses divided by ``len(data_loader)``.

    Raises:
        ZeroDivisionError: If ``data_loader`` has length 0.
    """
    if scheduler is None:
        # Backward compatibility with callers that rely on the global scheduler.
        scheduler = globals().get('scheduler')
    model = model.train()
    total_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = input_ids.clone()
        # Exclude padding from the loss: -100 is the ignore index of the
        # language-modelling loss, so padded positions no longer train the
        # model to emit the pad token.
        labels[attention_mask == 0] = -100
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
    return total_loss / len(data_loader)

In [30]:
# Function for evaluating the model
def evaluate_model(model, data_loader, device):
    """Compute the mean batch loss over ``data_loader`` without updating the model.

    Args:
        model: Callable module accepting ``(input_ids, attention_mask=..., labels=...)``
            and returning an object with a ``.loss`` tensor.
        data_loader: Sized iterable of dicts with ``input_ids`` and ``attention_mask`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        float: Sum of per-batch losses divided by ``len(data_loader)``.

    Raises:
        ZeroDivisionError: If ``data_loader`` has length 0.
    """
    model = model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = input_ids.clone()
            # Exclude padding from the loss (-100 is the ignore index of the
            # language-modelling loss) so the reported value reflects real
            # tokens only.
            labels[attention_mask == 0] = -100
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
    return total_loss / len(data_loader)

In [31]:
# Training configuration and device placement
epochs = 4  # number of full passes over the training set
# Prefer the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Move the model to the chosen device; as the last expression of the cell the
# returned module is displayed.
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [34]:
from torch.utils.data import DataLoader, random_split

# Split the existing `dataset` (an instance of the custom Dataset class)
# 90/10 into training and validation subsets.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split so that re-running this cell (or a fresh kernel) always
# produces the same train/validation partition.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Create DataLoaders for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)  # No need to shuffle the validation set

In [None]:
# Use the end-of-sequence token for padding; the dataset tokenizes on access,
# so this takes effect for every batch drawn below.
tokenizer.pad_token = tokenizer.eos_token

# Each epoch: one training pass, then one validation pass, then a report line.
for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_loss = evaluate_model(model, val_loader, device)
    print(f"Epoch {epoch + 1}, Train Loss: {train_loss}, Val Loss: {val_loss}")

# Save the model and the tokenizer into the same directory
save_dir = '/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/training_models_on_politicians'
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

# Using the model

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer.
# The training cell saved both with save_pretrained() into this one directory,
# so both are read back from exactly that directory (there is no `/model`
# sub-folder and no `.tokenizer` sibling path).
fine_tuned_dir = '/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/training_models_on_politicians'
model = GPT2LMHeadModel.from_pretrained(fine_tuned_dir)
tokenizer = GPT2Tokenizer.from_pretrained(fine_tuned_dir)

# Function to generate text
def generate_text(prompt, length=50):
    """Continue ``prompt`` with the notebook-level ``model`` and return the decoded text.

    Args:
        prompt: Text the generation starts from.
        length: Value handed to ``model.generate`` as ``max_length``.

    Returns:
        str: The first generated sequence, decoded with special tokens skipped.
    """
    model.eval()
    # Put the prompt on the same device as the model so generation also works
    # when the model has been moved to a GPU.
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
    # Every prompt token is real (no padding), so the mask is all ones.
    attention_mask = torch.ones_like(input_ids)
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=length,
        num_return_sequences=1,
        # State the padding id explicitly instead of relying on generate()'s
        # implicit fallback to the end-of-sequence token.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage: generate a continuation for a sample prompt and print it.
prompt = "Here is a sentence to start off"  # Replace with your own prompt
generated_text = generate_text(prompt=prompt)
print(generated_text)