In [20]:
import pandas as pd
from datasets import Dataset
from transformers import GPT2Tokenizer

# Read the campaign-speech CSV and split its rows by speaker.
df = pd.read_csv('../data/us_2020_election_speeches.csv')
is_biden = df['speaker'] == 'Joe Biden'
is_trump = df['speaker'] == 'Donald Trump'
biden = df[is_biden]
trump = df[is_trump]

# Report how many speeches each speaker contributes.
print('Biden:', len(biden))
print('Trump:', len(trump))

# Plain Python lists of the raw speech text, one entry per speech.
biden_texts = biden['text'].tolist()
trump_texts = trump['text'].tolist()

# GPT-2 tokenizer loaded from the pretrained 'gpt2' checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def tokenize_and_segment(texts, tokenizer, max_length: int = 256, overlap: int = 5,
                         pad_token_id: int = 0) -> list:
    """Tokenize each text and cut it into fixed-length, overlapping windows.

    Every text is tokenized in full and then sliced into windows of
    ``max_length`` token ids. Consecutive windows of the same text share
    ``overlap`` tokens. The last window of a text is right-padded to
    ``max_length``; padded positions get attention-mask value 0.

    Args:
        texts: Iterable of strings to tokenize.
        tokenizer: Callable such that ``tokenizer(text)['input_ids']`` is a
            list of token ids for ``text``.
        max_length: Number of token ids in every returned window.
        overlap: Number of tokens repeated between consecutive windows of the
            same text. Must satisfy ``0 <= overlap < max_length``.
        pad_token_id: Id written into padded positions of ``input_ids``. The
            default of 0 keeps the historical behavior; note that 0 is a real
            token id in the GPT-2 vocabulary, so consumers must honor the
            attention mask (or pass a different id here).

    Returns:
        A list of ``{"input_ids": [...], "attention_mask": [...]}`` dicts, both
        lists of length ``max_length``, in text order then window order. Texts
        that tokenize to nothing contribute no windows.

    Raises:
        ValueError: If ``max_length`` is not positive or ``overlap`` is outside
            ``[0, max_length)``.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")
    if not 0 <= overlap < max_length:
        # A zero/negative stride would either crash range() or silently
        # produce no windows at all.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_length, "
            f"got overlap={overlap}, max_length={max_length}"
        )

    stride = max_length - overlap
    segments = []
    for text in texts:
        # The whole text is tokenized at once; the tokenizer may warn that the
        # sequence exceeds the model's maximum length, but only the sliced
        # windows below are returned.
        tokens = tokenizer(text)['input_ids']
        for start in range(0, len(tokens), stride):
            window = tokens[start:start + max_length]
            pad_count = max_length - len(window)
            segments.append({
                "input_ids": window + [pad_token_id] * pad_count,
                "attention_mask": [1] * len(window) + [0] * pad_count,
            })
            # This window already reaches the end of the text. Continuing
            # would emit a window holding only the overlap tokens that were
            # just covered, padded out to max_length.
            if start + max_length >= len(tokens):
                break
    return segments

# Cut every speech into token windows, one list of window records per speaker.
biden_segments, trump_segments = (
    tokenize_and_segment(speeches, tokenizer)
    for speeches in (biden_texts, trump_texts)
)

# Wrap each speaker's window records in a Hugging Face Dataset.
biden_dataset, trump_dataset = map(
    Dataset.from_list, (biden_segments, trump_segments)
)

Biden: 71
Trump: 53


Token indices sequence length is longer than the specified maximum sequence length for this model (6052 > 1024). Running this sequence through the model will result in indexing errors


President Trump: (00:30)
Thank you. What a nice group. Thank you very much. Beautiful, thank you.
Crowd: (00:50)
Four more years.
President Trump: (00:52)
Thank you very much, please. We are going to be talking to our great senior citizens, that’s what I’m here for today.
Speaker 1: (00:55)
[crosstalk 00:00:50].
President Trump: (00:56)
We love our senior citizens. And I’m honored to be here in Fort Myers to reaffirm my solemn pledge to America’s seniors, it’s so important to me, I happen to be a senior. I will protect you, I will defend you, and I will fight for you with every ounce of energy and conviction that I have. You devoted your life to this country and I am devoting my life to you. My administration is working every day to give our amazing senior citizens the care, support, and respect that you deserve, and you understand that, we’ve worked together for a long time. As president, I’m deeply aware that America

m deeply aware that America’s 54 million seniors have borne the he

[tensor([ 340, 5692]), tensor([447,  13]), tensor([ 247, 1406]), tensor([ 82, 674]), tensor([  257, 10452]), tensor([ 369, 7320]), tensor([46859, 32675]), tensor([  13, 7557]), tensor([1318, 6971]), tensor([447, 290]), tensor([ 247, 4896]), tensor([ 82, 287]), tensor([257, 257]), tensor([1256, 1402]), tensor([  356, 11554]), tensor([714, 286]), tensor([1561, 3124]), tensor([ 546, 6095]), tensor([287, 284]), tensor([2846, 5068]), tensor([ 286, 1096]), tensor([262, 257]), tensor([23514,   649]), tensor([ 326, 3037]), tensor([3284,  329]), tensor([ 318, 1672]), tensor([1804,   13]), tensor([ 290, 1320]), tensor([ 407, 5419]), tensor([1804,  262]), tensor([  340, 11554]), tensor([ 13, 651]), tensor([ 887, 2067]), tensor([618,  13]), tensor([ 345, 3244]), tensor([ 466, 2839]), tensor([  11, 7713]), tensor([477,  11]), tensor([340, 356]), tensor([857, 760]), tensor([318,  11]), tensor([16637,  4003]), tensor([644, 262]), tensor([ 661, 6991]), tensor([892, 286]), tensor([743, 326]), tensor([ 

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'list'

In [None]:
from glance import Corpus

# Open one Corpus per speaker from its directory.
# NOTE(review): these paths are relative to the working directory and lack the
# '../' prefix used when reading the speeches CSV — confirm they resolve as intended.
biden_activations, trump_activations = (
    Corpus('data/speeches-biden'),
    Corpus('data/speeches-trump'),
)