In [1]:
import pandas as pd

In [21]:
truths = pd.read_csv('trump_truths_dataset.csv')
truths.rename(columns={'post_date': 'date', 'status_text': 'text'}, inplace=True)
truths['date'] = pd.to_datetime(truths['date'])
to_drop = ['video_urls/0', 'image_urls/0', 'video_urls/0/url',
       'video_urls/0/quality', 'video_urls/1/url', 'video_urls/1/quality',
       'video_urls/2/url', 'video_urls/2/quality', 'image_urls/1',
       'image_urls/2', 'image_urls/3', 'video_urls/3/url',
       'video_urls/3/quality', 'video_urls/4/url', 'video_urls/4/quality','profile_link', 'avatar_url', 'post_url',
       'account_name', 'account_handle', 'verified_badge', 'replies']
truths.drop(columns=to_drop, inplace=True)
truths.dropna(subset=['text'], inplace=True)
truths.head()

Unnamed: 0,date,text,shares,likes
0,2024-11-08 10:05:00,"There are fake, untrue, and probably illegal r...","1,41k","5,01k"
6,2024-11-05 22:00:00,California - 1 More Hour!Polls are open until ...,"5,38k","31,4k"
7,2024-11-05 20:58:00,Nevada - 1 More Hour!Polls are open until 7:00...,"4,18k","24,1k"
8,2024-11-05 20:58:00,Montana - 1 More Hour!Polls are open until 8:0...,"3,17k","18,3k"
9,2024-11-05 19:33:00,HI REPUBLICANS! IF YOU’RE IN LINE—STAY IN LINE…,"7,72k","37,5k"


In [25]:
tweets = pd.read_csv('trump_tweets.csv')
to_drop = ['id','isFlagged', 'device', 'isDeleted']
tweets.drop(columns=to_drop, inplace=True)
tweets.rename(columns={'favorites': 'likes', 'retweets': 'shares'}, inplace=True)
tweets['date'] = pd.to_datetime(tweets['date'])
#drop retweets
tweets = tweets[tweets['isRetweet'] == 'f']
tweets.drop(columns=['isRetweet'], inplace=True)
tweets.head()

Unnamed: 0,text,likes,shares,date
0,Republicans and Democrats have both created ou...,49,255,2011-08-02 18:07:48
1,I was thrilled to be back in the Great city of...,73748,17404,2020-03-03 01:34:50
3,The Unsolicited Mail In Ballot Scam is a major...,80527,23502,2020-09-12 20:10:58
6,Getting a little exercise this morning! https:...,285863,30209,2020-02-01 16:14:02
7,https://t.co/4qwCKQOiOw,130822,19127,2020-10-23 04:52:14


In [26]:
# combine the two datasets
combined = pd.concat([truths, tweets], ignore_index=True)
combined

Unnamed: 0,date,text,shares,likes
0,2024-11-08 10:05:00,"There are fake, untrue, and probably illegal r...","1,41k","5,01k"
1,2024-11-05 22:00:00,California - 1 More Hour!Polls are open until ...,"5,38k","31,4k"
2,2024-11-05 20:58:00,Nevada - 1 More Hour!Polls are open until 7:00...,"4,18k","24,1k"
3,2024-11-05 20:58:00,Montana - 1 More Hour!Polls are open until 8:0...,"3,17k","18,3k"
4,2024-11-05 19:33:00,HI REPUBLICANS! IF YOU’RE IN LINE—STAY IN LINE…,"7,72k","37,5k"
...,...,...,...,...
50929,2020-01-03 12:44:30,"Iran never won a war, but never lost a negotia...",57253,303007
50930,2020-01-01 01:03:15,Thank you to the @dcexaminer Washington Examin...,9213,35044
50931,2020-01-01 00:55:01,One of my greatest honors was to have gotten C...,12761,56731
50932,2020-10-22 21:04:21,Just signed an order to support the workers of...,36001,176289


In [27]:
import re
from itertools import chain
from tqdm.auto import tqdm
from transformers import pipeline
from datasets import Dataset  # HuggingFace dataset
import numpy as np # For potential use later, good to have

In [28]:
# Helper functions and entity definitions (similar to general_users.ipynb)

def generate_aliases(person):
    first, last = person["first"], person["last"]
    full = f"{first} {last}"
    aliases = [full, first, last]
    if "nicknames" in person:
        aliases.extend(person["nicknames"])
    if "handle" in person:
        aliases.extend([person["handle"], f"@{person['handle']}"])
    return aliases

# People and slogans - can be adjusted if needed for Trump's context, but using general ones for now
dem_people = [
    {"first": "Joe", "last": "Biden", "nicknames": ["JoeBiden"], "handle": "JoeBiden"},
    {"first": "Kamala", "last": "Harris", "nicknames": ["KamalaHarris"], "handle": "KamalaHarris"},
]
rep_people = [
    {"first": "Donald", "last": "Trump", "nicknames": ["Trump"], "handle": "realDonaldTrump"},
    # Add other relevant figures if analyzing broader Republican sentiment, otherwise Trump alone is fine
]

# Entities
dem_entities = list(chain.from_iterable(generate_aliases(p) for p in dem_people)) + [
    "democrat", "democrats", "Democrats", "democratic party", "dnc", "vote blue", "blue wave", "bidenomics"
]
rep_entities = list(chain.from_iterable(generate_aliases(p) for p in rep_people)) + [
    "republican", "Republican", "republicans", "gop", "rnc", "maga", "trump2024", "drain the swamp"
]

# Regex patterns
def compile_pattern(entities):
    sorted_ents = sorted(set(entities), key=len, reverse=True)
    pat = r'\b(' + '|'.join(re.escape(ent) for ent in sorted_ents) + r')\b'
    return re.compile(pat, flags=re.IGNORECASE)

dem_pattern = compile_pattern(dem_entities)
rep_pattern = compile_pattern(rep_entities)

def truncate_sent(text, max_len=512):
    if not isinstance(text, str):
        return ""  # or return a placeholder like "[no text]"
    return text if len(text) <= max_len else text[:max_len]

tqdm.pandas()

In [30]:
# Ensure 'text' column is string type and handle NaNs
combined['text'] = combined['text'].astype(str).fillna('')

# Create HuggingFace Dataset
ds_combined = Dataset.from_pandas(combined[['text']])

# Initialize sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=0)  # if GPU available
# Truncate texts
truncated_texts_combined = [truncate_sent(t) for t in ds_combined['text']]

# Run sentiment analysis in batch
sentiments_combined = sentiment_analyzer(truncated_texts_combined, batch_size=32) # Adjust batch_size based on your VRAM

# Add sentiment results to the Dataset
ds_combined = ds_combined.add_column("Sentiment", sentiments_combined)

Device set to use mps:0


In [31]:
# Convert Dataset back to pandas DataFrame
df_s_combined = ds_combined.to_pandas()

# Add Score and Label to the original combined DataFrame
combined['Score'] = df_s_combined['Sentiment'].apply(lambda x: x['score'])
combined['Label'] = df_s_combined['Sentiment'].apply(lambda x: x['label'])

combined.head()

Unnamed: 0,date,text,shares,likes,Score,Label
0,2024-11-08 10:05:00,"There are fake, untrue, and probably illegal r...","1,41k","5,01k",0.99614,POSITIVE
1,2024-11-05 22:00:00,California - 1 More Hour!Polls are open until ...,"5,38k","31,4k",0.9965,NEGATIVE
2,2024-11-05 20:58:00,Nevada - 1 More Hour!Polls are open until 7:00...,"4,18k","24,1k",0.996823,NEGATIVE
3,2024-11-05 20:58:00,Montana - 1 More Hour!Polls are open until 8:0...,"3,17k","18,3k",0.996348,NEGATIVE
4,2024-11-05 19:33:00,HI REPUBLICANS! IF YOU’RE IN LINE—STAY IN LINE…,"7,72k","37,5k",0.99402,POSITIVE


In [33]:
# Check the distribution of sentiments
print(combined['Label'].value_counts())

# Display some examples
print("\nPositive examples:")
print(combined[combined['Label'] == 'POSITIVE'][['text', 'Score']].tail())

print("\nNegative examples:")
print(combined[combined['Label'] == 'NEGATIVE'][['text', 'Score']].tail())

Label
POSITIVE    25948
NEGATIVE    24986
Name: count, dtype: int64

Positive examples:
                                                    text     Score
50918  MEXICO IS PAYING FOR THE WALL through the many...  0.997372
50920  Iran is talking very boldly about targeting ce...  0.995202
50925  95% Approval Rating in the Republican Party. T...  0.999859
50929  Iran never won a war, but never lost a negotia...  0.997669
50930  Thank you to the @dcexaminer Washington Examin...  0.995695

Negative examples:
                                                    text     Score
50927  ....followed, and then it was withdrawn. The D...  0.999376
50928  ....different places that he thought were wast...  0.991819
50931  One of my greatest honors was to have gotten C...  0.987256
50932  Just signed an order to support the workers of...  0.995773
50933  Suburban women want Safety &amp; Security. Joe...  0.993172
