## Program: continue processing tweet data

In [4]:
import glob
import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import transformers
from transformers import pipeline
import datasets
import torch
import os
import pickle
from collections import Counter
import re
import emoji
import json

In [2]:
dir_16 = "../fase2_procesadodatos/tweets16_filtered.csv"
dir_20 = "../fase2_procesadodatos/tweets20_filtered.csv"
dir_24 = "../fase2_procesadodatos/tweets24_filtered.csv"

df_16_filtered = pd.read_csv(dir_16)
df_16_filtered['Date'] = pd.to_datetime(df_16_filtered['Date'])
print(df_16_filtered.shape)

df_20_filtered = pd.read_csv(dir_20)
df_20_filtered['Date'] = pd.to_datetime(df_20_filtered['Date'])
print(df_20_filtered.shape)

df_24_filtered = pd.read_csv(dir_24)
df_24_filtered['Date'] = pd.to_datetime(df_24_filtered['Date'])
print(df_24_filtered.shape)

(46915, 3)
(23716, 3)
(12333, 3)


In [3]:
def filter_candidates(df, contain = None, not_contain = None):
    if contain is not None:
        filter_contain = df.Tweet.str.contains(contain[0], case=False)
        for keyword in contain[1:]:
            filter_contain |= df.Tweet.str.contains(keyword, case=False) # OR (we want at least one of the keywords)
        df = df[filter_contain].copy()
        df.reset_index(drop=True, inplace=True)

    if not_contain is not None:
        filter_not_contain = ~df.Tweet.str.contains(not_contain[0], case=False)
        for keyword in not_contain[1:]:
            filter_not_contain &= ~df.Tweet.str.contains(keyword, case=False) # AND (we want none of the keywords)
        df = df[filter_not_contain].copy()
        df.reset_index(drop=True, inplace=True)

    return df

In [4]:
df_16_dem = filter_candidates(df_16_filtered, contain = ['@HillaryClinton'], not_contain = ['@realDonaldTrump',"Donald", "Trump"])
df_16_rep = filter_candidates(df_16_filtered, contain = ['@realDonaldTrump'], not_contain = ['@HillaryClinton',"Hillary","Clinton"])

df_20_dem = filter_candidates(df_20_filtered, contain = ['@JoeBiden'], not_contain = ['@realDonaldTrump',"Donald", "Trump"])
df_20_rep = filter_candidates(df_20_filtered, contain = ['@realDonaldTrump'], not_contain = ['@JoeBiden',"Joe","Biden"])

df_24_dem = filter_candidates(df_24_filtered, contain =['@KamalaHarris','@JoeBiden'], not_contain = ['@realDonaldTrump',"Donald", "Trump"])
df_24_rep = filter_candidates(df_24_filtered, contain =['@realDonaldTrump'], not_contain = ['@KamalaHarris','@JoeBiden',"Kamala","Harris","Joe","Biden"])

In [5]:
# Intersection of users

# Get unique users from each dataframe
users_16 = set(df_16_dem["User"].unique()) | set(df_16_rep["User"].unique())
users_20 = set(df_20_dem["User"].unique()) | set(df_20_rep["User"].unique())
users_24 = set(df_24_dem["User"].unique()) | set(df_24_rep["User"].unique())

# Calculate the intersection of users
user_intersection = users_24 & users_16 & users_20

# we filter the dataframes
df_16_dem = df_16_dem[df_16_dem["User"].isin(user_intersection)].reset_index(drop=True)
df_16_rep = df_16_rep[df_16_rep["User"].isin(user_intersection)].reset_index(drop=True)
df_20_dem = df_20_dem[df_20_dem["User"].isin(user_intersection)].reset_index(drop=True)
df_20_rep = df_20_rep[df_20_rep["User"].isin(user_intersection)].reset_index(drop=True)
df_24_dem = df_24_dem[df_24_dem["User"].isin(user_intersection)].reset_index(drop=True)
df_24_rep = df_24_rep[df_24_rep["User"].isin(user_intersection)].reset_index(drop=True)

In [6]:
df_16_dem.shape, df_16_rep.shape, df_20_dem.shape, df_20_rep.shape, df_24_dem.shape, df_24_rep.shape

((15102, 3), (20987, 3), (4940, 3), (13166, 3), (5878, 3), (3537, 3))

In [None]:
# we save them to csv 
df_16_dem.to_csv("tweets/tweets16_dem.csv", index=False)
df_16_rep.to_csv("tweets/tweets16_rep.csv", index=False)
df_20_dem.to_csv("tweets/tweets20_dem.csv", index=False)
df_20_rep.to_csv("tweets/tweets20_rep.csv", index=False)
df_24_dem.to_csv("tweets/tweets24_dem.csv", index=False)
df_24_rep.to_csv("tweets/tweets24_rep.csv", index=False)

### Descriptive statistics 
##### How has the format of tweets evolved over time?

- Number of emojis
- Number of hashtags
- Number of mentions
- Number of urls

In [7]:
# Function to count real emojis in a tweet
def count_real_emojis(text):
    return len([char for char in text if char in emoji.EMOJI_DATA])

# Function to count emoji descriptions (assuming words like "face with heart-shaped eyes" appear in text)
def count_text_emojis(text):
    # We manually look for the text codification that twitter uses for some of the most popular emojis
    # We add the rest by using a JSON file
    with open("emoji-dict.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    # Extract names
    emoji_names = [entry["name"] for entry in data.values()] + ["Call me hand","Leftwards hand","Rightwards hand","Leftwards pushing hand","Leg","Nail polish","Selfie","Writing hand","Index pointing at the viewer","Reversed hand with middle finger extended","Mechanical arm","Flexed biceps","Rightwards pushing hand","Mechanical leg","Foot","Mouth","Biting lip","Tooth","Tongue","Ear","Ear with hearing aid","Nose","Crown","Red heart","Orange heart","Yellow heart","Green heart","Blue heart","Purple heart","Brown heart","Black heart","White heart","Heart with ribbon","Heart with arrow","Sparkling heart","Growing heart","Beating heart","Revolving hearts","Two hearts","Heart exclamation","Broken heart","Heart on fire","Mending heart","Heart decoration","No entry","Squared sos","Hundred points symbol","Red question mark ornament","White question mark ornament","Musical note","Multiple musical notes","Red circle","Orange circle","Yellow circle","Green circle","Blue circle","Purple circle","Ballot box with check","Brown circle","Black circle","White circle","Red square","Orange square","Yellow square","Green square","Blue square","Purple square","Brown square","Black large square","White large square","Black medium square","Speaker with one sound wave","Speaker with three sound waves","Speaker with cancellation stroke","Cheering megaphone","Public address loudspeaker","Bell","Bell with cancellation stroke","American football","Flag of U.S. Virgin Islands","Earth globe americas","Flag of United States","Grinning face","Smiling face with open mouth","Smiling face with open mouth and smiling eyes","Grinning face with smiling eyes","Smiling face with open mouth and cold sweat","Face with tears of joy","Rolling on the floor laughing","Smiling face with halo","Smiling face with sunglasses","Money-mouth face","Smiling face with open hands","Grimacing face","Pleading face","Face with look of triumph","Pouting face","Face with symbols over mouth","Sleepy face","Crying face","Loudly crying face","Hot face"]

    # Extend the regex pattern
    emoji_words_pattern = r'\b(?:' + '|'.join(emoji_names) + r')\b'

    # Find matches
    matches = re.findall(emoji_words_pattern, text, re.IGNORECASE)

    return len(matches)

In [None]:
# Calculate mean emoji count per tweet for each dataset
df_16_filtered["emoji_count"] = df_16_filtered["Tweet"].astype(str).apply(count_real_emojis)
df_20_filtered["emoji_count"] = df_20_filtered["Tweet"].astype(str).apply(count_real_emojis)
df_24_filtered["emoji_count"] = df_24_filtered["Tweet"].astype(str).apply(count_real_emojis) # in the actual project here we had to use count_text_emojis but now the web scraping code is fixed to encode emojis as real emojis again

mean_emoji_16 = df_16_filtered["emoji_count"].mean()
mean_emoji_20 = df_20_filtered["emoji_count"].mean()
mean_emoji_24 = df_24_filtered["emoji_count"].mean()

# Print results
print(f"Mean number of emojis per tweet in 2016: {mean_emoji_16:.2f}")
print(f"Mean number of emojis per tweet in 2020: {mean_emoji_20:.2f}")
print(f"Mean number of emojis per tweet in 2024: {mean_emoji_24:.2f}")

Mean number of emojis per tweet in 2016: 0.13
Mean number of emojis per tweet in 2020: 0.35
Mean number of emojis per tweet in 2024: 0.28


In [9]:
# Function to count hashtags followed by a letter (it's how hashtags are defined in Twitter)
def count_hashtags(text):
    return len(re.findall(r'#\b[a-zA-Z]\w*', text))

# Apply to each DataFrame
df_16_filtered["hashtag_count"] = df_16_filtered["Tweet"].astype(str).apply(count_hashtags)
df_20_filtered["hashtag_count"] = df_20_filtered["Tweet"].astype(str).apply(count_hashtags)
df_24_filtered["hashtag_count"] = df_24_filtered["Tweet"].astype(str).apply(count_hashtags)

# Compute mean hashtag count per tweet
mean_hashtag_16 = df_16_filtered["hashtag_count"].mean()
mean_hashtag_20 = df_20_filtered["hashtag_count"].mean()
mean_hashtag_24 = df_24_filtered["hashtag_count"].mean()

# Print results
print(f"Mean number of hashtags per tweet in 2016: {mean_hashtag_16:.2f}")
print(f"Mean number of hashtags per tweet in 2020: {mean_hashtag_20:.2f}")
print(f"Mean number of hashtags per tweet in 2024: {mean_hashtag_24:.2f}")

Mean number of hashtags per tweet in 2016: 0.58
Mean number of hashtags per tweet in 2020: 0.59
Mean number of hashtags per tweet in 2024: 0.32


In [10]:
# Function to count mentions (excluding "RT @user:" because in 2016 retweets were saved like that and we don't want to include them in the analysis)
def count_mentions(text, exclude_rt=False):
    if exclude_rt:
        text = re.sub(r'^RT @\w+: ', '', text)  # Remove "RT @user: " at the start
    return len(re.findall(r'@\w+', text))  # Count mentions (@ followed by letters, numbers, _)

# Apply to each DataFrame
df_16_filtered["mention_count"] = df_16_filtered["Tweet"].astype(str).apply(lambda x: count_mentions(x, exclude_rt=True))
df_20_filtered["mention_count"] = df_20_filtered["Tweet"].astype(str).apply(lambda x: count_mentions(x, exclude_rt=False))
df_24_filtered["mention_count"] = df_24_filtered["Tweet"].astype(str).apply(lambda x: count_mentions(x, exclude_rt=False))

# Compute mean mention count per tweet
mean_mentions_16 = df_16_filtered["mention_count"].mean()
mean_mentions_20 = df_20_filtered["mention_count"].mean()
mean_mentions_24 = df_24_filtered["mention_count"].mean()

# Print results
print(f"Mean number of mentions per tweet in 2016: {mean_mentions_16:.2f}")
print(f"Mean number of mentions per tweet in 2020: {mean_mentions_20:.2f}")
print(f"Mean number of mentions per tweet in 2024: {mean_mentions_24:.2f}")

Mean number of mentions per tweet in 2016: 1.89
Mean number of mentions per tweet in 2020: 1.97
Mean number of mentions per tweet in 2024: 1.79


In [11]:
# Function to count URLs starting with "https://"
def count_urls(text):
    return len(re.findall(r'https://\S+', text, re.IGNORECASE))

# Apply to each DataFrame
df_16_filtered["url_count"] = df_16_filtered["Tweet"].astype(str).apply(count_urls)
df_20_filtered["url_count"] = df_20_filtered["Tweet"].astype(str).apply(count_urls)
df_24_filtered["url_count"] = df_24_filtered["Tweet"].astype(str).apply(count_urls)

# Compute mean URL count per tweet
mean_url_16 = df_16_filtered["url_count"].mean()
mean_url_20 = df_20_filtered["url_count"].mean()
mean_url_24 = df_24_filtered["url_count"].mean()

# Print results
print(f"Mean number of URLs per tweet in 2016: {mean_url_16:.2f}")
print(f"Mean number of URLs per tweet in 2020: {mean_url_20:.2f}")
print(f"Mean number of URLs per tweet in 2024: {mean_url_24:.2f}")

Mean number of URLs per tweet in 2016: 0.43
Mean number of URLs per tweet in 2020: 0.48
Mean number of URLs per tweet in 2024: 0.03
