## Program: continue processing tweet data

In [4]:
import glob
import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import transformers
from transformers import pipeline
import datasets
import torch
import os
import pickle
from collections import Counter
import re
import emoji
import json

In [2]:
# Paths of the three filtered tweet CSVs (the 16, 20 and 24 datasets).
dir_16 = "../fase2_procesadodatos/tweets16_filtered.csv"
dir_20 = "../fase2_procesadodatos/tweets20_filtered.csv"
dir_24 = "../fase2_procesadodatos/tweets24_filtered.csv"


def _read_filtered_tweets(csv_path):
    """Read one CSV, parse its 'Date' column as datetimes, print the shape and return the frame."""
    frame = pd.read_csv(csv_path)
    frame['Date'] = pd.to_datetime(frame['Date'])
    print(frame.shape)
    return frame


# Same load order as before, so the printed shapes appear as 16, 20, 24.
df_16_filtered = _read_filtered_tweets(dir_16)
df_20_filtered = _read_filtered_tweets(dir_20)
df_24_filtered = _read_filtered_tweets(dir_24)

(46915, 3)
(23716, 3)
(12333, 3)


In [3]:
def filter_candidates(df, contain = None, not_contain = None, regex = True):
    """Filter tweets by keywords that must and/or must not appear in them.

    Matching is case-insensitive and is done on the ``Tweet`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a text column named ``Tweet``. It is never modified.
    contain : list of str, str or None
        Keep only rows whose tweet matches AT LEAST ONE of these keywords.
        ``None`` or an empty list applies no inclusion constraint.
    not_contain : list of str, str or None
        Drop every row whose tweet matches ANY of these keywords.
        ``None`` or an empty list applies no exclusion constraint.
    regex : bool, default True
        If True (the original behaviour) each keyword is interpreted as a
        regular expression; pass False to match keywords literally
        (needed for keywords containing characters such as ``+`` or ``(``).

    Returns
    -------
    pandas.DataFrame
        The matching rows with a fresh 0..n-1 index. When no constraint is
        given, ``df`` itself is returned unchanged.

    Notes
    -----
    A missing tweet (NaN/None) is treated as matching no keyword: it is
    removed by ``contain`` and kept by ``not_contain``. Previously such a row
    made the function raise.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(contain, str):
        contain = [contain]
    if isinstance(not_contain, str):
        not_contain = [not_contain]

    def _matches_any(tweets, keywords):
        # OR together one boolean mask per keyword; na=False keeps the mask
        # strictly boolean even when some tweets are missing.
        mask = pd.Series(False, index=tweets.index)
        for keyword in keywords:
            mask |= tweets.str.contains(keyword, case=False, regex=regex, na=False)
        return mask

    result = df

    if contain:
        # OR: we want at least one of the keywords
        result = result[_matches_any(result["Tweet"], contain)]

    if not_contain:
        # AND: we want none of the keywords
        result = result[~_matches_any(result["Tweet"], not_contain)]

    if result is not df:
        # reset_index returns a new frame, so the caller never gets a view of df.
        result = result.reset_index(drop=True)

    return result

In [4]:
# Keyword lists reused by the calls below: the account handle to require,
# and the handle plus name fragments to exclude.
trump_handle = ['@realDonaldTrump']
trump_terms = ['@realDonaldTrump', "Donald", "Trump"]
clinton_terms = ['@HillaryClinton', "Hillary", "Clinton"]
biden_terms = ['@JoeBiden', "Joe", "Biden"]
harris_biden_terms = ['@KamalaHarris', '@JoeBiden', "Kamala", "Harris", "Joe", "Biden"]

# 16 dataset: tweets mentioning only one of the two handles
df_16_dem = filter_candidates(df_16_filtered, contain=['@HillaryClinton'], not_contain=trump_terms)
df_16_rep = filter_candidates(df_16_filtered, contain=trump_handle, not_contain=clinton_terms)

# 20 dataset
df_20_dem = filter_candidates(df_20_filtered, contain=['@JoeBiden'], not_contain=trump_terms)
df_20_rep = filter_candidates(df_20_filtered, contain=trump_handle, not_contain=biden_terms)

# 24 dataset: either @KamalaHarris or @JoeBiden counts for the "dem" subset
df_24_dem = filter_candidates(df_24_filtered, contain=['@KamalaHarris', '@JoeBiden'], not_contain=trump_terms)
df_24_rep = filter_candidates(df_24_filtered, contain=trump_handle, not_contain=harris_biden_terms)

In [5]:
# Keep only the users that appear in all three datasets

# Users of each dataset = users of its "dem" subset plus users of its "rep" subset
users_16 = set(df_16_dem["User"].unique()).union(df_16_rep["User"].unique())
users_20 = set(df_20_dem["User"].unique()).union(df_20_rep["User"].unique())
users_24 = set(df_24_dem["User"].unique()).union(df_24_rep["User"].unique())

# Users present in every one of the three sets
user_intersection = users_24.intersection(users_16, users_20)


def _only_shared_users(frame):
    """Return the rows of `frame` whose 'User' is in `user_intersection`, with a reset index."""
    is_shared = frame["User"].isin(user_intersection)
    return frame.loc[is_shared].reset_index(drop=True)


# Restrict every subset to those shared users
df_16_dem = _only_shared_users(df_16_dem)
df_16_rep = _only_shared_users(df_16_rep)
df_20_dem = _only_shared_users(df_20_dem)
df_20_rep = _only_shared_users(df_20_rep)
df_24_dem = _only_shared_users(df_24_dem)
df_24_rep = _only_shared_users(df_24_rep)

In [6]:
# (rows, columns) of each subset, in the order 16 dem/rep, 20 dem/rep, 24 dem/rep
tuple(
    frame.shape
    for frame in (df_16_dem, df_16_rep, df_20_dem, df_20_rep, df_24_dem, df_24_rep)
)

((15102, 3), (20987, 3), (4940, 3), (13166, 3), (5878, 3), (3537, 3))

In [None]:
# Save every subset to its own CSV file (without the index column).
outputs = [
    (df_16_dem, "tweets/tweets16_dem.csv"),
    (df_16_rep, "tweets/tweets16_rep.csv"),
    (df_20_dem, "tweets/tweets20_dem.csv"),
    (df_20_rep, "tweets/tweets20_rep.csv"),
    (df_24_dem, "tweets/tweets24_dem.csv"),
    (df_24_rep, "tweets/tweets24_rep.csv"),
]

for frame, csv_path in outputs:
    # to_csv does not create missing folders, so make sure the target
    # directory exists before writing (no-op when it is already there).
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    frame.to_csv(csv_path, index=False)