## Program: preprocess tweet data

In [None]:
import glob
import pandas as pd
from datetime import datetime
import datasets
import os
import pickle
import re
import fasttext
# Fasttext references:
#[1] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, Bag of Tricks for Efficient Text Classification
#[2] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, FastText.zip: Compressing text classification models
import csv

In [None]:
def keep_valid_tweets(df):
    """Return the rows of *df* whose ``Tweet`` value is a usable string.

    A row is kept when its ``Tweet`` is a ``str`` that is neither empty nor
    made up solely of whitespace. Missing values (NaN / None) and any other
    non-string entries are dropped. The returned frame is re-indexed from 0.
    """
    def is_usable(text):
        # Non-strings (including NaN/None) fail the first test, so the string
        # methods below are only ever called on real strings.
        return isinstance(text, str) and len(text) != 0 and not text.isspace()

    # One pass over the column; an explicit boolean Series keeps the mask
    # aligned with the frame's own index.
    usable = pd.Series(
        [is_usable(text) for text in df["Tweet"]], index=df.index, dtype=bool
    )
    return df[usable].reset_index(drop=True)

In [None]:
# Loaded fastText models keyed by path, so that calling keep_filtered_tweets
# several times does not reload the language-ID model from disk each time.
_LANG_MODEL_CACHE = {}


def _load_lang_model(model_path):
    """Return the fastText model stored at *model_path*, loading it only once."""
    if model_path not in _LANG_MODEL_CACHE:
        _LANG_MODEL_CACHE[model_path] = fasttext.load_model(model_path)
    return _LANG_MODEL_CACHE[model_path]


def keep_filtered_tweets(df, politics = None, date_ini = None, date_end = None, lang = None):
    """Filter *df* by keyword, date window and language; every filter is optional.

    Parameters
    ----------
    df : DataFrame with a ``Tweet`` column and, when a date bound is given,
        a ``Date`` column comparable with that bound.
    politics : list of keywords (or a single keyword string). A row is kept
        when its tweet contains at least one keyword, ignoring case. Keywords
        are matched literally, not as regular expressions. ``None`` or an
        empty list disables the filter.
    date_ini : inclusive lower bound on ``Date``; ``None`` disables it.
    date_end : inclusive upper bound on ``Date``; ``None`` disables it.
    lang : ``None`` disables language filtering. A string is taken as the
        fastText language code to keep (e.g. ``"es"``); any other non-None
        value (e.g. ``True``) keeps English tweets.

    Returns
    -------
    The filtered DataFrame, re-indexed from 0 after each applied filter. When
    no filter is requested the input frame itself is returned.

    Notes
    -----
    The language step loads ``lid.176.bin`` from the current working directory
    and expects every remaining ``Tweet`` to be a string.
    """
    if politics is not None and len(politics) > 0:
        if isinstance(politics, str):
            # A bare string would otherwise be iterated character by character.
            politics = [politics]
        # One alternation of escaped keywords: a single pass over the column,
        # and characters such as "+" or "(" in a keyword are matched literally.
        pattern = "|".join(re.escape(keyword) for keyword in politics)
        # na=False: a missing tweet counts as "no match" instead of putting NaN
        # in the mask, which boolean indexing rejects.
        mentions = df["Tweet"].str.contains(pattern, case=False, regex=True, na=False)
        df = df[mentions].reset_index(drop=True)

    if date_ini is not None:
        df = df[df["Date"] >= date_ini].reset_index(drop=True)

    if date_end is not None:
        df = df[df["Date"] <= date_end].reset_index(drop=True)

    if lang is not None:
        lang_code = lang if isinstance(lang, str) else "en"
        wanted_label = "__label__" + lang_code

        # Downloaded model that is better for tweets (Facebook's model)
        model = _load_lang_model(r"lid.176.bin")

        # The model does not accept new lines. They are replaced by a space
        # (not deleted) so the words on either side are not glued together.
        # The "RT @user" retweet prefix is removed before prediction.
        retweet_prefix = re.compile(r"RT @\w+")
        cleaned = [retweet_prefix.sub("", text.replace("\n", " ")) for text in df["Tweet"]]

        predictions = model.predict(cleaned, k=1)[0] if cleaned else []
        # An index-aligned boolean Series also behaves correctly for an empty
        # frame, where a plain empty list would select zero columns instead.
        keep = pd.Series(
            [list(labels) == [wanted_label] for labels in predictions],
            index=df.index,
            dtype=bool,
        )
        dropped = len(keep) - int(keep.sum())
        if lang_code == "en":
            print("Not English tweets: ", dropped)
        else:
            print("Tweets not in language '" + lang_code + "': ", dropped)
        df = df[keep].reset_index(drop=True)

    return df

In [None]:
# Path of the combined users/tweets table loaded into df_24.
# Named dir_24 (like dir_16 / dir_20) so the builtin dir() is not shadowed.
dir_24 = "../fase1_descargadatos/combined_table_users_tweets.csv"
df_24 = pd.read_csv(dir_24)

# convert Date column to datetime (the full timestamp is kept, not just the day)
df_24['Date'] = pd.to_datetime(df_24['Date'])
print(df_24.shape)
df_24.head()

In [None]:
# Drop rows whose Tweet is missing, not a string, empty or whitespace-only.
df_24 = keep_valid_tweets(df_24)
df_24.shape

In [None]:
# Keep English tweets that mention one of the candidates, inside the 8-day
# window 2024-10-29 .. 2024-11-05 (we have dates from the 28th but we only want
# 8 days like the rest of the years).
# The upper bound is the last microsecond of Nov 5 rather than midnight: the
# Date column of df_24 is parsed with its full timestamp, so a midnight bound
# would drop almost every Nov 5 tweet if the raw values carry a time of day.
# With date-only values the result is the same as before.
df_24 = keep_filtered_tweets(
    df_24,
    politics = ["@realDonaldTrump", "@KamalaHarris", "@JoeBiden"],
    date_ini = pd.to_datetime("2024-10-29"),
    date_end = pd.to_datetime("2024-11-05 23:59:59.999999"),
    lang = True,
)
df_24.shape

In [None]:
# Locations of the 2016 and 2020 tweet tables.
dir_16 = "../fase1_descargadatos/usa1620/tweets16_common.csv"
dir_20 = "../fase1_descargadatos/usa1620/tweets20_common.csv"

# Read both tables as they are (2016 first, then 2020).
df_16_raw, df_20_raw = pd.read_csv(dir_16), pd.read_csv(dir_20)

# Report the raw sizes, one shape per line.
print(df_16_raw.shape, df_20_raw.shape, sep="\n")

In [None]:
# Raw column -> name used in the rest of the notebook. Only these columns are kept.
tweet_columns = {"screen_name": "User", "text": "Tweet", "created_at": "Date"}

# Each year goes through the same steps:
#   1. keep and rename the useful columns
#   2. parse Date from the first 10 characters of created_at (the day only)
#   3. keep valid tweets
#   4. keep tweets that mention a candidate, up to the given end date, in English
# Note kept from the original: the keyword filter had not been applied to all the
# data, e.g. for "2026" (presumably 2016) 2815508 tweets had no politics keyword.
df_16 = (
    df_16_raw[list(tweet_columns)]
    .rename(columns=tweet_columns)
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"].str[:10]))
    .pipe(keep_valid_tweets)
    .pipe(
        keep_filtered_tweets,
        politics=["@realDonaldTrump", "@HillaryClinton"],
        date_end=pd.to_datetime("2016-11-08"),  # we have data from the 1st
        lang=True,
    )
)

df_20 = (
    df_20_raw[list(tweet_columns)]
    .rename(columns=tweet_columns)
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"].str[:10]))
    .pipe(keep_valid_tweets)
    .pipe(
        keep_filtered_tweets,
        politics=["@realDonaldTrump", "@JoeBiden"],
        date_end=pd.to_datetime("2020-11-03"),  # we have data from the 27th
        lang=True,
    )
)

# Show the size and the first rows of each result.
print(df_16.shape, df_16.head(), sep="\n")

print(df_20.shape, df_20.head(), sep="\n")

In [None]:
# Keep only the users that appear in all three datasets.

# Distinct users of each dataframe
users_16 = set(df_16["User"].unique())
users_20 = set(df_20["User"].unique())
users_24 = set(df_24["User"].unique())

# Users common to the three sets
user_intersection = users_24 & users_16 & users_20

# Restrict every dataframe to those users and renumber its rows from 0
df_16_filtered = df_16.loc[df_16["User"].isin(user_intersection)].reset_index(drop=True)
df_20_filtered = df_20.loc[df_20["User"].isin(user_intersection)].reset_index(drop=True)
df_24_filtered = df_24.loc[df_24["User"].isin(user_intersection)].reset_index(drop=True)

# One shape per line: 16, 20, 24
print(df_16_filtered.shape, df_20_filtered.shape, df_24_filtered.shape, sep="\n")

In [None]:
# Write each filtered dataframe to its own CSV file, without the index column.
for csv_name, frame in (
    ("tweets16_filtered.csv", df_16_filtered),
    ("tweets20_filtered.csv", df_20_filtered),
    ("tweets24_filtered.csv", df_24_filtered),
):
    frame.to_csv(csv_name, index=False)

In [None]:
# For each dataframe, the 100 tweet texts that are repeated the most (with their
# counts); those are the ones to classify manually.
repeated_tweets = [
    frame["Tweet"].value_counts().head(100)
    for frame in (df_16_filtered, df_20_filtered, df_24_filtered)
]

In [None]:
###

In [None]:
# If we keep the most repeated tweets, we would have 300 tweets to classify
# manually, and they would represent a larger share of the total tweets.
# Let's see how many tweet rows those 300 texts cover: for each dataframe, add up
# the counts of its 100 most repeated tweets, then add the three totals.

sum(
    frame["Tweet"].value_counts().head(100).sum()
    for frame in (df_16_filtered, df_20_filtered, df_24_filtered)
)

In [None]:
# Number of rows of df_24_filtered covered by its 300 most repeated tweet texts.
df_24_filtered["Tweet"].value_counts().head(300).sum() 

In [None]:
# Export the most repeated tweets for manual classification: one CSV row per
# tweet text with the dataset it comes from and how often it appears.
# Writing each Series as a single cell would store its printed representation,
# which pandas abbreviates for long Series, so most tweets would be lost.
# repeated_tweets holds the top-100 counts in the order 16, 20, 24.
with open("repeated_tweets.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Year", "Tweet", "Count"])
    for year, counts in zip((2016, 2020, 2024), repeated_tweets):
        # counts maps tweet text (index) -> number of occurrences (value)
        writer.writerows((year, tweet, int(count)) for tweet, count in counts.items())