In [1]:
import regex as re
import pandas as pd 

In [2]:
def read_data(path):
    """
    Load a csv file into a Pandas DataFrame.

    Only the newline character is used as the row terminator while parsing.

    :param path: location of the source csv file
    :return: DataFrame holding the parsed contents of the file
    """
    return pd.read_csv(filepath_or_buffer=path, lineterminator="\n")

In [3]:
def clean_tweet(tweet, link=True, mention=True, hashtag=True):
    """
    Clean a tweet String based on given parameters.

    The input is first converted with ``str()``, every selected kind of token
    is deleted, and leading/trailing whitespace is stripped. Whitespace that
    surrounded a token removed from the middle of the text is left untouched.

    :param tweet: tweet String to be cleaned
    :param link: whether to remove links (runs of non-whitespace starting at 'http')
    :param mention: whether to remove mentions (@user_id)
    :param hashtag: whether to remove hashtags (#topic)
    :return: cleaned tweet String
    """
    # Raw strings are required here: '\S' inside a normal string literal is an
    # invalid escape sequence and triggers a warning on modern Python.
    patterns = []
    if link:
        patterns.append(r'http\S+')
    if mention:
        patterns.append(r'@\S+')
    if hashtag:
        patterns.append(r'#\S+')

    tweet = str(tweet)
    # With every flag switched off there is nothing to remove, so skip the regex.
    if patterns:
        tweet = re.sub("|".join(patterns), '', tweet)
    return tweet.strip()

In [4]:
def filter_tweets(df, min_tweet_count=20, min_word_count=0):
    """
    Filter out tweets with fewer than 'min_word_count' words
    and users with fewer than 'min_tweet_count' distinct tweets.

    Rows with a missing 'user_id' or a missing 'tweet' are discarded before
    any counting, so an absent tweet never shows up as the literal text "nan".
    The input DataFrame is not modified.

    :param df: DataFrame containing tweet data with columns 'tweet' and 'user_id'
    :param min_tweet_count: Minimum number of distinct (cleaned) tweets a user should have
    :param min_word_count: Minimum whitespace-separated word count of a cleaned tweet
    :return: DataFrame with columns 'user_id' and 'cleaned_tweets' for the remaining rows
    """
    # Work on an explicit copy: adding a column to a selection of `df` would
    # otherwise be a chained assignment (SettingWithCopyWarning).
    out = df[["tweet", "user_id"]].dropna(subset=["user_id", "tweet"]).copy()

    # Strip links, mentions and hashtags (clean_tweet defaults)
    out["cleaned_tweets"] = out["tweet"].astype(str).apply(clean_tweet)

    # Drop duplicate tweets from the same user
    out = out.drop_duplicates(subset=["user_id", "cleaned_tweets"])

    # Remove tweets that have fewer than min_word_count words
    word_counts = out["cleaned_tweets"].str.split().str.len()
    out = out[word_counts >= min_word_count]

    # Remove users who have fewer than min_tweet_count remaining tweets
    tweets_per_user = out["user_id"].value_counts()
    valid_users = tweets_per_user[tweets_per_user >= min_tweet_count].index
    out = out[out["user_id"].isin(valid_users)]

    return out[["user_id", "cleaned_tweets"]]

In [5]:
# Locations of the raw per-candidate hashtag datasets
trump_path, biden_path = "data/hashtag_donaldtrump.csv", "data/hashtag_joebiden.csv"

# Load both raw frames (Trump first, then Biden)
trump, biden = read_data(trump_path), read_data(biden_path)

# Show the shape of each raw frame
print(trump.shape, biden.shape)

(970919, 21) (776886, 21)


In [6]:
# Run the default filtering on each raw frame (Trump first, then Biden)
trump_clean, biden_clean = filter_tweets(trump), filter_tweets(biden)

# Show the shape of each filtered frame
print(trump_clean.shape, biden_clean.shape)

(287548, 2) (147997, 2)


In [7]:
# Write each cleaned frame to the file named after its own candidate
# (the two output filenames were previously swapped).
trump_clean.to_csv('data/cleaned_tweets_trump.csv')
biden_clean.to_csv('data/cleaned_tweets_biden.csv')