# ETL process run prior to building the live user interface

### Library, modules and token imports

In [1]:
import pandas as pd
from apify_client import ApifyClient
from tokens import APIFY_TOKEN

### Constants

In [2]:
# Identifier of the Apify actor that every extraction below runs through `client.actor(...)`.
APIFY_ACTOR_ID = '61RPP7dywgiy0JPD0'

# Column order of the tweet dataframes; it mirrors the keys produced by `flatten_response`.
TWEETS_COLUMNS_LIST = [
    # Tweet identity
    "url", "createdAt", "id",
    # Tweet type flags
    "isReply", "inReplyToId", "isRetweet", "isQuote",
    # Counters
    "viewCount", "retweetCount", "likeCount", "replyCount",
    "lang",
    # Author fields (un-nested from the "author" object, hence the "author__" prefix)
    "author__createdAt", "author__location", "author__name", "author__id",
    "author__description", "author__followers", "author__verified",
    "text",
]

# Author columns left out of the comments dataframes.
REMOVE_COLUMNS_COMMENTS = ["author__name", "author__id", "author__description"]

# Columns cast to plain integers (missing values become 0) after the dataframe is built.
INT_COLUMNS = ["viewCount", "retweetCount", "likeCount", "replyCount", "author__followers"]

The columns were pre-selected by the Data Collection team to feed the exploration of the Sentiment Analysis model.

### Apify Client Instantiation

In [3]:
# Shared Apify API client, authenticated with the token imported from the local `tokens` module.
# The dataframe-building functions in this notebook read it as a global.
client = ApifyClient(APIFY_TOKEN)

### Functions that, starting from an original tweet by a political actor, extract its data with the Apify client and convert it to a pandas DataFrame

In [4]:
def flatten_response(response):
    """Return a flat dictionary with the nested ``author`` values un-nested.

    Parameters
    ----------
    response : dict
        One dataset item describing a tweet. Every field is read with ``.get``,
        so a key that is absent yields ``None`` instead of raising.

    Returns
    -------
    dict
        Tweet-level fields under their original names, author fields under an
        ``author__`` prefix, and ``text`` last. ``createdAt`` and
        ``author__createdAt`` are passed through ``pd.to_datetime``.
    """
    # A missing or null "author" entry must not abort the extraction:
    # fall back to an empty mapping so every author__* field becomes None.
    author = response.get("author") or {}

    return {
        "url": response.get("url"),
        "createdAt": pd.to_datetime(response.get("createdAt")),
        "id": response.get("id"),
        "isReply": response.get("isReply"),
        "inReplyToId": response.get("inReplyToId"),  # None when the key is absent
        "isRetweet": response.get("isRetweet"),
        "isQuote": response.get("isQuote"),
        "viewCount": response.get("viewCount"),
        "retweetCount": response.get("retweetCount"),
        "likeCount": response.get("likeCount"),
        "replyCount": response.get("replyCount"),
        "lang": response.get("lang"),
        "author__createdAt": pd.to_datetime(author.get("createdAt")),
        "author__location": author.get("location"),
        "author__name": author.get("name"),
        "author__id": author.get("id"),
        "author__description": author.get("description"),
        "author__followers": author.get("followers"),
        # The source field is "isVerified"; it is exposed as "author__verified".
        "author__verified": author.get("isVerified"),
        "text": response.get("text")
    }

In [5]:
def main_tweet_dataframe(url):
    """Given a tweet URL, return a one-row dataframe describing that tweet.

    Parameters
    ----------
    url : str
        URL of the tweet; it must contain ``x.com`` or ``twitter.com``.

    Returns
    -------
    pandas.DataFrame or dict
        A single-row dataframe with the ``TWEETS_COLUMNS_LIST`` columns, the
        ``INT_COLUMNS`` cast to ``int`` (missing values become 0).
        An ``{'error': ...}`` dictionary is returned instead when the input is
        not a tweet URL or when the actor run produced no item.
    """

    # Input validation (a non-string input is rejected the same way as a non-tweet URL)
    if not isinstance(url, str) or ('x.com' not in url and 'twitter.com' not in url):
        return {'error': 'Input is not a tweet URL'}

    run_input = {
        "startUrls": [url],
    }

    run = client.actor(APIFY_ACTOR_ID).call(run_input=run_input)

    # Only the first item is needed: next() avoids materialising the whole
    # dataset and avoids an IndexError when the run returned nothing.
    items = client.dataset(run["defaultDatasetId"]).iterate_items()
    response = next(iter(items), None)
    if response is None:
        return {'error': 'No data returned for this tweet URL'}

    flattened_data = flatten_response(response)

    # Convert the flattened dictionary to a DataFrame
    df = pd.DataFrame([flattened_data], columns=TWEETS_COLUMNS_LIST)

    # Convert columns to integers, handling None/NaN appropriately
    df[INT_COLUMNS] = df[INT_COLUMNS].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

    return df

In [6]:
def _tweet_id_from_url(url):
    """Return the tweet id contained in a tweet URL, as a string ('' if none is found)."""
    # Drop the query string / fragment (e.g. "?s=20") and the empty segments
    # produced by a trailing slash.
    path = url.split('?', 1)[0].split('#', 1)[0]
    segments = [segment for segment in path.split('/') if segment]

    # Prefer the segment right after the last "status", so suffixes such as
    # "/photo/1" are ignored.
    for index in range(len(segments) - 2, -1, -1):
        if segments[index] == 'status':
            return segments[index + 1]

    # No "status" segment: fall back to the last path segment.
    return segments[-1] if segments else ''


def comments_dataframe(url, max_items=300):
    """Given a tweet URL, return a dataframe with the comments related to that tweet.

    Parameters
    ----------
    url : str
        URL of the tweet; it must contain ``x.com`` or ``twitter.com``.
    max_items : int, optional
        Value sent to the actor as ``maxItems`` (default 300).

    Returns
    -------
    pandas.DataFrame or dict
        One row per item returned by the actor run, restricted to the
        ``TWEETS_COLUMNS_LIST`` columns that are not in
        ``REMOVE_COLUMNS_COMMENTS``, with ``INT_COLUMNS`` cast to ``int``
        (missing values become 0). An ``{'error': ...}`` dictionary is returned
        instead when the input is not a tweet URL.
    """

    # Input validation (a non-string input is rejected the same way as a non-tweet URL)
    if not isinstance(url, str) or ('x.com' not in url and 'twitter.com' not in url):
        return {'error': 'Input is not a tweet URL'}

    one_tweet_id = _tweet_id_from_url(url)

    run_input_comment = {
        "conversationIds": [one_tweet_id],
        "maxItems": max_items
    }

    run_comment = client.actor(APIFY_ACTOR_ID).call(run_input=run_input_comment)

    response_comment = list(client.dataset(run_comment["defaultDatasetId"]).iterate_items())

    flattened_responses = [flatten_response(response) for response in response_comment]

    # Keep only the selected columns
    include_columns = [column for column in TWEETS_COLUMNS_LIST if column not in REMOVE_COLUMNS_COMMENTS]

    # Convert the flattened dictionaries to a DataFrame
    df = pd.DataFrame(flattened_responses, columns=include_columns)

    # Convert columns to integers, handling None/NaN appropriately
    df[INT_COLUMNS] = df[INT_COLUMNS].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

    return df

### We are going to produce 5 datasets, from different political actors, to be used as demo examples. Sentiment analysis will be added later.

Sample Tweets

In [7]:
# Actor label -> URL of the tweet used as that actor's demo example.
# The label is also the prefix of the parquet files written for the actor.
sample_tweets = {
    'nayib_bukele':
        'https://x.com/nayibbukele/status/1788014378324754680',
    'walter_araujo':
        'https://twitter.com/waraujo64/status/1782761311526391916',
    'marcelo_larin':
        'https://twitter.com/MarceloLarin1/status/1774118316648599645',
    'gustavo_villatoro':
        'https://x.com/Vi11atoro/status/1767373813526737223',
    'suecy_callejas_estrada':
        'https://x.com/suecallejas/status/1786430963947335686',
}

Saving each tweet and its comments to Parquet files

In [8]:
# For every sample tweet, build the tweet dataframe and the comments dataframe
# and write each one to its own parquet file, named after the actor label.
for actor, url in sample_tweets.items():
    main_df = main_tweet_dataframe(url)
    comments_df = comments_dataframe(url)

    # Both builders signal a failure by returning an {'error': ...} dictionary
    # instead of a dataframe. Stop with an explicit message rather than failing
    # later with an opaque AttributeError on `.to_parquet`.
    for label, result in (("tweet", main_df), ("comments", comments_df)):
        if isinstance(result, dict):
            raise RuntimeError(
                f"Could not build the {label} dataframe for '{actor}' ({url}): {result.get('error')}"
            )

    main_file_name = f"{actor}_tweet.parquet"
    main_df.to_parquet(main_file_name, engine="pyarrow", use_deprecated_int96_timestamps=True)
    comments_file_name = f"{actor}_tweet_comments.parquet"
    comments_df.to_parquet(comments_file_name, engine="pyarrow", use_deprecated_int96_timestamps=True)