In [5]:
#!/usr/bin/env python3
import duckdb
import glob
import os

# DDL for the destination table. Everything is stored as text except the five
# engagement counters, which are DOUBLE.
_CREATE_TWEETS_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS tweets (
    id VARCHAR,
    text VARCHAR,
    url VARCHAR,
    epoch VARCHAR,
    media VARCHAR,
    retweetedTweet VARCHAR,
    retweetedTweetID VARCHAR,
    retweetedUserID VARCHAR,
    id_str VARCHAR,
    lang VARCHAR,
    rawContent VARCHAR,
    replyCount DOUBLE,
    retweetCount DOUBLE,
    likeCount DOUBLE,
    quoteCount DOUBLE,
    conversationId VARCHAR,
    conversationIdStr VARCHAR,
    hashtags VARCHAR,
    mentionedUsers VARCHAR,
    links VARCHAR,
    viewCount DOUBLE,
    quotedTweet VARCHAR,
    in_reply_to_screen_name VARCHAR,
    in_reply_to_status_id_str VARCHAR,
    in_reply_to_user_id_str VARCHAR,
    location VARCHAR,
    cash_app_handle VARCHAR,
    user VARCHAR,
    date VARCHAR,
    _type VARCHAR
);
"""

# Per-file load statement; the single ``?`` placeholder is the CSV path.
# The file is read with every column as VARCHAR, then:
#   * empty strings in text columns become NULL,
#   * counters are TRY_CAST to DOUBLE and fall back to 0 when missing/unparseable,
#   * viewCount is taken from the digits that follow ``count': '`` in the raw value.
# This is a raw string so the regex backslash reaches DuckDB untouched instead of
# being an invalid Python escape sequence.
# NOTE(review): ignore_errors=true makes the CSV reader skip rows it cannot
# parse instead of failing -- confirm that silent row loss is acceptable.
_INSERT_TWEETS_SQL = r"""
INSERT INTO tweets
SELECT
    NULLIF(id, '') AS id,
    NULLIF(text, '') AS text,
    NULLIF(url, '') AS url,
    NULLIF(epoch, '') AS epoch,
    NULLIF(media, '') AS media,
    NULLIF(retweetedTweet, '') AS retweetedTweet,
    NULLIF(retweetedTweetID, '') AS retweetedTweetID,
    NULLIF(retweetedUserID, '') AS retweetedUserID,
    NULLIF(id_str, '') AS id_str,
    NULLIF(lang, '') AS lang,
    NULLIF(rawContent, '') AS rawContent,
    COALESCE(TRY_CAST(NULLIF(replyCount, '') AS DOUBLE), 0) AS replyCount,
    COALESCE(TRY_CAST(NULLIF(retweetCount, '') AS DOUBLE), 0) AS retweetCount,
    COALESCE(TRY_CAST(NULLIF(likeCount, '') AS DOUBLE), 0) AS likeCount,
    COALESCE(TRY_CAST(NULLIF(quoteCount, '') AS DOUBLE), 0) AS quoteCount,
    NULLIF(conversationId, '') AS conversationId,
    NULLIF(conversationIdStr, '') AS conversationIdStr,
    NULLIF(hashtags, '') AS hashtags,
    NULLIF(mentionedUsers, '') AS mentionedUsers,
    NULLIF(links, '') AS links,
    COALESCE(TRY_CAST(NULLIF(regexp_extract(viewCount, 'count'': ''(\d+)', 1), '') AS DOUBLE), 0) AS viewCount,
    NULLIF(quotedTweet, '') AS quotedTweet,
    NULLIF(in_reply_to_screen_name, '') AS in_reply_to_screen_name,
    NULLIF(in_reply_to_status_id_str, '') AS in_reply_to_status_id_str,
    NULLIF(in_reply_to_user_id_str, '') AS in_reply_to_user_id_str,
    NULLIF(location, '') AS location,
    NULLIF(cash_app_handle, '') AS cash_app_handle,
    NULLIF(user, '') AS user,
    NULLIF(date, '') AS date,
    NULLIF(_type, '') AS _type
FROM read_csv_auto(
    ?,
    header=TRUE,
    sample_size=-1,
    all_varchar=TRUE,
    ignore_errors=true
)
"""


def main(base_folder: str = "data/", db_path: str = "database/tweets.duckdb") -> None:
    """Load every ``part_*/*.csv.gz`` file under ``base_folder`` into DuckDB.

    The ``tweets`` table is created if it does not exist, then each matching
    file (in sorted path order) is appended with one INSERT ... SELECT.

    Rows are always appended: running this again against the same database
    inserts the same files a second time.

    Args:
        base_folder: Directory that contains the ``part_*`` sub-folders.
        db_path: DuckDB database file to open or create. Its parent directory
            is created when missing.

    Raises:
        Any exception raised by DuckDB while connecting or executing is
        propagated; the connection is closed first.
    """
    # duckdb.connect() creates the database file but not its parent folder.
    db_dir = os.path.dirname(db_path)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)

    pattern = os.path.join(base_folder, "part_*", "*.csv.gz")
    file_list = sorted(glob.glob(pattern))

    con = duckdb.connect(db_path)
    try:
        con.execute(_CREATE_TWEETS_TABLE_SQL)
        for csv_gz_file in file_list:
            print(f"Ingesting {csv_gz_file} ...")
            con.execute(_INSERT_TWEETS_SQL, [csv_gz_file])
    finally:
        # Release the database file even when a load fails part-way through.
        con.close()

    if file_list:
        print("All CSV.gz files have been ingested successfully.")
    else:
        # Do not claim success when the glob matched nothing (e.g. wrong folder).
        print(f"No files matched {pattern!r}; nothing was ingested.")

# Entry point: run the ingestion only when this code executes as the top-level
# module (``__name__`` is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()


Ingesting data/part_11/may_july_chunk_201.csv.gz ...
Ingesting data/part_11/may_july_chunk_202.csv.gz ...
Ingesting data/part_11/may_july_chunk_203.csv.gz ...
Ingesting data/part_11/may_july_chunk_204.csv.gz ...
Ingesting data/part_11/may_july_chunk_205.csv.gz ...
Ingesting data/part_11/may_july_chunk_206.csv.gz ...
Ingesting data/part_11/may_july_chunk_207.csv.gz ...
Ingesting data/part_11/may_july_chunk_208.csv.gz ...
Ingesting data/part_11/may_july_chunk_209.csv.gz ...
Ingesting data/part_11/may_july_chunk_210.csv.gz ...
Ingesting data/part_11/may_july_chunk_211.csv.gz ...
Ingesting data/part_11/may_july_chunk_212.csv.gz ...
Ingesting data/part_11/may_july_chunk_213.csv.gz ...
Ingesting data/part_11/may_july_chunk_214.csv.gz ...
Ingesting data/part_11/may_july_chunk_215.csv.gz ...
Ingesting data/part_11/may_july_chunk_216.csv.gz ...
Ingesting data/part_11/may_july_chunk_217.csv.gz ...
Ingesting data/part_11/may_july_chunk_218.csv.gz ...
Ingesting data/part_11/may_july_chunk_219.csv.

In [8]:
# Columns covered by the missing-value report, in print order. The flag marks
# numeric counters, for which a value of 0 is counted as missing as well as NULL.
_NULL_REPORT_COLUMNS = (
    ("id", False),
    ("text", False),
    ("url", False),
    ("epoch", False),
    ("media", False),
    ("retweetedTweet", False),
    ("lang", False),
    ("rawContent", False),
    ("replyCount", True),
    ("retweetCount", True),
    ("likeCount", True),
    ("quoteCount", True),
    ("viewCount", True),
    ("location", False),
    ("user", False),
    ("date", False),
)


def _format_stat(value, spec):
    """Format ``value`` with ``spec``, or return "n/a" when it is None."""
    return "n/a" if value is None else format(value, spec)


def analyze_database(db_path: str = "database/tweets.duckdb") -> None:
    """Print a quick health report for the ``tweets`` table.

    The report contains the total row count, per-column missing-value counts
    (NULL, plus 0 for the numeric counters), average/maximum engagement
    figures, and five randomly chosen tweets that have text.

    Args:
        db_path: DuckDB database file that holds the ``tweets`` table.

    Raises:
        Any exception raised by DuckDB is propagated; the connection is
        closed first.
    """
    con = duckdb.connect(db_path)
    try:
        total_rows = con.execute("SELECT COUNT(*) FROM tweets").fetchone()[0]
        print(f"\nTotal rows in database: {total_rows:,}")

        if total_rows == 0:
            # Percentages below would divide by zero; there is nothing to report.
            print("The tweets table is empty; nothing to analyze.")
            return

        # Build the SELECT list from the same tuple that labels the output so
        # the two can never drift out of step. Names are fixed constants above,
        # never user input.
        null_exprs = ",\n    ".join(
            f"COUNT(*) - COUNT(NULLIF({col}, 0)) AS {col}_nulls"
            if zero_is_missing
            else f"COUNT(*) - COUNT({col}) AS {col}_nulls"
            for col, zero_is_missing in _NULL_REPORT_COLUMNS
        )
        null_counts = con.execute(
            f"SELECT\n    {null_exprs}\nFROM tweets"
        ).fetchone()

        print("\nNull/Empty value counts:")
        for (col, _), count in zip(_NULL_REPORT_COLUMNS, null_counts):
            percentage = (count / total_rows) * 100
            print(f"{col:<15} {count:>10,} ({percentage:>5.1f}%)")

        stats_query = """
        SELECT
            AVG(replyCount) AS avg_replies,
            MAX(replyCount) AS max_replies,
            AVG(retweetCount) AS avg_retweets,
            MAX(retweetCount) AS max_retweets,
            AVG(likeCount) AS avg_likes,
            MAX(likeCount) AS max_likes,
            AVG(viewCount) AS avg_views,
            MAX(viewCount) AS max_views
        FROM tweets
        """
        stats = con.execute(stats_query).fetchone()

        print("\nEngagement Statistics:")
        labels = ("Replies:  ", "Retweets: ", "Likes:    ", "Views:    ")
        for idx, label in enumerate(labels):
            avg_value, max_value = stats[2 * idx], stats[2 * idx + 1]
            # ",.0f" keeps large DOUBLE maxima out of exponent notation;
            # aggregates are None when a column holds only NULLs.
            print(
                f"{label}Avg: {_format_stat(avg_value, ',.1f')}, "
                f"Max: {_format_stat(max_value, ',.0f')}"
            )

        print("\nRandom Sample of Tweets:")
        sample_query = """
        SELECT
            text,
            replyCount,
            retweetCount,
            likeCount,
            viewCount,
            date
        FROM tweets
        WHERE text IS NOT NULL
        ORDER BY random()
        LIMIT 5
        """
        samples = con.execute(sample_query).fetchall()

        for i, (text, replies, retweets, likes, views, date) in enumerate(samples, 1):
            # Only show an ellipsis when the text was actually cut.
            preview = text if len(text) <= 200 else text[:200] + "..."
            print(f"\n--- Tweet {i} ---")
            print(f"Date: {date}")
            print(f"Text: {preview}")
            print(f"Engagement: {replies} replies, {retweets} retweets, {likes} likes, {views} views")
    finally:
        # Release the database file on every exit path, including errors.
        con.close()

# Run the analysis only when this code executes as the top-level module
# (``__name__`` is "__main__"), not when it is imported.
if __name__ == "__main__":
    analyze_database()


Total rows in database: 20,000,016

Null/Empty value counts:
id                       1 (  0.0%)
text                    33 (  0.0%)
url                     70 (  0.0%)
epoch                   73 (  0.0%)
media                  100 (  0.0%)
retweetedTweet          48 (  0.0%)
lang                    95 (  0.0%)
rawContent             126 (  0.0%)
replyCount      15,983,111 ( 79.9%)
retweetCount    17,530,827 ( 87.7%)
likeCount       12,929,449 ( 64.6%)
quoteCount      19,300,042 ( 96.5%)
viewCount        2,300,627 ( 11.5%)
location        18,234,419 ( 91.2%)
user                   134 (  0.0%)
date            20,000,012 (100.0%)

Engagement Statistics:
Replies:  Avg: 180,081,764,899.9, Max: 1.8050027949417231e+18
Retweets: Avg: 452,974,972,781.4, Max: 1.8165403358264364e+18
Likes:    Avg: 137,135,678,736.6, Max: 1.814654783417926e+18
Views:    Avg: 1,556.6, Max: 439,665,677.0

Random Sample of Tweets:

--- Tweet 1 ---
Date: None
Text: El expresidente Donald Trump ha obtenido más de lo