In [17]:
import duckdb
import glob
import os

def main():
    # Path to the top-level folder that contains part_1, part_2, etc.
    base_folder = "/Volumes/T7/tweets_data/usc-x-24-us-election"  # Adjust as needed
    
    # Connect to (or create) the DuckDB database file
    con = duckdb.connect("/Volumes/T7/tweets.duckdb")
    
    # Create the table (if it doesn't exist).
    create_table_query = """
    CREATE TABLE IF NOT EXISTS tweets (
        id VARCHAR,
        text VARCHAR,
        url VARCHAR,
        epoch VARCHAR,
        media VARCHAR,
        retweetedTweet VARCHAR,
        retweetedTweetID VARCHAR,
        retweetedUserID VARCHAR,
        id_str VARCHAR,
        lang VARCHAR,
        rawContent VARCHAR,
        replyCount DOUBLE,
        retweetCount DOUBLE,
        likeCount DOUBLE,
        quoteCount DOUBLE,
        conversationId VARCHAR,
        conversationIdStr VARCHAR,
        hashtags VARCHAR,
        mentionedUsers VARCHAR,
        links VARCHAR,
        viewCount DOUBLE,
        quotedTweet VARCHAR,
        in_reply_to_screen_name VARCHAR,
        in_reply_to_status_id_str VARCHAR,
        in_reply_to_user_id_str VARCHAR,
        location VARCHAR,
        cash_app_handle VARCHAR,
        user VARCHAR,
        date VARCHAR,
        _type VARCHAR
    );
    """
    con.execute(create_table_query)

    # Grab all *.csv.gz files in any "part_*" subfolder
    pattern = os.path.join(base_folder, "part_*", "*.csv.gz")
    file_list = sorted(glob.glob(pattern))

    # Single query that:
    #  (1) Reads all CSVs at once with union_by_name=TRUE
    #  (2) SELECTs exactly the columns that match your table
    #  (3) Coalesces type → _type, username → user
    #  (4) Ignores any "0" column by never selecting it
    #  (5) date will be NULL if missing
    #  (6) Cast numeric columns (replyCount, retweetCount, etc.) to DOUBLE
    insert_query = f"""
    INSERT INTO tweets
    SELECT
        id,
        text,
        url,
        epoch,
        media,
        retweetedTweet,
        retweetedTweetID,
        retweetedUserID,
        id_str,
        lang,
        rawContent,
        CAST(replyCount AS DOUBLE) AS replyCount,
        CAST(retweetCount AS DOUBLE) AS retweetCount,
        CAST(likeCount AS DOUBLE) AS likeCount,
        CAST(quoteCount AS DOUBLE) AS quoteCount,
        conversationId,
        conversationIdStr,
        hashtags,
        mentionedUsers,
        links,
        CAST(viewCount AS DOUBLE) AS viewCount,
        quotedTweet,
        in_reply_to_screen_name,
        in_reply_to_status_id_str,
        in_reply_to_user_id_str,
        location,
        cash_app_handle,
        COALESCE(user, username)      AS user,
        /* 'date' automatically NULL if missing */
        date,
        /* unify 'type' -> '_type' */
        COALESCE(_type, "type")       AS _type
    FROM read_csv_auto(
        {file_list},            -- pass in the list of files
        header=TRUE,
        sample_size=-1,
        ignore_errors=TRUE,
        union_by_name=TRUE
    )
    """
    
    print("Ingesting all CSV.gz files in a single pass...")
    con.execute(insert_query)

    print("All CSV.gz files have been ingested successfully.")
    con.close()

if __name__ == "__main__":
    main()


FatalException: FATAL Error: Failed: database has been invalidated because of a previous fatal error. The database must be restarted prior to being used again.
Original error: "Trying to seek a compressed CSV File."

In [14]:
def main():
    base_folder = "/Volumes/T7/tweets_data/usc-x-24-us-election"
    con = duckdb.connect("/Volumes/T7/tweets.duckdb")
    
    # Get all CSV files recursively
    all_files = glob.glob(os.path.join(base_folder, "**/*.csv.gz"), recursive=True)
    
    # Modified query to read all columns as VARCHAR
    inspect_query = """
    SELECT * FROM read_csv_auto(?, 
        header=TRUE, 
        sample_size=1,
        all_varchar=TRUE)
    """
    
    reference_columns = set(col[0] for col in con.execute(inspect_query, [all_files[0]]).description)
    
    print(f"Reference columns from {os.path.basename(all_files[0])}:")
    print(reference_columns)
    print("\nValidating all files...")
    
    mismatched_files = []
    for file_path in all_files[1:]:
        try:
            current_columns = set(col[0] for col in con.execute(inspect_query, [file_path]).description)
            
            if current_columns != reference_columns:
                print(f"\nMismatch found in {os.path.basename(file_path)}:")
                print("Missing columns:", reference_columns - current_columns)
                print("Extra columns:", current_columns - reference_columns)
                mismatched_files.append(file_path)
        except Exception as e:
            print(f"\nError processing file {os.path.basename(file_path)}:")
            print(str(e))
    
    print(f"\nValidation complete! Found {len(mismatched_files)} files with mismatched columns")
    con.close()

if __name__ == "__main__":
    main()

Reference columns from may_july_chunk_1.csv.gz:
{'media', 'date', 'viewCount', 'user', 'hashtags', 'in_reply_to_user_id_str', 'url', 'in_reply_to_status_id_str', 'replyCount', 'in_reply_to_screen_name', 'location', 'text', 'retweetedTweetID', 'id_str', 'id', 'links', 'conversationIdStr', 'quotedTweet', 'rawContent', 'quoteCount', 'mentionedUsers', 'lang', 'retweetCount', 'likeCount', 'epoch', 'retweetedTweet', 'conversationId', '_type', 'cash_app_handle', 'retweetedUserID'}

Validating all files...

Mismatch found in aug_chunk_1.csv.gz:
Missing columns: {'_type', 'date'}
Extra columns: {'type', '0', 'username'}

Mismatch found in aug_chunk_10.csv.gz:
Missing columns: {'_type', 'date'}
Extra columns: {'type', '0', 'username'}

Mismatch found in aug_chunk_11.csv.gz:
Missing columns: {'_type', 'date'}
Extra columns: {'type', '0', 'username'}

Mismatch found in aug_chunk_12.csv.gz:
Missing columns: {'_type', 'date'}
Extra columns: {'type', '0', 'username'}

Mismatch found in aug_chunk_13.c

In [3]:
def analyze_database():
    # Connect to the existing database
    con = duckdb.connect("nl2sql_agent/database/tweets.duckdb")
    
    # Get total row count
    total_rows = con.execute("SELECT COUNT(*) FROM tweets").fetchone()[0]
    print(f"\nTotal rows in database: {total_rows:,}")
    
    # Count NULL values for each column
    null_counts_query = """
    SELECT 
        COUNT(*) - COUNT(id) as id_nulls,
        COUNT(*) - COUNT(text) as text_nulls,
        COUNT(*) - COUNT(url) as url_nulls,
        COUNT(*) - COUNT(epoch) as epoch_nulls,
        COUNT(*) - COUNT(media) as media_nulls,
        COUNT(*) - COUNT(retweetedTweet) as retweetedTweet_nulls,
        COUNT(*) - COUNT(lang) as lang_nulls,
        COUNT(*) - COUNT(rawContent) as rawContent_nulls,
        COUNT(*) - COUNT(NULLIF(replyCount, 0)) as replyCount_nulls,
        COUNT(*) - COUNT(NULLIF(retweetCount, 0)) as retweetCount_nulls,
        COUNT(*) - COUNT(NULLIF(likeCount, 0)) as likeCount_nulls,
        COUNT(*) - COUNT(NULLIF(quoteCount, 0)) as quoteCount_nulls,
        COUNT(*) - COUNT(NULLIF(viewCount, 0)) as viewCount_nulls,
        COUNT(*) - COUNT(location) as location_nulls,
        COUNT(*) - COUNT(user) as user_nulls,
        COUNT(*) - COUNT(date) as date_nulls
    FROM tweets
    """
    null_counts = con.execute(null_counts_query).fetchone()
    
    print("\nNull/Empty value counts:")
    column_names = ['id', 'text', 'url', 'epoch', 'media', 'retweetedTweet', 'lang', 
                   'rawContent', 'replyCount', 'retweetCount', 'likeCount', 'quoteCount',
                   'viewCount', 'location', 'user', 'date']
    
    for col, count in zip(column_names, null_counts):
        percentage = (count / total_rows) * 100
        print(f"{col:<15} {count:>10,} ({percentage:>5.1f}%)")
    
    # Basic statistics for numeric columns
    stats_query = """
    SELECT 
        'Engagement Metrics' as metric,
        AVG(replyCount) as avg_replies,
        MAX(replyCount) as max_replies,
        AVG(retweetCount) as avg_retweets,
        MAX(retweetCount) as max_retweets,
        AVG(likeCount) as avg_likes,
        MAX(likeCount) as max_likes,
        AVG(viewCount) as avg_views,
        MAX(viewCount) as max_views
    FROM tweets
    """
    stats = con.execute(stats_query).fetchone()
    
    print("\nEngagement Statistics:")
    print(f"Replies:  Avg: {stats[1]:,.1f}, Max: {stats[2]:,}")
    print(f"Retweets: Avg: {stats[3]:,.1f}, Max: {stats[4]:,}")
    print(f"Likes:    Avg: {stats[5]:,.1f}, Max: {stats[6]:,}")
    print(f"Views:    Avg: {stats[7]:,.1f}, Max: {stats[8]:,}")
    
    # Display 5 random rows
    print("\nRandom Sample of Tweets:")
    sample_query = """
    SELECT 
        text,
        replyCount,
        retweetCount,
        likeCount,
        viewCount,
        date
    FROM tweets 
    WHERE text IS NOT NULL
    ORDER BY random() 
    LIMIT 5
    """
    samples = con.execute(sample_query).fetchall()
    
    for i, row in enumerate(samples, 1):
        print(f"\n--- Tweet {i} ---")
        print(f"Date: {row[5]}")
        print(f"Text: {row[0][:200]}...")
        print(f"Engagement: {row[1]} replies, {row[2]} retweets, {row[3]} likes, {row[4]} views")
    
    con.close()

# Run the analysis
if __name__ == "__main__":
    analyze_database()


Total rows in database: 1,000,000

Null/Empty value counts:
id                       0 (  0.0%)
text                    12 (  0.0%)
url                     18 (  0.0%)
epoch                   18 (  0.0%)
media                   18 (  0.0%)
retweetedTweet           0 (  0.0%)
lang                    18 (  0.0%)
rawContent              24 (  0.0%)
replyCount         801,042 ( 80.1%)
retweetCount       883,074 ( 88.3%)
likeCount          646,745 ( 64.7%)
quoteCount         966,068 ( 96.6%)
viewCount           15,119 (  1.5%)
location         1,000,000 (100.0%)
user                    24 (  0.0%)
date             1,000,000 (100.0%)

Engagement Statistics:
Replies:  Avg: 2.7, Max: 22,556.0
Retweets: Avg: 4.5, Max: 20,435.0
Likes:    Avg: 19.3, Max: 140,381.0
Views:    Avg: 1,371.2, Max: 36,650,230.0

Random Sample of Tweets:

--- Tweet 1 ---
Date: None
Text: @ErrolWebber Shocking news from Israel! On October 7th, 2023, over 3000 Hamas terrorists stormed across its southern border, causing 

### Run the following script to download tweets from repo and extract them into a sql database

In [2]:
import duckdb
import glob
import os
import subprocess

def main():
    # --------------------------------------------------------------------
    # Step 1: Clone the GitHub repo if it doesn't already exist
    # --------------------------------------------------------------------
    repo_url = "https://github.com/sinking8/usc-x-24-us-election.git"
    repo_folder = "/Volumes/T7/tweets_data/usc-x-24-us-election"

    if not os.path.isdir(repo_folder):
        print(f"Cloning repo from {repo_url} into {repo_folder}...")
        subprocess.run(["git", "clone", repo_url], check=True)
    else:
        print(f"Repository folder '{repo_folder}' already exists. Skipping clone.")

    # --------------------------------------------------------------------
    # Step 2: Set base_folder to the cloned repo
    # --------------------------------------------------------------------
    base_folder = repo_folder  # i.e., "usc-x-24-us-election"

    # --------------------------------------------------------------------
    # Step 3: Connect to (or create) the DuckDB database file
    # --------------------------------------------------------------------
    con = duckdb.connect("/Volumes/T7/tweets.duckdb")

    # --------------------------------------------------------------------
    # Step 4: Create the table if it does not exist
    # --------------------------------------------------------------------
    create_table_query = """
    CREATE TABLE IF NOT EXISTS tweets (
        id VARCHAR,
        text VARCHAR,
        url VARCHAR,
        epoch VARCHAR,
        media VARCHAR,
        retweetedTweet VARCHAR,
        retweetedTweetID VARCHAR,
        retweetedUserID VARCHAR,
        id_str VARCHAR,
        lang VARCHAR,
        rawContent VARCHAR,
        replyCount DOUBLE,
        retweetCount DOUBLE,
        likeCount DOUBLE,
        quoteCount DOUBLE,
        conversationId VARCHAR,
        conversationIdStr VARCHAR,
        hashtags VARCHAR,
        mentionedUsers VARCHAR,
        links VARCHAR,
        viewCount DOUBLE,
        quotedTweet VARCHAR,
        in_reply_to_screen_name VARCHAR,
        in_reply_to_status_id_str VARCHAR,
        in_reply_to_user_id_str VARCHAR,
        location VARCHAR,
        cash_app_handle VARCHAR,
        user VARCHAR,
        date VARCHAR,
        _type VARCHAR
    );
    """
    con.execute(create_table_query)

    # --------------------------------------------------------------------
    # Step 5: Loop through part_* folders in the cloned repo and ingest CSV.GZ files
    # --------------------------------------------------------------------
    pattern = os.path.join(base_folder, "part_*", "*.csv.gz")
    file_list = sorted(glob.glob(pattern))
    
    for csv_gz_file in file_list:
        print(f"Ingesting {csv_gz_file} ...")
        insert_query = """
        INSERT INTO tweets
        SELECT
            NULLIF(id, '') AS id,
            NULLIF(text, '') AS text,
            NULLIF(url, '') AS url,
            NULLIF(epoch, '') AS epoch,
            NULLIF(media, '') AS media,
            NULLIF(retweetedTweet, '') AS retweetedTweet,
            NULLIF(retweetedTweetID, '') AS retweetedTweetID,
            NULLIF(retweetedUserID, '') AS retweetedUserID,
            NULLIF(id_str, '') AS id_str,
            NULLIF(lang, '') AS lang,
            NULLIF(rawContent, '') AS rawContent,
            COALESCE(TRY_CAST(NULLIF(replyCount, '') AS DOUBLE), 0) AS replyCount,
            COALESCE(TRY_CAST(NULLIF(retweetCount, '') AS DOUBLE), 0) AS retweetCount,
            COALESCE(TRY_CAST(NULLIF(likeCount, '') AS DOUBLE), 0) AS likeCount,
            COALESCE(TRY_CAST(NULLIF(quoteCount, '') AS DOUBLE), 0) AS quoteCount,
            NULLIF(conversationId, '') AS conversationId,
            NULLIF(conversationIdStr, '') AS conversationIdStr,
            NULLIF(hashtags, '') AS hashtags,
            NULLIF(mentionedUsers, '') AS mentionedUsers,
            NULLIF(links, '') AS links,
            COALESCE(TRY_CAST(NULLIF(regexp_extract(viewCount, 'count'': ''(\d+)', 1), '') AS DOUBLE), 0) AS viewCount,
            NULLIF(quotedTweet, '') AS quotedTweet,
            NULLIF(in_reply_to_screen_name, '') AS in_reply_to_screen_name,
            NULLIF(in_reply_to_status_id_str, '') AS in_reply_to_status_id_str,
            NULLIF(in_reply_to_user_id_str, '') AS in_reply_to_user_id_str,
            NULLIF(location, '') AS location,
            NULLIF(cash_app_handle, '') AS cash_app_handle,
            NULLIF(user, '') AS user,
            NULLIF(date, '') AS date,
            NULLIF(_type, '') AS _type
        FROM read_csv_auto(
            ?,
            header=TRUE,
            sample_size=-1,
            all_varchar=TRUE,
            ignore_errors=TRUE
        )
        """
        
        con.execute(insert_query, [csv_gz_file])
    
    # --------------------------------------------------------------------
    # Step 6: Close the connection
    # --------------------------------------------------------------------
    print("All CSV.gz files have been ingested successfully.")
    con.close()

if __name__ == "__main__":
    main()


Repository folder '/Volumes/T7/tweets_data/usc-x-24-us-election' already exists. Skipping clone.
Ingesting /Volumes/T7/tweets_data/usc-x-24-us-election/part_1/may_july_chunk_1.csv.gz ...
Ingesting /Volumes/T7/tweets_data/usc-x-24-us-election/part_1/may_july_chunk_10.csv.gz ...
Ingesting /Volumes/T7/tweets_data/usc-x-24-us-election/part_1/may_july_chunk_11.csv.gz ...
Ingesting /Volumes/T7/tweets_data/usc-x-24-us-election/part_1/may_july_chunk_12.csv.gz ...
Ingesting /Volumes/T7/tweets_data/usc-x-24-us-election/part_1/may_july_chunk_13.csv.gz ...
Ingesting /Volumes/T7/tweets_data/usc-x-24-us-election/part_1/may_july_chunk_14.csv.gz ...
Ingesting /Volumes/T7/tweets_data/usc-x-24-us-election/part_1/may_july_chunk_15.csv.gz ...
Ingesting /Volumes/T7/tweets_data/usc-x-24-us-election/part_1/may_july_chunk_16.csv.gz ...
Ingesting /Volumes/T7/tweets_data/usc-x-24-us-election/part_1/may_july_chunk_17.csv.gz ...
Ingesting /Volumes/T7/tweets_data/usc-x-24-us-election/part_1/may_july_chunk_18.csv.g

BinderException: Binder Error: Column "date" referenced that exists in the SELECT clause - but this column cannot be referenced before it is defined

In [2]:
import pandas as pd

def preview_csv():
    file_path = "/Volumes/T7/tweets_data/usc-x-24-us-election/part_23/aug_chunk_1.csv.gz"
    
    try:
        # Read the gzipped CSV file
        df = pd.read_csv(file_path, compression='gzip', nrows=5)
        
        # Display the first 5 rows
        print("\nFirst 5 rows of the dataset:")
        print(df)
        
        # Display column names
        print("\nColumn names:")
        print(df.columns.tolist())
        
    except Exception as e:
        print(f"Error reading file: {e}")

if __name__ == "__main__":
    preview_csv()


First 5 rows of the dataset:
     type            id         username  \
0  tweet-  1.842096e+18  Debbie133467421   
1  tweet-  1.842096e+18       EMGENT_007   
2  tweet-  1.842096e+18     wellyworldfl   
3  tweet-  1.842096e+18         emi79907   
4  tweet-  1.842096e+18  hope_neverLost1   

                                                text  \
0  Kamala Harris is very connected to Diddy. She ...   
1                             @iamnot_elon 100% Sir!   
2  @catturd2 And all the insurance companies will...   
3                    @CatholicQuote12 BEAUTIFUL 🙏🙏🙏🙏   
4  9 Jahre Haft. #TinaPeters hatte ein privilegie...   

                                                 url         epoch media  \
0  https://twitter.com/Debbie133467421/status/184...  1.728025e+09    []   
1  https://twitter.com/EMGENT_007/status/18420955...  1.728025e+09    []   
2  https://twitter.com/wellyworldfl/status/184209...  1.728025e+09    []   
3  https://twitter.com/emi79907/status/1842095589...  1.728025e+