## Sample from raw data for preliminary analysis

In [5]:
def sample_partition():
    import pandas as pd
    import glob
    import os

    # Set pandas display options for better formatting
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.float_format', lambda x: '%.3f' % x)
    
    
    # Get the first CSV.GZ file from part_1
    pattern = os.path.join("sample_data/part_1", "*.csv.gz")
    files = glob.glob(pattern)
    
    if not files:
        raise FileNotFoundError("No CSV.GZ files found in sample_data/part_1")
    
    # Take the first file found
    file_path = files[0]
    print(f"Sampling from: {file_path}")
    
    # Read 1000 random rows
    # Use chunksize for memory efficiency when dealing with large files
    df = pd.read_csv(file_path, compression='gzip', nrows=100000)
    
    return df

sample_df = sample_partition()
print(f"\nSample shape: {sample_df.shape}")
print("\nSample columns:")
print("- " + "\n- ".join(sample_df.columns.tolist()))
print("\nFirst few rows:")
print(sample_df.head().to_string())

Sampling from: sample_data/part_1/may_july_chunk_11.csv.gz

Sample shape: (50000, 30)

Sample columns:
- id
- text
- url
- epoch
- media
- retweetedTweet
- retweetedTweetID
- retweetedUserID
- id_str
- lang
- rawContent
- replyCount
- retweetCount
- likeCount
- quoteCount
- conversationId
- conversationIdStr
- hashtags
- mentionedUsers
- links
- viewCount
- quotedTweet
- in_reply_to_screen_name
- in_reply_to_status_id_str
- in_reply_to_user_id_str
- location
- cash_app_handle
- user
- date
- _type

First few rows:
                    id                                                                                                                                                                                                                                                                                       text                                                             url       epoch media  retweetedTweet  retweetedTweetID  retweetedUserID               id_str lang                

## Raw Chunk Analysis to ensure consistency of column names

In [13]:
def analyze_partition_consistency():
    """
    Analyze column consistency across all CSV.GZ files in part_x folders.
    Reports column name mismatches and ordering differences.
    """
    import pandas as pd
    import glob
    import os
    from tabulate import tabulate
    from collections import defaultdict

    base_path = "/Volumes/T7/tweets_data/usc-x-24-us-election"
    
    # Store column information for each file
    file_columns = {}
    all_columns = set()
    
    # Find all part folders
    part_folders = glob.glob(os.path.join(base_path, "part_*"))
    
    print("🔍 Scanning partitions...")
    
    # Collect column information from each file
    for folder in sorted(part_folders):
        files = glob.glob(os.path.join(folder, "*.csv.gz"))
        for file_path in files:
            try:
                # Read just the header
                df = pd.read_csv(file_path, compression='gzip', nrows=0)
                file_columns[file_path] = list(df.columns)
                all_columns.update(df.columns)
                print(f"✓ Processed: {os.path.basename(file_path)}")
            except Exception as e:
                print(f"❌ Error processing {file_path}: {str(e)}")
    
    # Analysis results
    print("\n📊 Column Consistency Analysis")
    print("=" * 80)
    
    # 1. Column Presence Matrix
    presence_data = []
    for file_path, columns in file_columns.items():
        row = {
            'File': os.path.basename(file_path),
            **{col: '✓' if col in columns else '❌' for col in sorted(all_columns)}
        }
        presence_data.append(row)
    
    print("\n🔍 Column Presence Matrix:")
    print(tabulate(presence_data, headers='keys', tablefmt='pretty', showindex=False))
    
    # 2. Column Order Analysis
    print("\n📋 Column Order Variations:")
    unique_orders = defaultdict(list)
    for file_path, columns in file_columns.items():
        order_key = tuple(columns)
        unique_orders[order_key].append(os.path.basename(file_path))
    
    for idx, (order, files) in enumerate(unique_orders.items(), 1):
        print(f"\nVariation {idx}:")
        print("Files:", ", ".join(files))
        print("Column Order:")
        for i, col in enumerate(order, 1):
            print(f"{i:2d}. {col}")
    
    # 3. Summary Statistics
    print("\n📈 Summary Statistics:")
    print(f"Total files analyzed: {len(file_columns)}")
    print(f"Total unique columns found: {len(all_columns)}")
    print(f"Number of different column orders: {len(unique_orders)}")
    
    # 4. Missing Columns Report
    print("\n❌ Missing Columns Report:")
    reference_columns = set(next(iter(file_columns.values())))
    for file_path, columns in file_columns.items():
        missing = reference_columns - set(columns)
        extra = set(columns) - reference_columns
        if missing or extra:
            print(f"\nFile: {os.path.basename(file_path)}")
            if missing:
                print("Missing:", ", ".join(missing))
            if extra:
                print("Extra:", ", ".join(extra))

# Run the analysis
analyze_partition_consistency()

🔍 Scanning partitions...
✓ Processed: may_july_chunk_1.csv.gz
✓ Processed: may_july_chunk_10.csv.gz
✓ Processed: may_july_chunk_11.csv.gz
✓ Processed: may_july_chunk_12.csv.gz
✓ Processed: may_july_chunk_13.csv.gz
✓ Processed: may_july_chunk_14.csv.gz
✓ Processed: may_july_chunk_15.csv.gz
✓ Processed: may_july_chunk_16.csv.gz
✓ Processed: may_july_chunk_17.csv.gz
✓ Processed: may_july_chunk_18.csv.gz
✓ Processed: may_july_chunk_19.csv.gz
✓ Processed: may_july_chunk_2.csv.gz
✓ Processed: may_july_chunk_20.csv.gz
✓ Processed: may_july_chunk_3.csv.gz
✓ Processed: may_july_chunk_4.csv.gz
✓ Processed: may_july_chunk_5.csv.gz
✓ Processed: may_july_chunk_6.csv.gz
✓ Processed: may_july_chunk_7.csv.gz
✓ Processed: may_july_chunk_8.csv.gz
✓ Processed: may_july_chunk_9.csv.gz
✓ Processed: may_july_chunk_181.csv.gz
✓ Processed: may_july_chunk_182.csv.gz
✓ Processed: may_july_chunk_183.csv.gz
✓ Processed: may_july_chunk_184.csv.gz
✓ Processed: may_july_chunk_185.csv.gz
✓ Processed: may_july_chunk_1

### Extract sample to a csv format for analysis 

In [7]:
import os 

def sample_and_save_partition(output_path="sample_data/sampled_tweets_b.csv"):
    """
    Sample 1000 rows from a partition and save to CSV.
    
    Args:
        output_path (str): Path where the sampled CSV will be saved
    """
    # Get the sample
    df = sample_partition()
    
    # Create output directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    
    # Save to CSV
    df.to_csv(output_path, index=False)
    print(f"\nSaved {len(df)} rows to: {output_path}")
    
    return df

# Example usage:
if __name__ == "__main__":
    df = sample_and_save_partition()

Sampling from: sample_data/part_1/may_july_chunk_11.csv.gz

Saved 50000 rows to: sample_data/sampled_tweets_b.csv


### Analyze Sampled CSV file

In [6]:
def analyze_columns(df):
    """
    Analyze each column in the dataframe and provide detailed information about data types,
    sample values, and basic statistics.
    
    Args:
        df (pd.DataFrame): Input dataframe to analyze
        
    Returns:
        dict: Dictionary containing analysis results for each column
    """
    analysis = {}
    
    for column in df.columns:
        # Initialize column analysis
        col_data = {
            'dtype': str(df[column].dtype),
            'null_count': df[column].isna().sum(),
            'null_percentage': (df[column].isna().sum() / len(df)) * 100,
            'unique_count': df[column].nunique(),
            'sample_values': df[column].dropna().head(3).tolist()
        }
        
        # Add type-specific analysis
        if df[column].dtype in ['int64', 'float64']:
            col_data.update({
                'min': df[column].min(),
                'max': df[column].max(),
                'mean': df[column].mean(),
                'median': df[column].median()
            })
        elif df[column].dtype == 'object':
            # For string/object columns
            non_null_values = df[column].dropna()
            if len(non_null_values) > 0:
                col_data.update({
                    'avg_length': non_null_values.str.len().mean(),
                    'max_length': non_null_values.str.len().max(),
                    'contains_numbers': any(str(x).replace('.','',1).isdigit() 
                                         for x in non_null_values.head(100)),
                    'most_common': df[column].value_counts().head(3).to_dict()
                })
        
        analysis[column] = col_data
    
    return analysis

def print_column_analysis(df):
    """Print a formatted analysis of all columns in the dataframe."""
    analysis = analyze_columns(df)
    
    print("\n📊 Column Analysis Report")
    print("=" * 100)
    
    for column, data in analysis.items():
        print(f"\n📌 Column: {column}")
        print("─" * 80)
        
        # Basic Info
        print(f"📋 Basic Information:")
        print(f"   • Type: {data['dtype']}")
        print(f"   • Null Values: {data['null_count']} ({data['null_percentage']:.1f}%)")
        print(f"   • Unique Values: {data['unique_count']}")
        print(f"   • Sample Values: {', '.join(str(x) for x in data['sample_values'])}")
        
        # Numeric Statistics
        if 'mean' in data:
            print(f"\n📈 Numeric Statistics:")
            print(f"   • Min: {data['min']:,.2f}")
            print(f"   • Max: {data['max']:,.2f}")
            print(f"   • Mean: {data['mean']:,.2f}")
            print(f"   • Median: {data['median']:,.2f}")
        
        # String Statistics
        if 'avg_length' in data:
            print(f"\n📝 String Statistics:")
            print(f"   • Average Length: {data['avg_length']:.1f}")
            print(f"   • Max Length: {data['max_length']}")
            print(f"   • Contains Numbers: {'✓' if data['contains_numbers'] else '✗'}")
            print(f"   • Most Common Values:")
            for value, count in data['most_common'].items():
                print(f"     - {value}: {count:,} occurrences")

# Example usage:
if __name__ == "__main__":
    df = sample_partition()
    print_column_analysis(df)

Sampling from: sample_data/part_1/may_july_chunk_11.csv.gz


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)



📊 Column Analysis Report

📌 Column: id
────────────────────────────────────────────────────────────────────────────────
📋 Basic Information:
   • Type: int64
   • Null Values: 0 (0.0%)
   • Unique Values: 50000
   • Sample Values: 1801321015319896315, 1801321014204178511, 1801321014040686814

📈 Numeric Statistics:
   • Min: 1,801,287,601,271,271,680.00
   • Max: 1,801,321,015,319,896,320.00
   • Mean: 1,801,304,750,555,860,992.00
   • Median: 1,801,305,189,049,210,880.00

📌 Column: text
────────────────────────────────────────────────────────────────────────────────
📋 Basic Information:
   • Type: object
   • Null Values: 0 (0.0%)
   • Unique Values: 49669
   • Sample Values: @BidenHQ Thank you, PM Meloni, for shepherding our "lost little POTUS" back to the herd.
I'm so sorry you and Macron, etc., have to provide nursing care Biden
This is REALLY not in your job description., @MuskUniverseUsa 9, @BidenHQ Lmfao. Biden has fallen down 17 times this year. You’re finding the one video whe

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


### Connect to and analyze tweets Database

In [7]:
def print_table_schema(db_path="tweets2.duckdb"):
    """
    Connect to tweets2.duckdb and print schema and column information.
    First lists available tables, then prints schema for each table.
    
    Args:
        db_path (str): Path to the DuckDB database file
    """
    try:
        # Connect to database
        with duckdb.connect(db_path) as con:
            # Get list of tables
            tables_query = "SELECT * FROM sqlite_master WHERE type='table'"
            tables = con.execute(tables_query).fetchdf()
            
            if len(tables) == 0:
                print("No tables found in the database.")
                return
                
            print("\nAvailable tables:")
            for _, row in tables.iterrows():
                table_name = row['name']
                print(f"\n{'='*80}")
                print(f"Table: {table_name}")
                
                # Get schema information
                schema_query = f"DESCRIBE {table_name}"
                schema_info = con.execute(schema_query).fetchdf()
                
                # Get row count
                row_count = con.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
                
                # Get distinct counts for each column
                distinct_counts = {}
                for col in schema_info['column_name']:
                    query = f"SELECT COUNT(DISTINCT {col}) FROM {table_name}"
                    distinct_counts[col] = con.execute(query).fetchone()[0]
                
                # Print results
                print(f"Total Rows: {row_count:,}")
                print("\nColumn Information:")
                print("-" * 80)
                print(f"{'Column Name':<30} {'Data Type':<20} {'Distinct Values':<15}")
                print("-" * 80)
                
                for _, row in schema_info.iterrows():
                    col_name = row['column_name']
                    print(f"{col_name:<30} {row['column_type']:<20} {distinct_counts[col_name]:,}")
                
    except Exception as e:
        print(f"Error: {str(e)}")

In [8]:
print_table_schema()


Available tables:

Table: tweets


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Total Rows: 34,761,518

Column Information:
--------------------------------------------------------------------------------
Column Name                    Data Type            Distinct Values
--------------------------------------------------------------------------------
id                             VARCHAR              32,285,002
text                           VARCHAR              31,056,384
url                            VARCHAR              32,144,972
timestamp                      TIMESTAMP            7,195,185
media                          VARCHAR              3,705,349
retweetedTweet                 BOOLEAN              2
retweetedTweetID               VARCHAR              38,261
retweetedUserID                VARCHAR              12,727
lang                           VARCHAR              101
replyCount                     BIGINT               10,499
retweetCount                   BIGINT               20,391
likeCount                      BIGINT               43,605
quoteCou

### NULL Value Statistics on tweets database

In [9]:
def analyze_null_values(db_path="tweets2.duckdb"):
    """
    Analyze null values in each column of all tables in the DuckDB database.
    
    Args:
        db_path (str): Path to the DuckDB database file
    """
    import duckdb
    from tabulate import tabulate
    
    try:
        with duckdb.connect(db_path) as con:
            # Get list of tables
            tables = con.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchdf()
            
            for _, row in tables.iterrows():
                table_name = row['name']
                print(f"\n{'='*80}")
                print(f"Table: {table_name}")
                
                # Get column names
                columns = con.execute(f"DESCRIBE {table_name}").fetchdf()['column_name'].tolist()
                
                # Calculate null statistics for each column
                null_stats = []
                for col in columns:
                    query = f"""
                    SELECT 
                        '{col}' as column_name,
                        COUNT(*) as total_rows,
                        COUNT({col}) as non_null_count,
                        COUNT(*) - COUNT({col}) as null_count,
                        ROUND(100.0 * (COUNT(*) - COUNT({col})) / COUNT(*), 2) as null_percentage
                    FROM {table_name}
                    """
                    stats = con.execute(query).fetchdf().iloc[0]
                    null_stats.append(stats)
                
                # Convert to DataFrame for nice display
                import pandas as pd
                stats_df = pd.DataFrame(null_stats)
                
                # Sort by null percentage descending
                stats_df = stats_df.sort_values('null_percentage', ascending=False)
                
                # Print formatted table
                print("\nNull Value Analysis:")
                print(tabulate(stats_df, headers='keys', tablefmt='pretty', showindex=False))
                
    except Exception as e:
        print(f"Error: {str(e)}")

# Run the analysis
analyze_null_values()


Table: tweets


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))


Null Value Analysis:
+-------------------+------------+----------------+------------+-----------------+
|    column_name    | total_rows | non_null_count | null_count | null_percentage |
+-------------------+------------+----------------+------------+-----------------+
|  cash_app_handle  |  34761518  |    1765592     |  32995926  |      94.92      |
|     verified      |  34761518  |    32715133    |  2046385   |      5.89       |
|        bio        |  34761518  |    32715133    |  2046385   |      5.89       |
|     username      |  34761518  |    32715133    |  2046385   |      5.89       |
|     location      |  34761518  |    32715133    |  2046385   |      5.89       |
|       text        |  34761518  |    33262431    |  1499087   |      4.31       |
|     timestamp     |  34761518  |    33262437    |  1499081   |      4.31       |
|       media       |  34761518  |    33262478    |  1499040   |      4.31       |
|       lang        |  34761518  |    33262483    |  1499035   | 

### Sampling from database at random

In [10]:
def print_random_rows(db_path="tweets2.duckdb", n_rows=10):
    """
    Print random rows from each table in the database in a nicely formatted table.
    
    Args:
        db_path (str): Path to the DuckDB database file
        n_rows (int): Number of random rows to sample
    """
    import duckdb
    from tabulate import tabulate
    
    try:
        with duckdb.connect(db_path) as con:
            # Get list of tables
            tables = con.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchdf()
            
            for _, row in tables.iterrows():
                table_name = row['name']
                print(f"\n{'='*80}")
                print(f"Table: {table_name}")
                print(f"Random {n_rows} rows:")
                
                # Query random rows
                query = f"""
                SELECT *
                FROM {table_name}
                ORDER BY RANDOM()
                LIMIT {n_rows}
                """
                sample_df = con.execute(query).fetchdf()
                
                # Print formatted table
                print(tabulate(sample_df, headers='keys', tablefmt='pretty', showindex=False))
                
    except Exception as e:
        print(f"Error: {str(e)}")

# Run the analysis
print_random_rows()


Table: tweets
Random 10 rows:


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------+---------------------+----------------------------------------------+----------------+------------------+-----------------+------+------------+--------------+-----------+------------+------------------------+------------------------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Statistics on top locations, and userNames

In [12]:
def analyze_top_user_stats(db_path="tweets2.duckdb", top_n=10):
    """
    Analyze top locations and usernames from the tweets database.
    
    Args:
        db_path (str): Path to the DuckDB database file
        top_n (int): Number of top entries to show
    """
    import duckdb
    from tabulate import tabulate
    
    try:
        with duckdb.connect(db_path) as con:
            # Top Locations Analysis
            location_query = f"""
            SELECT 
                location,
                COUNT(*) as tweet_count,
                COUNT(DISTINCT username) as unique_users,
                ROUND(AVG(replyCount), 2) as avg_replies,
                ROUND(AVG(retweetCount), 2) as avg_retweets,
                ROUND(AVG(likeCount), 2) as avg_likes
            FROM tweets 
            WHERE location IS NOT NULL
            GROUP BY location
            ORDER BY tweet_count DESC
            LIMIT {top_n}
            """
            
            # Top Users Analysis
            users_query = f"""
            SELECT 
                username,
                COUNT(*) as tweet_count,
                COUNT(DISTINCT location) as unique_locations,
                ROUND(AVG(replyCount), 2) as avg_replies,
                ROUND(AVG(retweetCount), 2) as avg_retweets,
                ROUND(AVG(likeCount), 2) as avg_likes,
                ROUND(AVG(LENGTH(text)), 2) as avg_tweet_length
            FROM tweets
            WHERE username IS NOT NULL
            GROUP BY username
            ORDER BY tweet_count DESC
            LIMIT {top_n}
            """
            
            # Execute queries
            top_locations = con.execute(location_query).fetchdf()
            top_users = con.execute(users_query).fetchdf()
            
            # Print results
            print(f"\n{'='*80}")
            print(f"Top {top_n} Locations by Tweet Count")
            print(tabulate(top_locations, headers='keys', tablefmt='pretty', showindex=False))
            
            print(f"\n{'='*80}")
            print(f"Top {top_n} Users by Tweet Count")
            print(tabulate(top_users, headers='keys', tablefmt='pretty', showindex=False))
            
            # Additional Statistics
            print(f"\n{'='*80}")
            print("General Statistics:")
            
            stats_query = """
            SELECT 
                COUNT(DISTINCT location) as total_unique_locations,
                COUNT(DISTINCT username) as total_unique_users,
                ROUND(AVG(replyCount), 2) as overall_avg_replies,
                ROUND(AVG(retweetCount), 2) as overall_avg_retweets,
                ROUND(AVG(likeCount), 2) as overall_avg_likes
            FROM tweets
            """
            
            stats = con.execute(stats_query).fetchdf()
            print(tabulate(stats, headers='keys', tablefmt='pretty', showindex=False))
            
    except Exception as e:
        print(f"Error: {str(e)}")

# Run the analysis
analyze_top_user_stats()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))


Top 10 Locations by Tweet Count
+-----------------+-------------+--------------+-------------+--------------+-----------+
|    location     | tweet_count | unique_users | avg_replies | avg_retweets | avg_likes |
+-----------------+-------------+--------------+-------------+--------------+-----------+
|                 |  13351190   |   1578490    |    5.14     |    27.67     |   76.18   |
|  United States  |   916734    |    62331     |    18.07    |     60.6     |  201.73   |
|       USA       |   489250    |    26627     |    13.63    |    39.33     |  119.89   |
|  Florida, USA   |   287995    |    18640     |    12.13    |    51.42     |  162.34   |
|   Texas, USA    |   233727    |    16500     |    14.99    |    36.62     |  100.94   |
| California, USA |   189954    |    15258     |     9.8     |    23.85     |   39.93   |
| Washington, DC  |   157711    |    11248     |   164.38    |    218.95    |  1239.49  |
|      Earth      |   146283    |    13312     |    14.34    |    3