In [7]:
def run_query(query, db_path="nl2sql_agent/database/tweets.duckdb"):
    """
    Run a single SQL statement against a DuckDB file and return the rows.

    The connection is opened for this call only and is used as a context
    manager, so it is released when the statement has been fetched.

    Args:
        query (str): SQL text handed unchanged to ``execute``.
        db_path (str): Location of the DuckDB database file.

    Returns:
        pandas.DataFrame: The result of ``fetchdf()`` for the statement.
    """
    with duckdb.connect(db_path) as connection:
        cursor = connection.execute(query)
        result_frame = cursor.fetchdf()
    return result_frame
    
# Top five `lang` values ranked by the sum of their like counts.
# The adjacent literals concatenate into a single one-line SQL statement.
query = (
    "SELECT lang, SUM(likeCount) AS totalLikes "
    "FROM tweets "
    "GROUP BY lang "
    "ORDER BY totalLikes DESC "
    "LIMIT 5"
)
results = run_query(query)

In [8]:
# Show the top-5 aggregate computed in the previous cell.
# NOTE(review): the recorded output lists `0` and dict-like strings
# (e.g. "{'count': '146', ...}") as `lang` values, so the column seems to hold
# non-language data for some rows - confirm against the ingestion step.
results

Unnamed: 0,lang,totalLikes
0,0,1.814655e+18
1,"{'count': '146', 'state': 'EnabledWithCount'}",9.28061e+17
2,"{'count': '35', 'state': 'EnabledWithCount'}",582452100.0
3,en,491703600.0
4,es,9732924.0


In [13]:
def investigate_lang_values(db_path="database/tweets.duckdb"):
    """
    Print the ten most frequent `lang` values with their row and like totals.

    NULL languages are reported as 'NULL' and empty strings as 'EMPTY' so that
    both show up as readable labels in the printed report.

    Args:
        db_path (str): DuckDB file forwarded to ``run_query``.
    """
    query = """
    SELECT 
        CASE 
            WHEN lang IS NULL THEN 'NULL'
            WHEN lang = '' THEN 'EMPTY'
            ELSE lang 
        END as lang_value,
        COUNT(*) as count,
        SUM(likeCount) as total_likes
    FROM tweets 
    GROUP BY lang
    ORDER BY count DESC
    LIMIT 10
    """

    divider = "-" * 50
    lang_stats = run_query(query, db_path)

    print("\nLanguage value analysis:")
    print(divider)
    for _position, entry in lang_stats.iterrows():
        # str() keeps the left-aligned format spec valid for non-string labels.
        label = str(entry['lang_value'])
        print(f"Value: {label:<40} Count: {entry['count']:,}")
        print(f"Total Likes: {entry['total_likes']:,}")
        print(divider)

#### Unique language values in the data and issues with the `lang` column

In [14]:
# NOTE(review): `print_unique_langs` is not defined in any cell visible here;
# confirm its defining cell still exists so this call survives a fresh-kernel run.
print_unique_langs()


Found 103 unique language values:
----------------------------------------
Language: en              Count: 17,451,496
Language: qme             Count: 692,539
Language: es              Count: 388,020
Language: und             Count: 139,340
Language: fr              Count: 136,226
Language: pt              Count: 135,501
Language: et              Count: 107,133
Language: qht             Count: 95,801
Language: de              Count: 88,517
Language: ja              Count: 85,249
Language: it              Count: 64,833
Language: tr              Count: 63,103
Language: tl              Count: 62,417
Language: eu              Count: 59,243
Language: zxx             Count: 56,794
Language: qam             Count: 45,304
Language: in              Count: 45,236
Language: nl              Count: 41,884
Language: pl              Count: 23,988
Language: art             Count: 19,025
Language: no              Count: 18,850
Language: ht              Count: 16,183
Language: ca              Count: 1

## Sample from raw data for preliminary analysis

In [6]:
def sample_partition(data_dir="sample_data/part_1", nrows=100000):
    """
    Load the leading rows of the first ``*.csv.gz`` file in a partition directory.

    Matching files are sorted by path before the first one is taken, because
    ``glob.glob`` returns results in arbitrary order and the chosen file would
    otherwise vary between runs and machines.

    Args:
        data_dir (str): Directory searched for ``*.csv.gz`` files.
        nrows (int): Maximum number of rows read from the top of the file.
            This is a head read, not a random sample.

    Returns:
        pandas.DataFrame: Up to ``nrows`` rows from the selected file.

    Raises:
        FileNotFoundError: If ``data_dir`` contains no ``*.csv.gz`` file.
    """
    import pandas as pd
    import glob
    import os

    pattern = os.path.join(data_dir, "*.csv.gz")
    files = sorted(glob.glob(pattern))

    if not files:
        raise FileNotFoundError(f"No CSV.GZ files found in {data_dir}")

    # Lexicographically first match, so repeated runs pick the same file.
    file_path = files[0]
    print(f"Sampling from: {file_path}")

    # `nrows` stops parsing after that many rows, so the rest of the
    # (potentially large) file is never loaded into memory.
    df = pd.read_csv(file_path, compression='gzip', nrows=nrows)

    return df

# sample_df = sample_partition()
# print("\nSample shape:", sample_df.shape)
# print("\nSample columns:", sample_df.columns.tolist())
# print("\nFirst few rows:")
# print(sample_df.head())

In [7]:
import os 

def sample_and_save_partition(output_path="sample_data/sampled_tweets_b.csv"):
    """
    Load a sample via ``sample_partition()`` and write it to a CSV file.

    Args:
        output_path (str): Path where the sampled CSV will be saved. A missing
            parent directory is created; a bare filename is written to the
            current working directory.

    Returns:
        pandas.DataFrame: The frame returned by ``sample_partition()``.
    """
    # Get the sample
    df = sample_partition()

    # os.path.dirname() is "" for a bare filename and os.makedirs("") raises,
    # so only create a directory when the path actually names one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save to CSV without the DataFrame index column
    df.to_csv(output_path, index=False)
    print(f"\nSaved {len(df)} rows to: {output_path}")

    return df

# Example usage:
# The recorded output below this cell shows the call did run here, so the
# `__main__` guard does not stop it from executing in the notebook.
# NOTE(review): that output reports 50000 saved rows although sample_partition
# requests nrows=100000 - either the file is shorter or the output is stale.
if __name__ == "__main__":
    df = sample_and_save_partition()

Sampling from: sample_data/part_1/may_july_chunk_11.csv.gz

Saved 50000 rows to: sample_data/sampled_tweets_b.csv


In [4]:
def analyze_columns(df):
    """
    Profile every column of a DataFrame.

    For each column the result holds the dtype name, null count and
    percentage, number of unique values and up to three non-null samples.
    Integer and float columns additionally get min / max / mean / median;
    object and string columns get length statistics, a digit check over the
    first 100 non-null values and the three most common values.

    Args:
        df (pd.DataFrame): Input dataframe to analyze

    Returns:
        dict: Maps each column name to a dict of the statistics above. For an
            empty frame ``null_percentage`` is 0.0; for a numeric column with
            no non-null values the four numeric statistics are NaN.
    """
    # Imported locally so the function does not depend on a notebook-level `pd`.
    from pandas.api import types as pd_types

    analysis = {}
    total_rows = len(df)

    for column in df.columns:
        series = df[column]
        non_null_values = series.dropna()
        null_count = series.isna().sum()

        col_data = {
            'dtype': str(series.dtype),
            'null_count': null_count,
            # An empty frame would otherwise compute 0 / 0 (NaN plus a warning).
            'null_percentage': (null_count / total_rows) * 100 if total_rows else 0.0,
            'unique_count': series.nunique(),
            'sample_values': non_null_values.head(3).tolist()
        }

        # Accept any integer/float width (int32, float32, nullable Int64, ...),
        # not only the int64/float64 names; booleans are neither.
        is_number = (pd_types.is_integer_dtype(series)
                     or pd_types.is_float_dtype(series))
        is_text = (pd_types.is_object_dtype(series)
                   or pd_types.is_string_dtype(series))

        if is_number:
            if len(non_null_values) > 0:
                col_data.update({
                    'min': series.min(),
                    'max': series.max(),
                    'mean': series.mean(),
                    'median': series.median()
                })
            else:
                # Nothing to aggregate: report NaN directly instead of calling
                # the reducers, which emit "Mean of empty slice" warnings.
                nan = float('nan')
                col_data.update({'min': nan, 'max': nan, 'mean': nan, 'median': nan})
        elif is_text:
            if len(non_null_values) > 0:
                lengths = non_null_values.str.len()
                col_data.update({
                    'avg_length': lengths.mean(),
                    'max_length': lengths.max(),
                    # True when any of the first 100 values looks like a plain
                    # non-negative number (digits with at most one dot).
                    'contains_numbers': any(str(x).replace('.', '', 1).isdigit()
                                            for x in non_null_values.head(100)),
                    'most_common': series.value_counts().head(3).to_dict()
                })

        analysis[column] = col_data

    return analysis

def print_column_analysis(df):
    """
    Write a human-readable per-column report for a dataframe to stdout.

    The statistics come from ``analyze_columns``; the numeric and string
    sections are printed only for columns whose analysis contains them.

    Args:
        df (pd.DataFrame): Input dataframe to analyze
    """
    report = analyze_columns(df)

    print("\nColumn Analysis Report")
    print("=" * 80)

    for name in report:
        stats = report[name]

        # Header shared by every column
        print(f"\nColumn: {name}")
        print("-" * 40)
        print(f"Data Type: {stats['dtype']}")
        print(f"Null Count: {stats['null_count']} ({stats['null_percentage']:.2f}%)")
        print(f"Unique Values: {stats['unique_count']}")
        print(f"Sample Values: {stats['sample_values']}")

        # The 'mean' key marks a column that received numeric statistics
        has_numeric_stats = 'mean' in stats
        if has_numeric_stats:
            print("Numeric Statistics:")
            print(f"  Min: {stats['min']}")
            print(f"  Max: {stats['max']}")
            print(f"  Mean: {stats['mean']:.2f}")
            print(f"  Median: {stats['median']}")

        # The 'avg_length' key marks a column that received string statistics
        has_string_stats = 'avg_length' in stats
        if has_string_stats:
            print("String Statistics:")
            print(f"  Average Length: {stats['avg_length']:.2f}")
            print(f"  Max Length: {stats['max_length']}")
            print(f"  Contains Numbers: {stats['contains_numbers']}")
            print("  Most Common Values:")
            top_values = stats['most_common']
            for entry in top_values:
                print(f"    - {entry}: {top_values[entry]} occurrences")

# Example usage:
# Loads a sample with sample_partition() and prints the per-column report.
# The recorded output below this cell shows the block did execute here, so the
# `__main__` guard does not stop it from running in the notebook.
if __name__ == "__main__":
    df = sample_partition()
    print_column_analysis(df)

Sampling from: sample_data/part_1/may_july_chunk_11.csv.gz

Column Analysis Report

Column: id
----------------------------------------
Data Type: int64
Null Count: 0 (0.00%)
Unique Values: 1000
Sample Values: [1801321015319896315, 1801321014204178511, 1801321014040686814]
Numeric Statistics:
  Min: 1801320226736218583
  Max: 1801321015319896315
  Mean: 1801320659818732544.00
  Median: 1.8013207082866342e+18

Column: text
----------------------------------------
Data Type: object
Null Count: 0 (0.00%)
Unique Values: 998
Sample Values: ['@BidenHQ Thank you, PM Meloni, for shepherding our "lost little POTUS" back to the herd.\nI\'m so sorry you and Macron, etc., have to provide nursing care Biden\nThis is REALLY not in your job description.', '@MuskUniverseUsa 9', '@BidenHQ Lmfao. Biden has fallen down 17 times this year. You’re finding the one video where it was raining and he was wearing dress shoes down a metal ramp 😂😂😂 you guy have nothing on him.  \nGo find the cheer he got from tho

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [4]:
def get_first_five_rows(db_path="nl2sql_agent/database/tweets.duckdb"):
    """
    Fetch five rows of the ``tweets`` table as a pandas DataFrame.

    The statement has no ORDER BY, so which five rows come back is left to
    the database engine.

    Args:
        db_path (str): Path to the DuckDB database file.

    Returns:
        pandas.DataFrame: The rows produced by ``SELECT * FROM tweets LIMIT 5``.
    """
    with duckdb.connect(db_path) as connection:
        return connection.execute("SELECT * FROM tweets LIMIT 5").fetchdf()

# Execute and display results.
# `df` is rebound at notebook level here; print() renders the frame as
# wrapped plain text (see the column groups in the recorded output).
df = get_first_five_rows()
print(df)

                    id                                               text  \
0  1801041792923578484  @lukepbeasley I cant imagine anyone actually f...   
1  1801041792630227173  Voters can also sway me away from voting  for ...   
2  1801041792592224521  @PoodleHead57 @BobOnderMO Can you name that am...   
3  1801041791463866688  @Morning_Joe @JoeNBC The fact remains that Joe...   
4  1801041790952231228  @BidenHQ That's funny you're obviously trying ...   

                                                 url       epoch media  \
0  https://twitter.com/orgneyezedchaos/status/180...  1718236799    []   
1  https://twitter.com/Brandon62294232/status/180...  1718236799    []   
2  https://twitter.com/JohnRMBR911/status/1801041...  1718236799    []   
3  https://twitter.com/andy_leq/status/1801041791...  1718236799    []   
4  https://twitter.com/Ranchhandlb7/status/180104...  1718236799    []   

  retweetedTweet retweetedTweetID retweetedUserID               id_str lang  \
0          Fa

In [5]:
def get_retweets(db_path="nl2sql_agent/database/tweets.duckdb", limit=5):
    """
    Get rows where retweetedTweetId is not null.

    Args:
        db_path (str): Path to database
        limit (int): Maximum number of rows to return. Must be a non-negative
            integer, or a value ``int()`` converts to one.

    Returns:
        pandas.DataFrame: Up to ``limit`` matching rows. There is no ORDER BY,
            so which rows are returned is left to the database engine.

    Raises:
        ValueError: If ``limit`` cannot be converted to an integer or is negative.
    """
    # `limit` is spliced into the SQL text below, so reduce it to a plain
    # integer first; arbitrary strings must never reach the statement.
    try:
        row_limit = int(limit)
    except (TypeError, ValueError):
        raise ValueError(f"limit must be an integer, got {limit!r}") from None
    if row_limit < 0:
        raise ValueError(f"limit must be non-negative, got {row_limit}")

    query = """
    SELECT *
    FROM tweets 
    WHERE retweetedTweetId IS NOT NULL
    LIMIT {}
    """.format(row_limit)

    with duckdb.connect(db_path) as con:
        df = con.execute(query).fetchdf()
    return df

# Execute and display results.
# NOTE(review): in the recorded output, row 0 shows free text in the `id` and
# `url` columns, so some rows look column-shifted - confirm against the
# ingestion step before relying on these values.
df = get_retweets()
print(df)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

                                                  id  \
0  Yes of course its a physically demanding job b...   
1                              1.805027817471792e+18   
2                               1.80502769733014e+18   
3                             1.8050277992389263e+18   
4                              1.805027642371888e+18   

                                                text  \
0                                the laziest PM ever   
1  RT @drvolts: Trump's cognitive state is so muc...   
2  RT @GuntherEagleman: Biden’s Rally in PA 😂😂😂 h...   
3  RT @TheWeekendMSNBC: The Biden campaign is gea...   
4  RT @wajacobson: Biden slowed down arms deliver...   

                                                 url  \
0                       managed it-am sure he will!"   
1  https://twitter.com/lizditz/status/18050278174...   
2  https://twitter.com/BobSmit54406315/status/180...   
3  https://twitter.com/MLGoley/status/18050277992...   
4  https://twitter.com/JeffSpartyjeff/status/1

In [4]:
def _quote_identifier(name):
    """Wrap ``name`` in double quotes (doubling embedded quotes) for use in SQL."""
    return '"' + str(name).replace('"', '""') + '"'


def print_table_schema(db_path="tweets2.duckdb"):
    """
    Print schema and column information for every table in a DuckDB database.

    Tables are listed from ``sqlite_master``. For each one the function prints
    the total row count and, per column, its data type and the number of
    distinct values. Table and column names are double-quoted before being
    placed into SQL so names with spaces, reserved words or mixed case work.

    Any exception is caught and reported as a single ``Error: ...`` line
    rather than raised.

    Args:
        db_path (str): Path to the DuckDB database file
    """
    try:
        # Connect to database
        with duckdb.connect(db_path) as con:
            # Get list of tables
            tables_query = "SELECT * FROM sqlite_master WHERE type='table'"
            tables = con.execute(tables_query).fetchdf()

            if len(tables) == 0:
                print("No tables found in the database.")
                return

            print("\nAvailable tables:")
            for table_name in tables['name']:
                quoted_table = _quote_identifier(table_name)
                print(f"\n{'='*80}")
                print(f"Table: {table_name}")

                # Get schema information
                schema_info = con.execute(f"DESCRIBE {quoted_table}").fetchdf()

                # Get row count
                row_count = con.execute(f"SELECT COUNT(*) FROM {quoted_table}").fetchone()[0]

                # Get distinct counts for each column (one query per column)
                distinct_counts = {}
                for col in schema_info['column_name']:
                    query = f"SELECT COUNT(DISTINCT {_quote_identifier(col)}) FROM {quoted_table}"
                    distinct_counts[col] = con.execute(query).fetchone()[0]

                # Print results
                print(f"Total Rows: {row_count:,}")
                print("\nColumn Information:")
                print("-" * 80)
                print(f"{'Column Name':<30} {'Data Type':<20} {'Distinct Values':<15}")
                print("-" * 80)

                # Separate loop variable so the table loop's state is not shadowed.
                for _, column_row in schema_info.iterrows():
                    col_name = column_row['column_name']
                    print(f"{col_name:<30} {column_row['column_type']:<20} {distinct_counts[col_name]:,}")

    except Exception as e:
        # Best-effort inspection helper: report the failure instead of raising.
        print(f"Error: {str(e)}")

In [6]:
# Called with no argument, so the default db_path ("tweets2.duckdb") is inspected.
print_table_schema()


Available tables:

Table: tweets
Total Rows: 34,761,518

Column Information:
--------------------------------------------------------------------------------
Column Name                    Data Type            Distinct Values
--------------------------------------------------------------------------------
id                             VARCHAR              32,285,002
text                           VARCHAR              31,056,384
url                            VARCHAR              32,144,972
timestamp                      TIMESTAMP            7,195,185
media                          VARCHAR              3,705,349
retweetedTweet                 BOOLEAN              2
retweetedTweetID               VARCHAR              38,261
retweetedUserID                VARCHAR              12,727
lang                           VARCHAR              101
replyCount                     BIGINT               10,499
retweetCount                   BIGINT               20,391
likeCount                      BI