# Financial Tweet Data Preparation

This notebook handles the preparation of financial tweets data:
1. Loading CSV files
2. Data cleaning
3. Preparing for Gemini labeling
4. Computing basic dataset statistics (preprocessing for model training is a follow-up step; see "Next Steps")

In [None]:
import os
import re
from glob import glob

import numpy as np
import pandas as pd
from tqdm import tqdm

# Configure how pandas renders DataFrames in this notebook:
# show every column, cap the output at 50 rows, and allow wide (1000-char) lines.
pd.options.display.max_columns = None
pd.options.display.max_rows = 50
pd.options.display.width = 1000

## 1. Load Tweet Data

Load every CSV file in the data/tweets directory whose name starts with 'row_' and combine them into a single DataFrame.

In [None]:
def load_tweet_files(data_dir='../data/tweets/'):
    """Load every ``row_*.csv`` file in ``data_dir`` into a single DataFrame.

    Each file is read with ``pd.read_csv`` and tagged with a ``source_file``
    column holding the file's base name. Files that fail to load are reported
    on stdout and skipped, so one bad file does not abort the whole run.

    Args:
        data_dir: Directory to search for files named ``row_*.csv``.

    Returns:
        A DataFrame with the rows of all successfully loaded files
        concatenated under a fresh 0..n-1 index.

    Raises:
        ValueError: If no file matches the pattern, or if none of the
            matching files could be loaded.
    """
    pattern = os.path.join(data_dir, 'row_*.csv')
    # glob() yields matches in arbitrary order; sorting keeps the row order
    # (and therefore any batches cut from it later) reproducible across runs.
    all_files = sorted(glob(pattern))
    if not all_files:
        raise ValueError(f"No files matching {pattern} were found")

    dataframes = []
    for file in tqdm(all_files, desc='Loading files'):
        try:
            df = pd.read_csv(file)
        except Exception as e:
            # Deliberate best effort: report the unreadable file and move on.
            print(f"Error loading {file}: {str(e)}")
        else:
            # Add source file name as a column
            df['source_file'] = os.path.basename(file)
            dataframes.append(df)

    if not dataframes:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(
            f"None of the {len(all_files)} files matching {pattern} could be loaded"
        )

    return pd.concat(dataframes, ignore_index=True)

# Load all tweet data from the default directory ('../data/tweets/')
df = load_tweet_files()
print(f"Total tweets loaded: {len(df)}")
# Trailing expression: rendered as the cell output (first rows) in a notebook
df.head()

## 2. Data Cleaning

Clean the tweet text by removing URLs, user mentions, and extra whitespace, then drop tweets that are empty after cleaning.

In [None]:
# Anything starting with "http" (covers https too), or a "www." host that
# begins at a word boundary so words merely containing "www" are left alone.
_URL_PATTERN = re.compile(r'http\S+|\bwww\.\S+')
# "@" followed by word characters (a user mention).
_MENTION_PATTERN = re.compile(r'@\w+')


def clean_tweet(text):
    """Strip URLs, user mentions and redundant whitespace from a tweet.

    Args:
        text: The raw tweet. Missing values (``None``/NaN) are accepted and
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned text with runs of whitespace collapsed to single spaces
        and no leading/trailing whitespace; ``""`` for missing input.
    """
    if pd.isna(text):
        return ""

    # Convert to string if not already
    text = str(text)

    # Remove URLs
    text = _URL_PATTERN.sub('', text)

    # Remove user mentions
    text = _MENTION_PATTERN.sub('', text)

    # split() with no arguments drops leading/trailing whitespace and splits on
    # any whitespace run, so the joined result needs no further strip().
    return ' '.join(text.split())

# Clean the tweets: apply clean_tweet to each value of the 'text' column and
# store the result in a new 'cleaned_text' column
df['cleaned_text'] = df['text'].apply(clean_tweet)

# Remove empty tweets (missing text is cleaned to "", so those rows are dropped
# too) and renumber the index from 0
df = df[df['cleaned_text'].str.len() > 0].reset_index(drop=True)
print(f"Tweets after cleaning: {len(df)}")

## 3. Prepare for Gemini Labeling

Create a format suitable for Gemini API labeling

In [None]:
def prepare_gemini_format(df, batch_size=100):
    """Split the labeling columns of ``df`` into consecutive batches.

    Args:
        df: DataFrame containing at least the ``cleaned_text`` and
            ``source_file`` columns.
        batch_size: Maximum number of rows per batch; must be positive.

    Returns:
        A list of DataFrames holding only ``cleaned_text`` and
        ``source_file``, in the original row order. Every batch has
        ``batch_size`` rows except possibly the last; the list is empty when
        ``df`` has no rows.

    Raises:
        ValueError: If ``batch_size`` is zero or negative. Without this check
            zero fails with a ZeroDivisionError-style arithmetic error and a
            negative value would silently yield no usable batches.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Select relevant columns (copied so the batches are independent of df)
    labeling_df = df[['cleaned_text', 'source_file']].copy()

    # iloc slicing clamps at the end of the frame, so the last batch is simply
    # shorter when the row count is not a multiple of batch_size.
    return [
        labeling_df.iloc[start:start + batch_size]
        for start in range(0, len(labeling_df), batch_size)
    ]

# Prepare batches for labeling (uses the default batch_size of 100 rows)
batches = prepare_gemini_format(df)
print(f"Number of batches: {len(batches)}")

# Save first batch as example
# NOTE: batches[0] raises IndexError if df has no rows (no batches are produced)
example_batch = batches[0]
# index=False keeps the DataFrame index out of the written CSV
example_batch.to_csv('../data/batch_0_for_labeling.csv', index=False)
# Trailing expression: rendered as the cell output (first rows) in a notebook
example_batch.head()

## 4. Data Statistics

Calculate some basic statistics about our dataset

In [None]:
def calculate_statistics(df):
    """Summarise the dataset as a labelled pandas Series.

    Reads the ``source_file`` and ``cleaned_text`` columns of ``df`` and
    reports the row count, the number of distinct source files, and the
    mean, maximum and minimum character length of the cleaned text.
    """
    # Character count of every cleaned tweet, shared by the three length stats
    text_lengths = df['cleaned_text'].str.len()

    labels = [
        'Total Tweets',
        'Unique Sources',
        'Avg Tweet Length',
        'Max Tweet Length',
        'Min Tweet Length',
    ]
    values = [
        len(df),
        df['source_file'].nunique(),
        text_lengths.mean(),
        text_lengths.max(),
        text_lengths.min(),
    ]

    return pd.Series(dict(zip(labels, values)))

# Display statistics: compute the summary Series for the cleaned dataset and print it
stats = calculate_statistics(df)
print("\nDataset Statistics:")
print(stats)

## Next Steps

1. Use Gemini API to label the batches (will be provided by user)
2. Process the labeled data
3. Prepare final dataset for model training