# Financial Tweet Sentiment Labeling with Gemini

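This notebook labels financial tweets with Google's Gemini API:
1. Load each `row_*.csv` file from `../data/tweets/`
2. Classify every tweet in the `text` column with Gemini
3. Save each labeled file as `<name>_labeled.csv`, then combine them into `../data/all_labeled_tweets.csv`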

Sentiment Labels:
- STRONGLY_POSITIVE
- POSITIVE
- NEUTRAL
- NEGATIVE
- STRONGLY_NEGATIVE
- NOT_RELATED
- UNCERTAIN

In [None]:
import os
import pandas as pd
from glob import glob
import google.generativeai as genai
from tqdm import tqdm
import time

# Read the Gemini API key from the environment and hand it to the SDK.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

# Module-level model handle; get_sentiment() sends its prompts through it.
model = genai.GenerativeModel('gemini-pro')

In [None]:
def setup_prompt():
    """Return the classification prompt template for a single tweet.

    The template lists the seven sentiment labels with one example each and
    contains exactly one ``{text}`` placeholder, which the caller fills in
    with ``str.format(text=...)``.
    """
    template = """
    You are a financial sentiment analyzer. Classify the given tweet's sentiment into one of these categories:

    STRONGLY_POSITIVE - Very bullish, highly confident optimistic outlook
    POSITIVE - Generally optimistic, bullish view
    NEUTRAL - Factual, balanced, or no clear sentiment
    NEGATIVE - Generally pessimistic, bearish view
    STRONGLY_NEGATIVE - Very bearish, highly confident pessimistic outlook
    NOT_RELATED - Not related to financial markets or trading
    UNCERTAIN - Ambiguous or unclear sentiment

    Examples:
    "Breaking: Company XYZ doubles profit forecast!" -> STRONGLY_POSITIVE
    "Expecting modest gains next quarter" -> POSITIVE
    "Market closed at 35,000" -> NEUTRAL
    "Concerned about rising rates" -> NEGATIVE
    "Crash incoming, sell everything!" -> STRONGLY_NEGATIVE
    "Great pizza for lunch today" -> NOT_RELATED
    "Something might happen with the markets" -> UNCERTAIN

    Tweet to analyze: {text}

    Format: Return only one word from: STRONGLY_POSITIVE, POSITIVE, NEUTRAL, NEGATIVE, STRONGLY_NEGATIVE, NOT_RELATED, UNCERTAIN
    """
    return template

def get_sentiment(text, retries=3):
    """Classify one tweet with Gemini, retrying when an attempt fails.

    Args:
        text: Tweet text substituted into the prompt template.
        retries: Maximum number of API attempts.

    Returns:
        One of the seven sentiment labels. ``'UNCERTAIN'`` is also the
        fallback when every attempt fails or ``retries`` is not positive,
        so an API failure is indistinguishable from a genuine
        ``'UNCERTAIN'`` classification in the return value.
    """
    # Built once per call instead of once per attempt.
    valid_labels = frozenset((
        'STRONGLY_POSITIVE', 'POSITIVE', 'NEUTRAL', 'NEGATIVE',
        'STRONGLY_NEGATIVE', 'NOT_RELATED', 'UNCERTAIN',
    ))
    # Characters that may wrap an otherwise valid one-word answer
    # (quotes, markdown emphasis, trailing punctuation, brackets).
    wrapper_chars = " \t\r\n\"'`*.,:;!?()[]{}<>"

    prompt = setup_prompt().format(text=text)

    for attempt in range(retries):
        try:
            response = model.generate_content(prompt)
            raw = response.text.strip().upper()

            # Accept trivially decorated replies such as "POSITIVE.",
            # "**NEUTRAL**" or "STRONGLY POSITIVE" rather than spending a
            # retry (and possibly ending in a false 'UNCERTAIN') on them.
            sentiment = (
                raw.strip(wrapper_chars).replace(' ', '_').replace('-', '_')
            )
            if sentiment in valid_labels:
                return sentiment
            raise ValueError(f"Invalid sentiment: {raw}")

        except Exception as e:
            # Deliberately broad: any failure of the API call, of reading
            # the reply, or of validating it is treated as retryable.
            if attempt == retries - 1:
                print(f"Error processing text: {text}\nError: {str(e)}")
                return 'UNCERTAIN'
            time.sleep(1)  # Wait before retry

    # Only reached when the loop never ran (retries <= 0).
    return 'UNCERTAIN'

In [None]:
def process_file(file_path):
    """Label every tweet in one CSV file and save the result beside it.

    The input must have a ``text`` column. Each value is classified with
    ``get_sentiment`` and the labels are written as a ``sentiment`` column
    to ``<name>_labeled<ext>`` in the same directory.

    The file is skipped, with nothing written, when either:
      * the labeled output file already exists (a previous run finished), or
      * the input already has a fully populated ``sentiment`` column
        (for example when a labeled output file is passed in).

    Args:
        file_path: Path of the CSV file to label.

    Returns:
        None.
    """
    print(f"Processing {file_path}")

    # Derive the output name from the extension only. str.replace('.csv', ...)
    # would rewrite every '.csv' in the path, and would leave the path
    # unchanged (overwriting the input) when the name has no lowercase '.csv'.
    root, ext = os.path.splitext(file_path)
    output_path = f"{root}_labeled{ext}"

    # Skip if a previous run already produced the output. The output is only
    # written once every tweet has been labeled, so its existence means the
    # file is complete. Without this check every rerun re-labels the file.
    if os.path.exists(output_path):
        print(f"File already processed: {file_path}")
        return

    # Load data
    df = pd.read_csv(file_path)

    # Skip if the input itself already carries a complete set of labels
    if 'sentiment' in df.columns and not df['sentiment'].isnull().any():
        print(f"File already processed: {file_path}")
        return

    # Process each tweet
    sentiments = []
    for text in tqdm(df['text'], desc="Analyzing tweets"):
        sentiments.append(get_sentiment(text))
        time.sleep(0.1)  # Rate limiting

    # Add sentiments to dataframe
    df['sentiment'] = sentiments

    # Save results
    df.to_csv(output_path, index=False)
    print(f"Saved labeled data to {output_path}")

    # Print statistics
    print("\nSentiment Distribution:")
    print(df['sentiment'].value_counts())

In [None]:
# Process all row_*.csv files
data_dir = '../data/tweets/'

# The pattern also matches row_*_labeled.csv outputs written by earlier runs,
# so those are filtered out. glob returns matches in arbitrary order; sorting
# makes the processing order reproducible.
files = sorted(
    path
    for path in glob(os.path.join(data_dir, 'row_*.csv'))
    if not path.endswith('_labeled.csv')
)

for file_path in files:
    process_file(file_path)
    print("\n" + "="*50 + "\n")

In [None]:
# Combine all labeled files
# Sorted so the row order of the combined file is the same on every run
# (glob returns matches in arbitrary order).
labeled_files = sorted(glob(os.path.join(data_dir, '*_labeled.csv')))

# pd.concat fails with an opaque "No objects to concatenate" on an empty
# list; fail with a message that says what is actually missing.
if not labeled_files:
    raise ValueError(
        f"No '*_labeled.csv' files found in {data_dir!r}; "
        "run the labeling step first."
    )

combined_data = [pd.read_csv(file_path) for file_path in labeled_files]

final_df = pd.concat(combined_data, ignore_index=True)
final_df.to_csv('../data/all_labeled_tweets.csv', index=False)

print("\nFinal Dataset Statistics:")
print(f"Total tweets: {len(final_df)}")
print("\nSentiment Distribution:")
print(final_df['sentiment'].value_counts())