# Icebreaker Generator via Claude Haiku API

This notebook generates personalized icebreaker messages for cold outreach using Claude Haiku.

In [None]:
import pandas as pd
import anthropic
import os
import time
from datetime import datetime

# Paths
# The defaults point at the original author's machine. Set the environment
# variables ICEBREAKER_CSV_PATH / ICEBREAKER_OUTPUT_DIR to run the notebook
# somewhere else without editing this cell.
csv_path = os.environ.get(
    "ICEBREAKER_CSV_PATH",
    r"C:\Users\79818\Downloads\call centers US UK Aus 10-100 - 10-50.csv",
)
output_dir = os.environ.get(
    "ICEBREAKER_OUTPUT_DIR",
    r"C:\Users\79818\Desktop\Outreach - new\data\processed",
)

# Create output dir (exist_ok=True makes re-running the cell harmless)
os.makedirs(output_dir, exist_ok=True)

print(f"CSV Path: {csv_path}")
print(f"Output Dir: {output_dir}")

In [None]:
# Read the lead list into a DataFrame and report its size and leading columns
df = pd.read_csv(csv_path)

row_count = len(df)
leading_columns = list(df.columns)[:10]  # only the first ten, to keep output short
print(f"Loaded {row_count} rows")
print(f"Columns: {leading_columns}...")

In [None]:
# Build the Anthropic client from the ANTHROPIC_API_KEY environment variable.
# Only whether a key is present is printed, never the key itself.
api_key = os.getenv('ANTHROPIC_API_KEY')
key_is_set = bool(api_key)
print(f"API Key present: {key_is_set}")

client = anthropic.Anthropic(api_key=api_key)
print("Anthropic client initialized")

In [None]:
# Smoke test: send one tiny request to confirm the key and model name work
smoke_request = {
    "model": "claude-haiku-4-5-20251001",
    "max_tokens": 30,
    "messages": [{"role": "user", "content": "Say OK"}],
}
test_response = client.messages.create(**smoke_request)

first_block = test_response.content[0]
print(f"API Test: {first_block.text}")

In [None]:
def _clean_field(value):
    """Return *value* as a stripped string, mapping missing values to ''."""
    if value is None:
        return ''
    text = str(value).strip()
    # A missing CSV cell is NaN, which str() renders as 'nan'. Sending that
    # to the model would make it treat "nan" as a real title/company/city.
    return '' if text == 'nan' else text


def generate_icebreaker(full_name, company_name, title, headline, city) -> str:
    """Generate a single icebreaker via API.

    Args:
        full_name: Contact's full name (or a company name; the prompt tells
            the model to answer "its a company" in that case).
        company_name: Employer name as it appears in the lead list.
        title: Job title.
        headline: Profile headline.
        city: Contact's city, sent to the model as the ``city`` context field.

    Any argument may be a missing value (None / NaN / 'nan'); it is sent to
    the model as an empty field.

    Returns:
        The model's reply with surrounding whitespace removed.

    Raises:
        ValueError: If the API response contains no text.
        Exception: Whatever ``client.messages.create`` raises is propagated.
    """
    full_name = _clean_field(full_name)
    company_name = _clean_field(company_name)
    title = _clean_field(title)
    headline = _clean_field(headline)
    city = _clean_field(city)

    # Doubled braces ({{...}}) are literal placeholders meant for the model;
    # single braces are filled in by this f-string.
    prompt = f"""You are an outreach message generator.
Your role: create short, casual, human-sounding icebreaker messages for LinkedIn-style outreach.
Goal: make the recipient feel recognized for their work without sounding pushy or overly formal.

Your task:
- If "full_name" looks like a company (contains words such as Company, Inc, LLC, Ltd, Group, Realty, Properties, Brokers, UAE, Dubai, Abu Dhabi):
    -> Output ONLY: its a company

- Else (if "full_name" is a person):
    1. Extract firstName = first word of full_name.
    2. Normalize company_name into shortCompany:
        - If ALL CAPS -> convert to Title Case (only first letter uppercase).
        - Remove corporate suffixes: Properties, Realty, Group, Brokers, LLC, Ltd, Inc, UAE, Dubai, Abu Dhabi.
        - Remove apostrophes or special symbols.
    3. Normalize region into shortRegion:
        - Dubai, Abu Dhabi -> Dubai
        - San Francisco -> SF
        - New York City -> NYC
        - Else keep original.
    4. Generate opening = pick randomly, in casual tone:
        - love how you
        - really like how you
        - awesome to see you
        - impressed by how you
        - great track with how you
        - cool to see you
    5. specializationPhrase:
        - Look at headline or title.
        - If clear keyword (luxury, sales, marketing, engineering, talent acquisition, product, design, etc.) -> rewrite naturally as an action (2-3 words).
            * Example: "Luxury Consultant" -> "drive luxury sales"
            * "Marketing Manager" -> "lead marketing"
            * "Talent Acquisition" -> "grow teams"
            * "Software Engineer" -> "build products"
        - If generic title -> simplify to meaningful action:
            * "Consultant" -> "work with clients"
            * "Broker" -> "push sales"
            * "Analyst" -> "dig into insights"
        - If nothing useful -> fallback: "bring industry experience".
    6. regionPhrase = pick randomly:
        - I'm also in the {{shortRegion}} market
        - I work across {{shortRegion}} as well
        - I'm active in {{shortRegion}} too
        - I also focus on {{shortRegion}}
    7. closingPhrase = pick randomly:
        - Wanted to run something by you.
        - Thought I'd share an idea with you.
        - Had something you might find interesting.
        - Figured I'd reach out quickly.

Final Output (always one line, no labels, no JSON):
Hey {{firstName}}, {{opening}} {{specializationPhrase}} at {{shortCompany}} - {{regionPhrase}}. {{closingPhrase}}

Context for this row:
full_name: {full_name}
company_name: {company_name}
title: {title}
headline: {headline}
city: {city}"""

    response = client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=200,
        messages=[{"role": "user", "content": prompt}]
    )

    # Join every text block rather than assuming content[0] exists and is
    # text; an empty or non-text reply would otherwise surface as an opaque
    # IndexError/AttributeError.
    text = "".join(
        getattr(block, "text", None) or "" for block in response.content
    ).strip()
    if not text:
        raise ValueError("API returned no text content")
    return text

print("Function defined")

In [None]:
# Dry run: generate one icebreaker for the first lead before processing all rows
test_row = df.iloc[0]
print(f"Test Row: {test_row['full_name']} @ {test_row['company_name']}")

# headline and city are optional columns, so they fall back to an empty string
test_icebreaker = generate_icebreaker(
    full_name=test_row['full_name'],
    company_name=test_row['company_name'],
    title=test_row['title'],
    headline=test_row.get('headline', ''),
    city=test_row.get('city', ''),
)

print(f"\nGenerated: {test_icebreaker}")

In [None]:
# Work out how many batches are needed to cover every row
BATCH_SIZE = 200
total_rows = len(df)

# Ceiling division: a partial final batch still counts as a batch
full_batches, leftover_rows = divmod(total_rows, BATCH_SIZE)
total_batches = full_batches + (1 if leftover_rows else 0)

print(f"Total rows: {total_rows}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Total batches: {total_batches}")

# Rough estimate only: one API call per row at about one call per second
estimated_minutes = total_rows * 1 / 60
print(f"\nEstimated time: ~{estimated_minutes:.1f} minutes (assuming 1 API call/sec)")

In [None]:
# Process in batches
# Each batch is written to its own CSV as soon as it finishes, so a crash
# part-way through the run loses at most one batch of API results.


def _cell_text(value):
    """Return a CSV cell as a stripped string, mapping missing values to ''."""
    if value is None:
        return ''
    text = str(value).strip()
    # Missing cells are NaN, which str() renders as 'nan'; never forward that
    # placeholder to the model as if it were real data.
    return '' if text == 'nan' else text


all_results = []
start_time = time.time()

for batch_num in range(1, total_batches + 1):
    batch_start_idx = (batch_num - 1) * BATCH_SIZE
    batch_end_idx = min(batch_num * BATCH_SIZE, total_rows)

    # .copy() so adding the 'icebreaker' column does not touch df
    batch_df = df.iloc[batch_start_idx:batch_end_idx].copy()

    print(f"\nBATCH {batch_num}/{total_batches} - Processing rows {batch_start_idx+1}-{batch_end_idx}...")
    batch_start_time = time.time()

    icebreakers = []
    # The three counters are mutually exclusive and add up to len(batch_df)
    stats = {'successful': 0, 'companies': 0, 'errors': 0}

    # `position` is the 1-based position inside the batch. It is independent
    # of the DataFrame index, so progress reporting works for any index.
    for position, (idx, row) in enumerate(batch_df.iterrows(), start=1):
        try:
            full_name = _cell_text(row.get('full_name', ''))
            company_name = _cell_text(row.get('company_name', ''))
            title = _cell_text(row.get('title', ''))
            headline = _cell_text(row.get('headline', ''))
            city = _cell_text(row.get('city', ''))

            if not full_name:
                icebreaker = "ERROR: Missing full_name"
                stats['errors'] += 1
            else:
                icebreaker = generate_icebreaker(full_name, company_name, title, headline, city)

                if "its a company" in icebreaker.lower():
                    stats['companies'] += 1
                else:
                    stats['successful'] += 1

        except Exception as e:
            # Best effort: one failed row must not abort the whole run. The
            # failure is recorded in the output column with an 'ERROR' prefix.
            icebreaker = f"ERROR: {str(e)[:40]}"
            stats['errors'] += 1
            print(f"  ⚠ Error at row {idx}: {icebreaker}")

        # Exactly one entry per row, whichever path was taken above, so the
        # list always lines up with batch_df.
        icebreakers.append(icebreaker)

        # Progress every 20 rows
        if position % 20 == 0:
            print(f"  -> {position}/{len(batch_df)} rows processed")

    batch_df['icebreaker'] = icebreakers
    batch_time = time.time() - batch_start_time

    print(f"  ✓ Batch completed in {batch_time:.1f}s")
    print(f"  -> Success: {stats['successful']}, Companies: {stats['companies']}, Errors: {stats['errors']}")

    # Save batch
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    batch_output = os.path.join(output_dir, f"apollo_batch_{batch_num}_{timestamp}.csv")
    batch_df.to_csv(batch_output, index=False, encoding='utf-8')
    print(f"  -> Saved: {batch_output}")

    all_results.append(batch_df)

total_time = time.time() - start_time
print(f"\n{'='*60}")
print(f"ALL BATCHES COMPLETED in {total_time:.1f}s ({total_time/60:.1f} minutes)")
print(f"{'='*60}")

In [None]:
# Stitch the per-batch frames back into one table with a fresh 0..n-1 index
final_df = pd.concat(all_results, ignore_index=True)

# Write the combined result next to the batch files, under a timestamped name
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
final_filename = f"apollo_icebreaker_analyzed_{timestamp}.csv"
final_output = os.path.join(output_dir, final_filename)
final_df.to_csv(final_output, index=False, encoding='utf-8')

print(f"✓ Final CSV saved: {final_output}")

size_in_mb = os.path.getsize(final_output) / 1024 / 1024
print(f"\nFinal file size: {size_in_mb:.1f} MB")

In [None]:
# Generate statistics
# Normalise the column to plain strings once so all three counters look at
# the same data: missing values become '' instead of raising TypeError on
# the `in` checks.
icebreaker_texts = final_df['icebreaker'].fillna('').astype(str)

total_processed = len(final_df)
companies = sum(1 for text in icebreaker_texts if 'its a company' in text.lower())
errors = sum(1 for text in icebreaker_texts if 'ERROR' in text)
# A row is successful when it is non-empty and is neither an error nor a
# company detection.
successful = sum(
    1
    for text in icebreaker_texts
    if text and 'ERROR' not in text and 'its a company' not in text.lower()
)

# Guard against an empty result set (e.g. an empty input CSV)
success_rate = successful / total_processed * 100 if total_processed else 0.0

print(f"\n{'='*60}")
print("FINAL STATISTICS")
print(f"{'='*60}")
print(f"Total rows processed: {total_processed}")
print(f"Successful icebreakers: {successful}")
print(f"Companies detected: {companies}")
print(f"Errors: {errors}")
print(f"Success rate: {success_rate:.1f}%")

In [None]:
# Print the first ten usable icebreakers as a quick quality check
print(f"\n{'='*60}")
print("SAMPLE ICEBREAKERS (first 10 valid ones)")
print(f"{'='*60}\n")

sample_count = 0
for idx, row in final_df.iterrows():
    icebreaker = row['icebreaker']
    as_text = str(icebreaker)

    # A row is shown only if it is neither an error marker nor a company detection
    is_error = 'ERROR' in as_text
    is_company = 'its a company' in as_text.lower()
    if not (is_error or is_company):
        sample_count += 1
        print(f"[{sample_count}] {row['full_name']} @ {row['company_name']}")
        print(f"    Title: {row['title']}")
        print(f"    Icebreaker: {icebreaker}")
        print()

        # Stop once ten samples have been shown
        if sample_count >= 10:
            break

print(f"{'='*60}")