# Data Collection: Code-Switching Benchmark

## Setup and Environment Configuration


In [None]:
# --- Environment Setup ---

import os
from dotenv import load_dotenv
import pandas as pd
from tqdm import tqdm

# Go to the project root (only if needed).
# The root is found by walking up from the current working directory until a
# directory containing ".env" is reached, instead of relying on an absolute
# path that only exists on one machine.
project_root = os.getcwd()
while not os.path.exists(os.path.join(project_root, ".env")):
    parent_dir = os.path.dirname(project_root)
    if parent_dir == project_root:  # reached the filesystem root without a match
        project_root = None
        break
    project_root = parent_dir

if project_root is None:
    print("Warning: no .env found in the current directory or any parent; "
          "staying in the current directory.")
elif project_root != os.getcwd():
    # NOTE(review): later cells use relative paths such as "../data/raw";
    # they resolve against whatever directory is current after this step.
    os.chdir(project_root)

# Load environment variables
load_dotenv()

# Check API keys (only presence is reported; the key values are never printed)
print("API Keys Status:")
print(f"  Gemini:  {bool(os.getenv('GEMINI_API_KEY'))}")
print(f"  Cohere:  {bool(os.getenv('COHERE_API_KEY'))}")
print(f"  Mistral: {bool(os.getenv('MISTRAL_API_KEY'))}")


API Keys Status:
  Gemini:  True
  Cohere:  True
  Mistral: True


## Create Stimuli Dataset


In [None]:
# Make sure the folder that receives the raw CSV files is present
os.makedirs("../data/raw", exist_ok=True)

# Stimulus table, one row per prompt: (id, variety, task, text).
# Each of the four varieties contributes one item per task
# (paraphrase / explain / continue).
_stimulus_fields = ("id", "variety", "task", "text")
_stimulus_rows = [
    # African American Vernacular English (AAVE)
    ("aave_01", "AAVE", "paraphrase", "He finna go to the store. You sliding?"),
    ("aave_02", "AAVE", "explain", "Ion think that plan gon' work."),
    ("aave_03", "AAVE", "continue", "We was tryna finish that yesterday"),

    # Spanglish (Spanish-English code-switching)
    ("span_01", "Spanglish", "paraphrase", "Vamos later, it's muy close to la tienda."),
    ("span_02", "Spanglish", "explain", "No entiendo bien, pero I think it's fine."),
    ("span_03", "Spanglish", "continue", "We can meet en el parque, like at 5."),

    # British English
    ("br_01", "BrEng", "paraphrase", "Put it in the lorry outside the flat."),
    ("br_02", "BrEng", "explain", "Take the lift, not the stairs, to the first floor."),
    ("br_03", "BrEng", "continue", "We're off on holiday next week, fancy it?"),

    # Standard English (control group)
    ("std_01", "StdEng", "paraphrase", "He is about to head out. Are you coming?"),
    ("std_02", "StdEng", "explain", "Please explain this in simple terms."),
    ("std_03", "StdEng", "continue", "We should wrap this up and send it."),
]

# Expand each tuple into the dict-per-record form used for the DataFrame
stimuli_data = [dict(zip(_stimulus_fields, record)) for record in _stimulus_rows]

# Build the table and write it to disk
stimuli = pd.DataFrame(stimuli_data)
stimuli.to_csv("../data/raw/stimuli.csv", index=False)

# Short textual summary, then a preview of the first rows
print(f"Created stimuli dataset: {len(stimuli)} examples")
print(f"Varieties: {stimuli['variety'].unique()}")
print(f"Tasks: {stimuli['task'].unique()}")
print(f"Examples per variety: {stimuli['variety'].value_counts().to_dict()}")
stimuli.head()


Created stimuli dataset: 21 examples
Varieties: ['AAVE' 'Spanglish' 'BrEng' 'StdEng']
Tasks: ['paraphrase' 'explain' 'continue']


Unnamed: 0,id,variety,task,text
0,aave_01,AAVE,paraphrase,He finna go to the store. You sliding?
1,aave_02,AAVE,explain,"Ion think that plan gon' work, fr."
2,aave_03,AAVE,continue,"We was tryna finish that yesterday, but bruh f..."
3,aave_04,AAVE,paraphrase,"Nah, that don’t make sense, y’all wildin’."
4,aave_05,AAVE,continue,"We gon’ chill after class, no cap."


## Setup Gemini API


In [None]:
# === Gemini API Setup ===
# Initialize and test Gemini connection

import google.generativeai as genai

# Pass the key from the environment to the SDK, then create the model handle
# that query_gemini uses for every request.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))
model = genai.GenerativeModel(model_name="models/gemini-2.5-flash")

def query_gemini(prompt: str, max_retries: int = 3):
    """Send a prompt to Gemini and return plain text output with retry logic.

    Args:
        prompt: Text passed to the module-level ``model`` via ``generate_content``.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The response text on success. If the final attempt raises, a string
        starting with ``"ERROR:"`` that describes the exception. If the final
        attempt yields an empty response (or ``max_retries`` is below 1),
        ``None``.
    """
    import time

    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            response = model.generate_content(prompt)
            # Reading .text stays inside the try so a failure there is retried too
            text = response.text
            if text:
                return text
            print(f"Empty response from Gemini (attempt {attempt + 1})")
        except Exception as e:
            print(f"Error querying Gemini (attempt {attempt + 1}): {e}")
            if is_last_attempt:
                return f"ERROR: Failed after {max_retries} attempts: {e}"
        # Exponential backoff before the next attempt, applied after empty
        # responses as well as after exceptions.
        if not is_last_attempt:
            time.sleep(2 ** attempt)
    return None

# Reaching this line means the cell ran without raising; query_gemini is not
# called anywhere in this cell, so no prompt has been sent yet.
print("Gemini API configured successfully")


TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'

## Collect Gemini Responses


In [None]:
# Load stimuli and collect responses from Gemini
stimuli = pd.read_csv("../data/raw/stimuli.csv")
responses = []

print("Collecting responses from Gemini...")
print(f"Processing {len(stimuli)} examples...")

for _idx, row in tqdm(stimuli.iterrows(), total=len(stimuli), desc="Gemini API calls"):
    # NOTE(review): the same instruction is used for every row; row.task is
    # recorded below but does not alter the prompt — confirm this is intended.
    prompt = f"Paraphrase or continue this text in the same dialectal style: {row.text}"
    output = query_gemini(prompt)

    # Usable output is non-empty and is not one of query_gemini's "ERROR:" strings
    success = bool(output) and not output.startswith("ERROR:")
    if not success:
        print(f"Failed to get response for {row.id}: {output}")

    # One record per stimulus, whether or not the call succeeded
    responses.append({
        "id": row.id,
        "variety": row.variety,
        "task": row.task,
        "input_text": row.text,
        "output_text": output or "ERROR: No response",
        "success": success,
    })

# Save Gemini responses. Naming the columns keeps the schema stable even when
# no rows were collected, so the checks below cannot fail on a missing column.
gemini_df = pd.DataFrame(
    responses,
    columns=["id", "variety", "task", "input_text", "output_text", "success"],
)
gemini_df.to_csv("../data/raw/gemini_responses.csv", index=False)

# Quality check (guarded against an empty result set)
n_responses = len(gemini_df)
success_count = int(gemini_df['success'].sum())
success_rate = success_count / n_responses * 100 if n_responses else 0.0
print(f"Saved Gemini responses: {n_responses} examples")
print(f"Successful responses: {success_count}/{n_responses} ({success_rate:.1f}%)")
print(f"Variety distribution: {gemini_df['variety'].value_counts().to_dict()}")

gemini_df.head()


Collecting responses from Gemini...


E0000 00:00:1759276369.763989  237506 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
100%|██████████| 12/12 [02:08<00:00, 10.72s/it]

Saved Gemini responses: 12 examples





Unnamed: 0,id,variety,task,input_text,output_text
0,aave_01,AAVE,paraphrase,He finna go to the store. You sliding?,"Here are a few options, both paraphrasing and ..."
1,aave_02,AAVE,explain,Ion think that plan gon' work.,Here are a few ways to paraphrase or continue ...
2,aave_03,AAVE,continue,We was tryna finish that yesterday,Here are a few ways to paraphrase or continue ...
3,span_01,Spanglish,paraphrase,"Vamos later, it's muy close to la tienda.",Here are a few ways to paraphrase or continue ...
4,span_02,Spanglish,explain,"No entiendo bien, pero I think it's fine.","Here are a few options, either paraphrasing or..."


## Test Cohere API Integration and Collect Responses


In [None]:
import sys

# Setup path for imports: make the project's src/ directory importable.
current_dir = os.getcwd()
src_path = os.path.join('..', 'src') if current_dir.endswith('notebooks') else 'src'
# Store an absolute path so the sys.path entry does not depend on the working
# directory staying the same for the rest of the session.
src_path = os.path.abspath(src_path)
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Test Cohere connection and collect responses
from debug_cohere import debug_cohere

# Preview table shown at the end of the cell; stays None if the connection test fails.
cohere_preview = None

print("Testing Cohere API connection...")
if debug_cohere():
    print("Cohere API connection successful")

    from adapters.cohere_adapter import query_cohere

    # Collect full dataset responses
    cohere_responses = []
    print(f"Collecting Cohere responses for {len(stimuli)} examples...")

    for _idx, row in tqdm(stimuli.iterrows(), total=len(stimuli), desc="Cohere API calls"):
        prompt = f"Paraphrase or continue this text in the same dialectal style: {row.text}"
        try:
            output = query_cohere(prompt)
            # Usable output is non-empty and does not start with "ERROR:"
            success = bool(output) and not output.startswith("ERROR:")
            output_text = output or "ERROR: Empty response"
        except Exception as e:
            # Record the failure and carry on so one bad call does not abort the run
            print(f"Error {row.id}: {e}")
            success = False
            output_text = f"ERROR: {e}"
        else:
            if success:
                print(f"Success: {row.id}")
            else:
                print(f"Empty response for {row.id}")

        # One record per stimulus, whatever the outcome
        cohere_responses.append({
            "id": row.id,
            "variety": row.variety,
            "task": row.task,
            "input_text": row.text,
            "output_text": output_text,
            "success": success,
        })

    # Naming the columns keeps the schema stable even when no rows were collected
    cohere_df = pd.DataFrame(
        cohere_responses,
        columns=["id", "variety", "task", "input_text", "output_text", "success"],
    )
    cohere_df.to_csv("../data/raw/cohere_responses.csv", index=False)

    # Quality check (guarded against an empty result set)
    n_responses = len(cohere_df)
    success_count = int(cohere_df['success'].sum())
    success_rate = success_count / n_responses * 100 if n_responses else 0.0
    print(f"Saved Cohere responses: {n_responses} examples")
    print(f"Successful responses: {success_count}/{n_responses} ({success_rate:.1f}%)")
    print(f"Variety distribution: {cohere_df['variety'].value_counts().to_dict()}")

    cohere_preview = cohere_df.head()
else:
    print("Cohere API connection failed. Check API key and configuration.")

# The notebook renders only the final top-level expression of a cell; an
# expression nested inside the if-block is never shown, so the preview goes here.
cohere_preview


Testing Cohere API connection...
🚀 Cohere Debug Agent - Full Diagnosis

🔧 Running Comprehensive Diagnosis...
🔍 Cohere Debug Agent Starting...
✅ Found project root: /Users/jase/codeswitch-benchmark
✅ Found .env file: /Users/jase/codeswitch-benchmark/.env
✅ Environment variables loaded
✅ API key found (prefix: [REDACTED])
✅ API key length: 40 characters
✅ Cohere client initialized

🧪 Testing Cohere API Connection...

🔄 Testing command-r-plus-08-2024...
✅ command-r-plus-08-2024 works: Hello!

✅ All tests passed! Cohere API is working correctly.

🔧 Fixing Notebook Path Issues...
✅ Added /Users/jase/codeswitch-benchmark/src to Python path

🎉 All systems operational!
You can now use the Cohere API in your notebooks.
Cohere API connection successful


 50%|█████     | 1/2 [00:01<00:01,  1.07s/it]

Success: aave_01


100%|██████████| 2/2 [00:01<00:00,  1.16it/s]

Success: aave_02
Saved Cohere sample responses: 2 examples





## Test Mistral API Integration and Collect Responses


In [None]:
from debug_mistral import debug_mistral

# Preview table shown at the end of the cell; stays None if the connection test fails.
mistral_preview = None

print("Testing Mistral API connection...")
if debug_mistral():
    print("Mistral API connection successful")

    from adapters.mistral_adapter import query_mistral

    # Collect full dataset responses
    mistral_responses = []
    print(f"Collecting Mistral responses for {len(stimuli)} examples...")

    for _idx, row in tqdm(stimuli.iterrows(), total=len(stimuli), desc="Mistral API calls"):
        prompt = f"Paraphrase or continue this text in the same dialectal style: {row.text}"
        try:
            output = query_mistral(prompt)
            # Usable output is non-empty and does not start with "ERROR:"
            success = bool(output) and not output.startswith("ERROR:")
            output_text = output or "ERROR: Empty response"
        except Exception as e:
            # Record the failure and carry on so one bad call does not abort the run
            print(f"Error {row.id}: {e}")
            success = False
            output_text = f"ERROR: {e}"
        else:
            if success:
                print(f"Success: {row.id}")
            else:
                print(f"Empty response for {row.id}")

        # One record per stimulus, whatever the outcome
        mistral_responses.append({
            "id": row.id,
            "variety": row.variety,
            "task": row.task,
            "input_text": row.text,
            "output_text": output_text,
            "success": success,
        })

    # Naming the columns keeps the schema stable even when no rows were collected
    mistral_df = pd.DataFrame(
        mistral_responses,
        columns=["id", "variety", "task", "input_text", "output_text", "success"],
    )
    mistral_df.to_csv("../data/raw/mistral_responses.csv", index=False)

    # Quality check (guarded against an empty result set)
    n_responses = len(mistral_df)
    success_count = int(mistral_df['success'].sum())
    success_rate = success_count / n_responses * 100 if n_responses else 0.0
    print(f"Saved Mistral responses: {n_responses} examples")
    print(f"Successful responses: {success_count}/{n_responses} ({success_rate:.1f}%)")
    print(f"Variety distribution: {mistral_df['variety'].value_counts().to_dict()}")

    mistral_preview = mistral_df.head()
else:
    print("Mistral API connection failed. Check API key and configuration.")

# The notebook renders only the final top-level expression of a cell; an
# expression nested inside the if-block is never shown, so the preview goes here.
mistral_preview


Testing Mistral API connection...
🚀 Mistral Debug Agent - Full Diagnosis

🔧 Running Comprehensive Diagnosis...
🔍 Mistral Debug Agent Starting...
✅ Found project root: /Users/jase/codeswitch-benchmark
✅ Found .env file: /Users/jase/codeswitch-benchmark/.env
✅ Environment variables loaded
✅ API key found (prefix: [REDACTED])
✅ API key length: 32 characters
✅ Mistral client initialized

🧪 Testing Mistral API Connection...

🔄 Testing mistral-large-latest...
✅ mistral-large-latest works: Hi!

✅ All tests passed! Mistral API is working correctly.

🔧 Fixing Notebook Path Issues...
✅ /Users/jase/codeswitch-benchmark/src already in Python path

🎉 All systems operational!
You can now use the Mistral API in your notebooks.
Mistral API connection successful


 50%|█████     | 1/2 [00:01<00:01,  1.30s/it]

Success: aave_01


100%|██████████| 2/2 [00:02<00:00,  1.39s/it]

Success: aave_02
Saved Mistral sample responses: 2 examples





## Data Collection Summary


In [None]:
# Summary of collected data
import os
import glob

print("=== Data Collection Summary ===")
print(f"Stimuli dataset: {len(stimuli)} examples")
print(f"Varieties: {list(stimuli['variety'].unique())}")
print(f"Tasks: {list(stimuli['task'].unique())}")
print()

# Check which model response files exist.
# glob returns matches in arbitrary order, so sort for a stable listing across runs.
response_files = sorted(glob.glob("../data/raw/*_responses.csv"))
print("Collected model responses:")
if not response_files:
    print("  (none found)")
for file_path in response_files:
    # "<model>_responses.csv" -> "<model>"
    model_name = os.path.basename(file_path).replace("_responses.csv", "")
    try:
        df = pd.read_csv(file_path)
        # Files without a 'success' column are counted as fully successful
        success_count = df['success'].sum() if 'success' in df.columns else len(df)
        print(f"  {model_name.title()}: {len(df)} examples ({success_count} successful)")
    except Exception as e:
        # An unreadable file is reported but does not stop the summary
        print(f"  {model_name.title()}: Error reading file - {e}")

print()
if response_files:
    print("Data collection complete! Ready for analysis in notebook 02.")
else:
    # Do not claim completion when nothing was collected
    print("No model response files found in ../data/raw; run the collection cells first.")
