# 🌍 ISB Translation (Clean Version)
This notebook translates scraped ISB funding data from German to English using the DeepL API.

In [None]:
# %%
# Install required packages
# (`!` hands the line to the shell from inside the notebook; -q keeps pip's output quiet)
!pip install -q deepl tqdm python-dotenv


In [None]:
# %%
import pandas as pd
import os
import time
from tqdm.notebook import tqdm
import deepl
from dotenv import load_dotenv


In [None]:
# %%
# Read the DeepL credentials from the environment, after merging in any
# variables defined in a local .env file.
load_dotenv()
DEEPL_API_KEY = os.environ.get("DEEPL_API_KEY")

# Fail fast: a missing or empty key makes every later cell useless.
if DEEPL_API_KEY is None or DEEPL_API_KEY == "":
    raise ValueError(
        "DeepL API key not found. Please create a .env file with DEEPL_API_KEY=your-key"
    )



In [None]:
# Initialize DeepL translator
# A single shared client built from DEEPL_API_KEY; translate_text() calls it for every request.
translator = deepl.Translator(DEEPL_API_KEY)



In [None]:
# %%
def translate_text(text, target_lang="EN-US", max_retries=3):
    """Translate ``text`` with DeepL, retrying on transient failures.

    Args:
        text: Value to translate. Anything that is not a non-blank string
            (None, NaN, numbers, list-like values, whitespace-only strings)
            is returned unchanged without calling the API.
        target_lang: Target language code passed to the DeepL translator.
        max_retries: Maximum number of API attempts before giving up.

    Returns:
        The translated string, or the original ``text`` when it was skipped,
        the quota was exceeded, or every attempt failed.
    """
    # Checking the type first also covers None/NaN and keeps list-like values
    # away from pd.isna(), whose array result has no single truth value.
    if not isinstance(text, str) or not text.strip():
        return text

    for attempt in range(max_retries):
        retries_left = attempt < max_retries - 1
        try:
            result = translator.translate_text(text, target_lang=target_lang)
            return result.text
        except deepl.exceptions.QuotaExceededException:
            # Retrying cannot help once the quota is used up.
            print("⚠️ DeepL quota exceeded! Translation stopped.")
            return text
        except deepl.exceptions.TooManyRequestsException:
            if not retries_left:
                # Do not sleep when no further attempt will follow.
                print("⏳ Rate limit hit. No retries left.")
                break
            wait_time = 5 * (attempt + 1)  # Progressive backoff
            print(f"⏳ Rate limit hit. Waiting {wait_time}s...")
            time.sleep(wait_time)
        except Exception as e:
            print(f"⚠️ Error translating: {str(e)[:100]}")
            if retries_left:
                time.sleep(3)

    # Return original if all retries failed
    return text



In [None]:
# %%
# Load German scraped CSV
# `df` is kept untranslated; the translation cells read their source text from it.
df = pd.read_csv("data/funding-isb.csv")
print(f"📊 Loaded {len(df)} funding entries")



In [None]:
# %%
# Translate the free-text columns into a copy, so `df` keeps the German originals
translated_df = df.copy()
columns_to_translate = ["name", "description", "eligibility", "amount", "procedure"]

for col in columns_to_translate:
    if col in df.columns:
        print(f"🔤 Translating column: {col}")

        # Rows that actually hold a value in this column
        has_value = df[col].notna()
        row_labels = df.loc[has_value].index

        # One progress bar per column, advanced once per translated cell
        with tqdm(total=has_value.sum()) as progress:
            for position, row_label in enumerate(row_labels):
                translated_df.loc[row_label, col] = translate_text(df.loc[row_label, col])
                progress.update(1)

                # Short pause at positions 5, 10, 15, ... to stay under rate limits
                if position > 0 and position % 5 == 0:
                    time.sleep(0.5)
    else:
        print(f"⚠️ Column {col} not found, skipping")



In [None]:
# %%
# Translate contact roles (only the part before first '|')
def translate_contact_role(contact):
    """Translate only the role part of a ``role | details`` contact string.

    The text before the first " | " separator is passed to translate_text();
    everything after that separator is kept verbatim.

    Args:
        contact: Contact cell value. Non-string values (None, NaN, list-like
            values, numbers) and strings without " | " are returned unchanged.

    Returns:
        The contact string with its role translated, or the original value
        when there is nothing to translate or the translation fails.
    """
    # Checking the type first also covers None/NaN and keeps list-like values
    # away from pd.isna(), whose array result has no single truth value.
    if not isinstance(contact, str):
        return contact

    role, separator, rest = contact.partition(" | ")
    if not separator:
        return contact

    try:
        role_translated = translate_text(role)
    except Exception:
        # Best effort: keep the untranslated contact rather than abort the run.
        return contact
    return f"{role_translated} | {rest}"



In [None]:
# %%
# Run the role translation over every non-null contact, showing progress
if "contact" in df.columns:
    print("🔤 Translating contact roles...")

    # Missing contacts are left untouched in translated_df
    mask = df["contact"].notna()
    contacts_to_translate = df.loc[mask, "contact"]

    with tqdm(total=len(contacts_to_translate)) as progress:
        for row_label, contact in contacts_to_translate.items():
            translated_df.loc[row_label, "contact"] = translate_contact_role(contact)
            progress.update(1)
            time.sleep(0.2)  # Small delay to avoid rate limits



In [None]:
# %%
# Save translated output
# index=False leaves the DataFrame index out of the written CSV.
output_file = "data/funding_isb_english.csv"
translated_df.to_csv(output_file, index=False)
print(f"✅ Translation complete. File saved as {output_file}")



In [None]:
# %%
# Display sample of translated data.
# Only preview columns that exist: the translation loop tolerates missing
# columns, so the preview must not raise KeyError when one is absent.
translated_df[[c for c in ("name", "description") if c in translated_df.columns]].head(3)
