In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score
from sklearn.impute import SimpleImputer
import optuna
import random
import os
import warnings
import sys

warnings.filterwarnings('ignore')

# ============================================================================
# 1. REPRODUCIBILITY SETUP (CRITICAL)
# ============================================================================
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

SEED = 42
seed_everything(SEED)

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"ðŸš€ Using Device: {device}")

# ============================================================================
# 2. PREPROCESSING
# ============================================================================
print("Loading Data...")
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Handle IDs
test_ids = df_test['founder_id'].copy()
df_train.drop('founder_id', axis=1, inplace=True)
df_test.drop('founder_id', axis=1, inplace=True)

# Target Map
df_train['retention_status'] = df_train['retention_status'].map({'Left': 1, 'Stayed': 0})
y = df_train['retention_status'].values
X = df_train.drop('retention_status', axis=1)

def clean_text(df):
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].str.replace('â€™', "'").str.replace('â€˜', "'")
    return df

X = clean_text(X)
X_test = clean_text(df_test)

def process_features(df):
    df = df.copy()
    
    # 1. Binary Mapping
    bin_cols = ['working_overtime', 'remote_operations', 'leadership_scope', 'innovation_support']
    for c in bin_cols:
        df[c] = df[c].map({'Yes': 1, 'No': 0}).fillna(0).astype(int)
    
    # 2. Ratios & Interactions
    def safe_ratio(a, b): return np.where((b!=0) & (~pd.isna(b)), a/b, 0)
    
    df['funding_velocity'] = safe_ratio(df['funding_rounds_led'], df['years_since_founding'])
    size_map = {'Small': 1, 'Medium': 2, 'Large': 3, 'Unknown': 0}
    df['team_complexity'] = df['remote_operations'] * df['team_size_category'].map(size_map).fillna(0)
    df['revenue_per_year'] = safe_ratio(df['monthly_revenue_generated'], df['years_since_founding'])
    df['founder_tenure_ratio'] = safe_ratio(df['years_with_startup'], df['years_since_founding'])
    df['is_married'] = (df['personal_status'] == 'Married').astype(int)
    df['family_burden'] = df['num_dependents'].fillna(0) * df['is_married']

    # 3. Ordinal Scores
    bal_map = {'Poor':1, 'Fair':2, 'Good':3, 'Excellent':4, 'Unknown': 2}
    perf_map = {'Poor':1, 'Average':2, 'Good':3, 'Excellent':4, 'Unknown': 2}
    rep_map  = {'Poor':1, 'Fair':2, 'Good':3, 'Excellent':4, 'Unknown': 2}
    sat_map  = {'Low':1, 'Medium':2, 'High':3, 'Very High':4, 'Unknown': 2}

    df['work_pressure'] = df['working_overtime'] * (5 - df['work_life_balance_rating'].fillna('Unknown').map(bal_map))
    df['success_score'] = (df['startup_performance_rating'].map(perf_map).fillna(2) + \
                           df['startup_reputation'].map(rep_map).fillna(2) + \
                           df['venture_satisfaction'].map(sat_map).fillna(2)) / 3
    df['burnout_index'] = df['work_pressure'] * df['family_burden']
    
    # 4. Binning
    df['revenue_binned'] = pd.cut(df['monthly_revenue_generated'], bins=[-1, 5000, 8000, np.inf], labels=[0,1,2]).astype(float)
    
    # 5. Clean Missing Strings
    cat_cols = ['founder_gender', 'founder_role', 'personal_status', 'team_size_category', 'founder_visibility', 
                'education_background', 'startup_stage', 'work_life_balance_rating', 
                'startup_performance_rating', 'startup_reputation', 'venture_satisfaction']
    for col in cat_cols:
        df[col] = df[col].fillna('Unknown').astype(str)
        
    return df

print("Feature Engineering...")
X_proc = process_features(X)
X_test_proc = process_features(X_test)

# --- NN Specific Prep ---
cat_cols = ['founder_gender', 'founder_role', 'personal_status', 'team_size_category', 'founder_visibility', 
            'education_background', 'startup_stage', 'work_life_balance_rating', 
            'startup_performance_rating', 'startup_reputation', 'venture_satisfaction']
num_cols = [c for c in X_proc.columns if c not in cat_cols]

# 1. Fill Numerical NaNs
imputer = SimpleImputer(strategy='median')
X_proc[num_cols] = imputer.fit_transform(X_proc[num_cols])
X_test_proc[num_cols] = imputer.transform(X_test_proc[num_cols])

# 2. Scale Numericals
scaler = StandardScaler()
X_proc[num_cols] = scaler.fit_transform(X_proc[num_cols])
X_test_proc[num_cols] = scaler.transform(X_test_proc[num_cols])

# 3. Label Encode Categoricals for Embeddings
cat_dims = []
for col in cat_cols:
    le = LabelEncoder()
    full_list = pd.concat([X_proc[col], X_test_proc[col]], axis=0).astype(str)
    le.fit(full_list)
    
    X_proc[col] = le.transform(X_proc[col].astype(str))
    X_test_proc[col] = le.transform(X_test_proc[col].astype(str))
    
    vocab = len(le.classes_) + 1 
    emb_dim = min(50, (vocab + 1) // 2)
    cat_dims.append((vocab, emb_dim))

# Convert to Numpy
X_cat = X_proc[cat_cols].values.astype(np.int64)
X_num = X_proc[num_cols].values.astype(np.float32)
# Add Test Numpy arrays for final inference
X_test_cat = X_test_proc[cat_cols].values.astype(np.int64)
X_test_num = X_test_proc[num_cols].values.astype(np.float32)

# ============================================================================
# 3. MODEL ARCHITECTURE
# ============================================================================
class FounderDataset(Dataset):
    def __init__(self, cat_data, num_data, targets=None):
        self.cat_data = torch.tensor(cat_data, dtype=torch.long)
        self.num_data = torch.tensor(num_data, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32).unsqueeze(1) if targets is not None else None
        
    def __len__(self):
        return len(self.cat_data)
    
    def __getitem__(self, idx):
        if self.targets is not None:
            return self.cat_data[idx], self.num_data[idx], self.targets[idx]
        return self.cat_data[idx], self.num_data[idx]

class GoatedMLP(nn.Module):
    def __init__(self, embedding_dims, n_num, dropout, l1_size, l2_size, l3_size):
        super().__init__()
        
        self.embeddings = nn.ModuleList([nn.Embedding(v, d) for v, d in embedding_dims])
        self.n_emb_out = sum(d for v, d in embedding_dims)
        self.bn_num = nn.BatchNorm1d(n_num)
        in_dim = self.n_emb_out + n_num
        
        self.layer1 = nn.Sequential(
            nn.Linear(in_dim, l1_size),
            nn.BatchNorm1d(l1_size),
            nn.SiLU(),
            nn.Dropout(dropout)
        )
        self.layer2 = nn.Sequential(
            nn.Linear(l1_size, l2_size),
            nn.BatchNorm1d(l2_size),
            nn.SiLU(),
            nn.Dropout(dropout)
        )
        self.layer3 = nn.Sequential(
            nn.Linear(l2_size, l3_size),
            nn.BatchNorm1d(l3_size),
            nn.SiLU(),
            nn.Dropout(dropout/2)
        )
        self.head = nn.Linear(l3_size, 1)

    def forward(self, x_cat, x_num):
        emb_out = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x_emb = torch.cat(emb_out, 1)
        x_num = self.bn_num(x_num)
        x = torch.cat([x_emb, x_num], 1)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        return self.head(x)

# ============================================================================
# 4. OPTUNA OBJECTIVE
# ============================================================================
def objective(trial):
    seed_everything(SEED)
    
    params = {
        'lr': trial.suggest_float('lr', 1e-4, 1e-2, log=True),
        'dropout': trial.suggest_float('dropout', 0.1, 0.5),
        'weight_decay': trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True),
        'l1_size': trial.suggest_categorical('l1_size', [256, 512, 1024]),
        'l2_size': trial.suggest_categorical('l2_size', [128, 256, 512]),
        'l3_size': trial.suggest_categorical('l3_size', [64, 128, 256]),
        'batch_size': 512,
    }
    
    if params['l2_size'] > params['l1_size'] or params['l3_size'] > params['l2_size']:
        pass

    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    fold_f1_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_cat, y)):
        X_cat_tr, X_cat_val = X_cat[train_idx], X_cat[val_idx]
        X_num_tr, X_num_val = X_num[train_idx], X_num[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]
        
        train_ds = FounderDataset(X_cat_tr, X_num_tr, y_tr)
        val_ds = FounderDataset(X_cat_val, X_num_val, y_val)
        
        train_loader = DataLoader(train_ds, batch_size=params['batch_size'], shuffle=True, num_workers=0)
        val_loader = DataLoader(val_ds, batch_size=params['batch_size']*2, shuffle=False, num_workers=0)
        
        model = GoatedMLP(
            embedding_dims=cat_dims, 
            n_num=X_num.shape[1],
            dropout=params['dropout'],
            l1_size=params['l1_size'],
            l2_size=params['l2_size'],
            l3_size=params['l3_size']
        ).to(device)
        
        criterion = nn.BCEWithLogitsLoss()
        optimizer = optim.AdamW(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

        epochs = 15 
        best_val_loss = float('inf')
        
        for epoch in range(epochs):
            model.train()
            for cat, num, target in train_loader:
                cat, num, target = cat.to(device), num.to(device), target.to(device)
                optimizer.zero_grad()
                logits = model(cat, num)
                loss = criterion(logits, target)
                loss.backward()
                optimizer.step()
            scheduler.step()
            
        model.eval()
        val_preds = []
        val_targets_all = []
        with torch.no_grad():
            for cat, num, target in val_loader:
                cat, num, target = cat.to(device), num.to(device), target.to(device)
                logits = model(cat, num)
                val_preds.extend(torch.sigmoid(logits).cpu().numpy())
                val_targets_all.extend(target.cpu().numpy())
        
        best_f1_fold = 0
        preds_arr = np.array(val_preds)
        targets_arr = np.array(val_targets_all)
        for t in np.arange(0.3, 0.7, 0.05):
            p = (preds_arr >= t).astype(int)
            f1 = f1_score(targets_arr, p, average='macro')
            if f1 > best_f1_fold:
                best_f1_fold = f1
        fold_f1_scores.append(best_f1_fold)
        
    return np.mean(fold_f1_scores)

# ============================================================================
# 5. RUN OPTIMIZATION
# ============================================================================
print("\nStarting Optuna Study (Reproducible)...")

sampler = optuna.samplers.TPESampler(seed=SEED)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=20) 

print("\n" + "="*50)
print(f"âœ… BEST MACRO F1: {study.best_value:.4f}")
print("="*50)
best_params = study.best_params
print("BEST PARAMS:", best_params)

# ============================================================================
# 6. FINAL TRAINING & SUBMISSION
# ============================================================================
print("\n" + "="*50)
print("ðŸš€ STARTING FINAL TRAINING WITH BEST PARAMS")
print("="*50)

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

oof_preds = np.zeros(len(X_cat))
test_preds_accum = np.zeros(len(X_test_cat))
batch_size = 512

# Prepare Test Loader
test_ds = FounderDataset(X_test_cat, X_test_num)
test_loader = DataLoader(test_ds, batch_size=batch_size*2, shuffle=False)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_cat, y)):
    print(f"\nâš¡ FOLD {fold+1}/{n_splits}")
    
    # 1. Split
    X_cat_tr, X_cat_val = X_cat[train_idx], X_cat[val_idx]
    X_num_tr, X_num_val = X_num[train_idx], X_num[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]
    
    # 2. Loaders
    train_ds = FounderDataset(X_cat_tr, X_num_tr, y_tr)
    val_ds = FounderDataset(X_cat_val, X_num_val, y_val)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size*2, shuffle=False)
    
    # 3. Init Model with BEST PARAMS
    model = GoatedMLP(
        embedding_dims=cat_dims, 
        n_num=X_num.shape[1],
        dropout=best_params['dropout'],
        l1_size=best_params['l1_size'],
        l2_size=best_params['l2_size'],
        l3_size=best_params['l3_size']
    ).to(device)
    
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.AdamW(model.parameters(), lr=best_params['lr'], weight_decay=best_params['weight_decay'])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

    epochs = 40 # Increased epochs for final training
    best_loss = float('inf')
    patience = 7
    counter = 0
    best_weights = None
    
    # 4. Training Loop
    for epoch in range(epochs):
        model.train()
        running_loss = 0
        for cat, num, target in train_loader:
            cat, num, target = cat.to(device), num.to(device), target.to(device)
            optimizer.zero_grad()
            logits = model(cat, num)
            loss = criterion(logits, target)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for cat, num, target in val_loader:
                cat, num, target = cat.to(device), num.to(device), target.to(device)
                logits = model(cat, num)
                val_loss += criterion(logits, target).item()
        
        avg_val = val_loss / len(val_loader)
        scheduler.step(avg_val)
        
        if avg_val < best_loss:
            best_loss = avg_val
            best_weights = model.state_dict()
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print(f"  Early stopping at epoch {epoch+1}")
                break
    
    # 5. OOF & Test Predictions
    model.load_state_dict(best_weights)
    model.eval()
    
    # OOF
    val_probs = []
    with torch.no_grad():
        for cat, num, _ in val_loader:
            cat, num = cat.to(device), num.to(device)
            logits = model(cat, num)
            val_probs.extend(torch.sigmoid(logits).cpu().numpy())
    oof_preds[val_idx] = np.array(val_probs).flatten()
    
    # Test Accumulation
    fold_test_probs = []
    with torch.no_grad():
        for cat, num in test_loader:
            cat, num = cat.to(device), num.to(device)
            logits = model(cat, num)
            fold_test_probs.extend(torch.sigmoid(logits).cpu().numpy())
    test_preds_accum += np.array(fold_test_probs).flatten() / n_splits
    
    fold_f1 = f1_score(y_val, (np.array(val_probs) >= 0.5).astype(int), average='macro')
    print(f"  Best Val Loss: {best_loss:.4f} | F1 (0.5): {fold_f1:.4f}")

# ============================================================================
# 7. SUBMISSION GENERATION
# ============================================================================
print("\nOptimizing Threshold on OOF Predictions...")
best_f1 = 0
best_thresh = 0.5
for t in np.arange(0.3, 0.7, 0.001):
    p = (oof_preds >= t).astype(int)
    f1 = f1_score(y, p, average='macro')
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"Global OOF Macro F1: {best_f1:.4f} at Threshold: {best_thresh:.3f}")

final_preds = (test_preds_accum >= best_thresh).astype(int)

sub = pd.DataFrame({
    'founder_id': test_ids,
    'retention_status': ['Left' if p == 1 else 'Stayed' for p in final_preds]
})
sub.to_csv('submission_nn_optuna_final.csv', index=False)
print("Saved 'submission_nn_optuna_final.csv'")

ðŸš€ Using Device: cuda
Loading Data...
Feature Engineering...


[I 2025-11-23 18:19:49,529] A new study created in memory with name: no-name-0073150f-1e71-4a1f-a694-aefee775aee9



Starting Optuna Study (Reproducible)...


[I 2025-11-23 18:21:06,744] Trial 0 finished with value: 0.7486536856104717 and parameters: {'lr': 0.0005611516415334506, 'dropout': 0.4802857225639665, 'weight_decay': 0.000157029708840554, 'l1_size': 256, 'l2_size': 256, 'l3_size': 256}. Best is trial 0 with value: 0.7486536856104717.
[I 2025-11-23 18:22:29,890] Trial 1 finished with value: 0.7462181924817578 and parameters: {'lr': 0.004622589001020831, 'dropout': 0.18493564427131048, 'weight_decay': 3.5113563139704077e-06, 'l1_size': 1024, 'l2_size': 512, 'l3_size': 256}. Best is trial 0 with value: 0.7486536856104717.
[I 2025-11-23 18:23:43,758] Trial 2 finished with value: 0.7494691545783992 and parameters: {'lr': 0.000816845589476017, 'dropout': 0.41407038455720546, 'weight_decay': 3.972110727381911e-06, 'l1_size': 512, 'l2_size': 128, 'l3_size': 128}. Best is trial 2 with value: 0.7494691545783992.
[I 2025-11-23 18:24:58,971] Trial 3 finished with value: 0.7433880586874138 and parameters: {'lr': 0.0004066563313514797, 'dropout':


âœ… BEST MACRO F1: 0.7502
BEST PARAMS: {'lr': 0.001885274701655737, 'dropout': 0.49606564079850074, 'weight_decay': 1.3802926743044478e-06, 'l1_size': 512, 'l2_size': 128, 'l3_size': 128}

ðŸš€ STARTING FINAL TRAINING WITH BEST PARAMS

âš¡ FOLD 1/5
  Early stopping at epoch 19
  Best Val Loss: 0.4804 | F1 (0.5): 0.7527

âš¡ FOLD 2/5
  Early stopping at epoch 18
  Best Val Loss: 0.4795 | F1 (0.5): 0.7503

âš¡ FOLD 3/5
  Early stopping at epoch 27
  Best Val Loss: 0.4785 | F1 (0.5): 0.7476

âš¡ FOLD 4/5
  Early stopping at epoch 22
  Best Val Loss: 0.4870 | F1 (0.5): 0.7490

âš¡ FOLD 5/5
  Early stopping at epoch 26
  Best Val Loss: 0.4855 | F1 (0.5): 0.7399

Optimizing Threshold on OOF Predictions...
Global OOF Macro F1: 0.7487 at Threshold: 0.485
Saved 'submission_nn_optuna_final.csv'
