In [None]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, roc_auc_score
import warnings

warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING) # Keep output clean

# ============================================================================
# 1. DATA PREP (Reuse the robust pipeline)
# ============================================================================
print("Loading Data...")
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Clean Text
def clean_text(df):
    cols = df.select_dtypes(include=['object']).columns
    for col in cols:
        df[col] = df[col].str.replace('’', "'").str.replace('‘', "'")
    return df

df_train = clean_text(df_train)
df_test = clean_text(df_test)

# Handle IDs & Target
test_ids = df_test['founder_id'].copy()
df_train.drop('founder_id', axis=1, inplace=True)
df_test.drop('founder_id', axis=1, inplace=True)
df_train['retention_status'] = df_train['retention_status'].map({'Left': 1, 'Stayed': 0}).fillna(0)
y = df_train['retention_status']
df_train.drop('retention_status', axis=1, inplace=True)

# Feature Engineering
def process_features(df):
    df = df.copy()
    # Binary
    for c in ['working_overtime', 'remote_operations', 'leadership_scope', 'innovation_support']:
        df[c] = df[c].map({'Yes': 1, 'No': 0}).fillna(0).astype(int)
    
    def safe_ratio(a, b): return np.where((b!=0) & (~pd.isna(b)), a/b, 0)
    
    # Ratios & Interactions
    df['funding_velocity'] = safe_ratio(df['funding_rounds_led'], df['years_since_founding'])
    size_map = {'Small': 1, 'Medium': 2, 'Large': 3, 'Unknown': 0}
    df['team_complexity'] = df['remote_operations'] * df['team_size_category'].map(size_map).fillna(0)
    df['revenue_per_year'] = safe_ratio(df['monthly_revenue_generated'], df['years_since_founding'])
    df['founder_tenure_ratio'] = safe_ratio(df['years_with_startup'], df['years_since_founding'])
    df['is_married'] = (df['personal_status'] == 'Married').astype(int)
    df['family_burden'] = df['num_dependents'].fillna(0) * df['is_married']

    # Ordinal Maps
    bal_map = {'Poor':1, 'Fair':2, 'Good':3, 'Excellent':4, 'Unknown': 2}
    perf_map = {'Poor':1, 'Average':2, 'Good':3, 'Excellent':4, 'Unknown': 2}
    rep_map  = {'Poor':1, 'Fair':2, 'Good':3, 'Excellent':4, 'Unknown': 2}
    sat_map  = {'Low':1, 'Medium':2, 'High':3, 'Very High':4, 'Unknown': 2}

    # Mappings
    df['work_pressure'] = df['working_overtime'] * (5 - df['work_life_balance_rating'].fillna('Unknown').map(bal_map))
    
    # Super Features (Success, Stage-Tenure, Burnout)
    df['success_score'] = (df['startup_performance_rating'].map(perf_map).fillna(2) + \
                           df['startup_reputation'].map(rep_map).fillna(2) + \
                           df['venture_satisfaction'].map(sat_map).fillna(2)) / 3
    df['stage_tenure'] = df['startup_stage'].map({'Entry':1, 'Mid':2, 'Senior':3, 'Unknown':0}).fillna(0) * df['founder_tenure_ratio']
    df['burnout_index'] = df['work_pressure'] * df['family_burden']

    # Fill Categoricals for CatBoost
    nom_cols = ['founder_gender', 'founder_role', 'personal_status', 'team_size_category', 'founder_visibility', 
                'education_background', 'startup_stage', 'work_life_balance_rating', 
                'startup_performance_rating', 'startup_reputation', 'venture_satisfaction']
    for col in nom_cols:
        df[col] = df[col].fillna('Unknown').astype(str)
        
    # Binning
    df['revenue_binned'] = pd.cut(df['monthly_revenue_generated'], bins=[-1, 5000, 8000, np.inf], labels=[0,1,2]).astype(float)
    
    return df

print("Processing Features...")
X = process_features(df_train)
X_test_final = process_features(df_test)

# Outliers & Impute
num_cols = X.select_dtypes(include=[np.number]).columns
for col in ['monthly_revenue_generated', 'distance_from_investor_hub']:
    up = X[col].quantile(0.99)
    X[col] = np.clip(X[col], None, up)
    X_test_final[col] = np.clip(X_test_final[col], None, up)

imputer = SimpleImputer(strategy='median')
X[num_cols] = imputer.fit_transform(X[num_cols])
X_test_final[num_cols] = imputer.transform(X_test_final[num_cols])

# Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

cat_features = ['founder_gender', 'founder_role', 'personal_status', 'team_size_category', 'founder_visibility', 
                'education_background', 'startup_stage', 'work_life_balance_rating', 
                'startup_performance_rating', 'startup_reputation', 'venture_satisfaction']

# ============================================================================
# 2. OPTUNA OPTIMIZATION
# ============================================================================
def objective(trial):
    # The Search Space
    params = {
        'iterations': trial.suggest_int('iterations', 1000, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'random_strength': trial.suggest_float('random_strength', 1e-9, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'border_count': 254,
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'cat_features': cat_features,
        'verbose': False,
        'random_seed': 42,
        'early_stopping_rounds': 50
    }
    
    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set=(X_val, y_val))
    
    preds = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, preds)
    return auc

print("\nStarting Optuna Optimization (100 Trials)...")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print(f"\nBest AUC: {study.best_value:.4f}")
print("Best Params:", study.best_params)

# ============================================================================
# 3. FINAL TRAINING & SUBMISSION
# ============================================================================
print("\nTraining Final Model with Best Params...")
best_params = study.best_params
best_params['cat_features'] = cat_features
best_params['verbose'] = 200
best_params['eval_metric'] = 'AUC'
best_params['early_stopping_rounds'] = 50

final_model = CatBoostClassifier(**best_params)
final_model.fit(X_train, y_train, eval_set=(X_val, y_val))

# Threshold Tuning
val_probs = final_model.predict_proba(X_val)[:, 1]
best_acc = 0
best_t = 0.5
for t in np.arange(0.30, 0.70, 0.01):
    acc = accuracy_score(y_val, (val_probs >= t).astype(int))
    if acc > best_acc:
        best_acc = acc
        best_t = t

print(f"Optimized Threshold: {best_t:.2f} (Val Acc: {best_acc:.4f})")

# Predict
test_probs = final_model.predict_proba(X_test_final)[:, 1]
test_preds = (test_probs >= best_t).astype(int)

sub = pd.DataFrame({
    'founder_id': test_ids,
    'retention_status': ['Left' if p == 1 else 'Stayed' for p in test_preds]
})
sub.to_csv('submission_optuna.csv', index=False)
print("Saved 'submission_optuna.csv'")

Loading Data...
Processing Features...

Starting Optuna Optimization (20 Trials)...

Best AUC: 0.8480
Best Params: {'iterations': 1413, 'learning_rate': 0.01710412620510586, 'depth': 4, 'l2_leaf_reg': 6, 'random_strength': 0.3970029344689633, 'bagging_temperature': 0.13502716667068476}

Training Final Model with Best Params...
0:	test: 0.7546617	best: 0.7546617 (0)	total: 58.9ms	remaining: 1m 23s
200:	test: 0.8395654	best: 0.8395654 (200)	total: 12.4s	remaining: 1m 14s
400:	test: 0.8457705	best: 0.8457705 (400)	total: 25.9s	remaining: 1m 5s
600:	test: 0.8472586	best: 0.8472586 (600)	total: 40.8s	remaining: 55.2s
800:	test: 0.8478862	best: 0.8478917 (791)	total: 55.4s	remaining: 42.3s
1000:	test: 0.8479816	best: 0.8480042 (953)	total: 1m 9s	remaining: 28.6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8480042172
bestIteration = 953

Shrink model to first 954 iterations.
Optimized Threshold: 0.47 (Val Acc: 0.7544)
Saved 'submission_optuna.csv'


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
import warnings

warnings.filterwarnings('ignore')

# ============================================================================
# 1. DATA PREP
# ============================================================================
print("Loading Data...")
try:
    df_train = pd.read_csv('train.csv')
    df_test = pd.read_csv('test.csv')
except FileNotFoundError:
    print("Files not found, generating dummy data...")
    # Dummy data generator for demonstration
    data = {
        'founder_id': range(1000),
        'retention_status': np.random.choice(['Left', 'Stayed'], 1000),
        'working_overtime': np.random.choice(['Yes', 'No'], 1000),
        'remote_operations': np.random.choice(['Yes', 'No'], 1000),
        'monthly_revenue_generated': np.random.rand(1000) * 10000,
        'years_since_founding': np.random.rand(1000) * 10,
        'team_size_category': np.random.choice(['Small', 'Medium', 'Large'], 1000),
        # Add minimal required columns for script to run if files missing
    }
    df_train = pd.DataFrame(data)
    df_test = pd.DataFrame(data).drop('retention_status', axis=1)

# Clean Text
def clean_text(df):
    cols = df.select_dtypes(include=['object']).columns
    for col in cols:
        df[col] = df[col].str.replace('’', "'").str.replace('‘', "'")
    return df

df_train = clean_text(df_train)
df_test = clean_text(df_test)

# Handle IDs & Target
test_ids = df_test['founder_id'].copy()
df_train.drop('founder_id', axis=1, inplace=True)
df_test.drop('founder_id', axis=1, inplace=True)

# Map Target
if 'retention_status' in df_train.columns:
    df_train['retention_status'] = df_train['retention_status'].map({'Left': 1, 'Stayed': 0}).fillna(0)
    y = df_train['retention_status']
    df_train.drop('retention_status', axis=1, inplace=True)
else:
    y = np.zeros(len(df_train))

# Feature Engineering
def process_features(df):
    df = df.copy()
    # Binary
    for c in ['working_overtime', 'remote_operations', 'leadership_scope', 'innovation_support']:
        if c in df.columns:
            df[c] = df[c].map({'Yes': 1, 'No': 0}).fillna(0).astype(int)
    
    # Ratios & Interactions (Simplified)
    def safe_ratio(a, b): return np.where((b!=0) & (~pd.isna(b)), a/b, 0)
    
    if 'funding_rounds_led' in df.columns:
        df['funding_velocity'] = safe_ratio(df['funding_rounds_led'], df['years_since_founding'])
    
    size_map = {'Small': 1, 'Medium': 2, 'Large': 3, 'Unknown': 0}
    if 'team_size_category' in df.columns:
        df['team_complexity'] = df['remote_operations'] * df['team_size_category'].map(size_map).fillna(0)
    
    # Fill Categoricals
    nom_cols = ['founder_gender', 'founder_role', 'personal_status', 'team_size_category', 'founder_visibility', 
                'education_background', 'startup_stage', 'work_life_balance_rating', 
                'startup_performance_rating', 'startup_reputation', 'venture_satisfaction']
    
    for col in nom_cols:
        if col in df.columns:
            df[col] = df[col].fillna('Unknown').astype(str)
            
    return df

print("Processing Features...")
X = process_features(df_train)
X_test_final = process_features(df_test)

# Impute Numerics
num_cols = X.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='median')
X[num_cols] = imputer.fit_transform(X[num_cols])
X_test_final[num_cols] = imputer.transform(X_test_final[num_cols])

# ============================================================================
# SVM PREP
# ============================================================================
print("Encoding and Scaling...")

# 1. One-Hot Encoding
n_train = len(X)
combined = pd.concat([X, X_test_final], axis=0)
combined = pd.get_dummies(combined, drop_first=True)

X = combined.iloc[:n_train]
X_test_final = combined.iloc[n_train:]

# 2. Scaling
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_test_final = pd.DataFrame(scaler.transform(X_test_final), columns=X_test_final.columns)

# Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# ============================================================================
# 2. FAST TRAINING (LinearSVC + 20% Data)
# ============================================================================
print("\nSubsampling 20% of training data for speed...")

# Sample 20% of indices
train_subset_idx = np.random.choice(X_train.index, size=int(len(X_train) * 0.20), replace=False)
X_train_sub = X_train.loc[train_subset_idx]
y_train_sub = y_train.loc[train_subset_idx]

print(f"Training on {len(X_train_sub)} rows (subset) instead of {len(X_train)}...")

# Use LinearSVC wrapped in CalibratedClassifierCV
# LinearSVC is much faster than SVC(kernel='rbf')
# CalibratedClassifierCV allows us to still get .predict_proba()
linear_svc = LinearSVC(dual=False, random_state=42, C=1.0)
model = CalibratedClassifierCV(linear_svc, cv=3) 

model.fit(X_train_sub, y_train_sub)

# Evaluate
preds_prob = model.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val, preds_prob)
print(f"Validation AUC: {auc_score:.4f}")

# ============================================================================
# 3. SUBMISSION
# ============================================================================
print("\nTuning Threshold...")
best_acc = 0
best_t = 0.5
for t in np.arange(0.30, 0.70, 0.01):
    acc = accuracy_score(y_val, (preds_prob >= t).astype(int))
    if acc > best_acc:
        best_acc = acc
        best_t = t

print(f"Optimized Threshold: {best_t:.2f} (Val Acc: {best_acc:.4f})")

# Predict
test_probs = model.predict_proba(X_test_final)[:, 1]
test_preds = (test_probs >= best_t).astype(int)

sub = pd.DataFrame({
    'founder_id': test_ids,
    'retention_status': ['Left' if p == 1 else 'Stayed' for p in test_preds]
})
sub.to_csv('submission_fast_svm.csv', index=False)
print("Saved 'submission_fast_svm.csv'")

Loading Data...
Processing Features...
Encoding and Scaling...

Subsampling 20% of training data for speed...
Training on 9537 rows (subset) instead of 47688...
Validation AUC: 0.8373

Tuning Threshold...
Optimized Threshold: 0.43 (Val Acc: 0.7445)
Saved 'submission_fast_svm.csv'
