In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.metrics import f1_score
from sklearn.impute import SimpleImputer
import random
import os
import warnings

# Suppress all warnings from this point on: no category or module filter is
# given, so the 'ignore' action applies to every warning raised afterwards.
warnings.filterwarnings('ignore')

# ============================================================================
# 1. SETUP & REPRODUCIBILITY
# ============================================================================
def seed_everything(seed=42):
    """Seed every random number generator this script relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (including the explicit CUDA seeding call), exports ``PYTHONHASHSEED``
    into the process environment, and switches cuDNN to deterministic mode
    with its auto-tuner (``benchmark``) turned off.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every one of these seeders accepts the seed as its only argument.
    for apply_seed in (random.seed, np.random.seed,
                       torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    # Trade cuDNN speed heuristics for run-to-run repeatability.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Fix all RNG seeds once, before any data or model code runs.
SEED = 42
seed_everything(seed=SEED)

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"ðŸš€ Using Device: {device}")

# ============================================================================
# 2. PREPROCESSING (LINEAR MODEL SPECIALIZED)
# ============================================================================
# Read the raw train/test tables from the working directory.
print("Loading Data...")
df_train, df_test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

# Keep the test identifiers for the submission file, then remove the id
# column from both tables so it is never used as a feature.
test_ids = df_test['founder_id'].copy()
df_train.drop(columns=['founder_id'], inplace=True)
df_test.drop(columns=['founder_id'], inplace=True)

# Encode the target in place: 'Left' -> 1, 'Stayed' -> 0.
# Labels outside this mapping come out of ``map`` as NaN.
df_train['retention_status'] = df_train['retention_status'].map({'Left': 1, 'Stayed': 0})

# Split the training table into features and target; the test table is
# already features only.
X = df_train.drop(columns=['retention_status'])
y = df_train['retention_status'].values
X_test = df_test

def process_linear(df):
    """Clean a raw feature frame and add engineered columns for a linear model.

    The input is copied first, so the caller's frame is not modified. Steps,
    in order:

    1. In every object-dtype column, replace the sequences 'â€™' and 'â€˜'
       with a straight apostrophe.
    2. Map the four Yes/No flag columns to 1/0 (anything else becomes 0).
    3. Fill missing values in every numeric column with that column's
       median, computed on the frame being processed.
    4. Add ``revenue_per_employee``, ``funding_per_year`` and
       ``age_entry_diff``.
    5. Add ``log_revenue`` as ``log1p`` of the monthly revenue.

    Args:
        df: DataFrame holding the flag columns, ``team_size_category``,
            ``monthly_revenue_generated``, ``funding_rounds_led``,
            ``years_since_founding``, ``founder_age`` and
            ``years_with_startup``.

    Returns:
        A new DataFrame with the cleaned columns plus the engineered ones.
    """
    out = df.copy()

    # --- Text clean -------------------------------------------------------
    # NOTE(review): these patterns look like mis-decoded curly quotes;
    # confirm the encoding of this file and of the CSV data.
    garbled_quotes = ('â€™', 'â€˜')
    for name in out.select_dtypes(include=['object']).columns:
        cleaned = out[name]
        for bad in garbled_quotes:
            cleaned = cleaned.str.replace(bad, "'")
        out[name] = cleaned

    # --- Yes/No flags -> 1/0 ----------------------------------------------
    yes_no = {'Yes': 1, 'No': 0}
    for name in ('working_overtime', 'remote_operations',
                 'leadership_scope', 'innovation_support'):
        out[name] = out[name].map(yes_no).fillna(0).astype(int)

    # --- Median imputation ------------------------------------------------
    # Runs after the flag mapping, so the flag columns count as numeric here.
    for name in out.select_dtypes(include=[np.number]).columns:
        column = out[name]
        out[name] = column.fillna(column.median())

    # --- Ratio features ---------------------------------------------------
    def ratio_or_zero(numerator, denominator):
        # Division is evaluated everywhere; rows whose denominator is not
        # strictly positive are then overwritten with 0.
        quotient = numerator / denominator
        return np.where(denominator > 0, quotient, 0)

    # Rough head-count per size label, used only as a divisor; labels that
    # are not in the table fall back to 25.
    headcount = out['team_size_category'].map(
        {'Small': 10, 'Medium': 50, 'Large': 200, 'Unknown': 25}
    ).fillna(25)

    revenue = out['monthly_revenue_generated']
    out['revenue_per_employee'] = ratio_or_zero(revenue, headcount)
    out['funding_per_year'] = ratio_or_zero(out['funding_rounds_led'],
                                            out['years_since_founding'])
    out['age_entry_diff'] = out['founder_age'] - out['years_with_startup']

    # --- Log transform ----------------------------------------------------
    out['log_revenue'] = np.log1p(revenue)

    return out

print("Feature Engineering...")

# --- ONE-HOT ENCODING & SCALING ---
# Nominal columns that are expanded into indicator columns below.
cat_cols = [
    'founder_gender',
    'founder_role',
    'personal_status',
    'team_size_category',
    'founder_visibility',
    'education_background',
    'startup_stage',
    'work_life_balance_rating',
    'startup_performance_rating',
    'startup_reputation',
    'venture_satisfaction',
]

# Clean/engineer each table, then one-hot encode it; dummy_na=True adds an
# extra indicator column per feature for missing values.
X_proc = pd.get_dummies(process_linear(X), columns=cat_cols, dummy_na=True)
X_test_proc = pd.get_dummies(process_linear(X_test), columns=cat_cols, dummy_na=True)

# Use the training columns as the reference layout: columns the test table
# lacks are created and filled with 0, test-only columns are dropped.
X_proc, X_test_proc = X_proc.align(X_test_proc, join='left', axis=1, fill_value=0)

# Standardise every column with statistics fitted on the training table only.
scaler = StandardScaler()
X_arr = scaler.fit(X_proc).transform(X_proc)
X_test_arr = scaler.transform(X_test_proc)

# Float32 tensors for the model; the target gets a trailing unit dimension
# so its shape lines up with the model's single-column output.
X_tensor = torch.tensor(X_arr, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_arr, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)[:, None]

# ============================================================================
# 3. PYTORCH LOGISTIC REGRESSION MODEL
# ============================================================================
class LogisticRegressionGPU(nn.Module):
    """Logistic regression expressed as a single linear layer.

    ``forward`` returns raw logits with one output column per sample; no
    sigmoid is applied inside the module, so callers must apply it
    themselves (or use a loss that works on logits) to get probabilities.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Return the logits for the batch ``x``."""
        logits = self.linear(x)
        return logits

class DataHelper(Dataset):
    """Dataset wrapper around indexable features and optional labels.

    Args:
        x: Indexable container of samples; its length is the dataset length.
        y: Optional indexable container of labels. When it is ``None`` the
            dataset yields samples only.
    """

    def __init__(self, x, y=None):
        self.x, self.y = x, y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        """Return ``(sample, label)`` when labels exist, else just the sample."""
        sample = self.x[idx]
        if self.y is None:
            return sample
        return sample, self.y[idx]

# ============================================================================
# 4. FINAL TRAINING (5-FOLD CV)
# ============================================================================
# Fixed Parameters (Robust defaults)
params = {
    'lr': 0.005,
    'weight_decay': 1e-4,
    'batch_size': 2048,
    'epochs': 50
}

# Single source of truth for the number of folds. The progress messages, the
# splitter and the test-prediction average must all agree on this value;
# otherwise changing the split count would silently mis-scale the ensemble.
N_SPLITS = 5

print(f"Starting {N_SPLITS}-Fold Training with params: {params}")

# Stratified K-fold: every sample gets exactly one out-of-fold prediction and
# the test predictions are averaged over the fold models for stability.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
test_preds_accum = np.zeros(len(X_test_tensor))
oof_preds = np.zeros(len(X_tensor))

# The test matrix is identical for every fold, so move it to the device once.
X_test_device = X_test_tensor.to(device)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_tensor, y)):
    print(f"  Fold {fold+1}/{N_SPLITS}...")

    # Prepare: slice this fold's train/validation rows and batch the train part.
    X_tr, X_val = X_tensor[train_idx], X_tensor[val_idx]
    y_tr, y_val = y_tensor[train_idx], y_tensor[val_idx]
    train_loader = DataLoader(DataHelper(X_tr, y_tr), batch_size=params['batch_size'], shuffle=True)

    # Init: a fresh model and optimizer per fold so no state leaks across folds.
    model = LogisticRegressionGPU(X_tensor.shape[1]).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
    # The model outputs logits; this loss applies the sigmoid internally.
    criterion = nn.BCEWithLogitsLoss()

    # Train: plain mini-batch loop for a fixed number of epochs.
    model.train()
    for epoch in range(params['epochs']):
        for x_b, y_b in train_loader:
            x_b, y_b = x_b.to(device), y_b.to(device)
            optimizer.zero_grad()
            out = model(x_b)
            loss = criterion(out, y_b)
            loss.backward()
            optimizer.step()

    # Predict: no gradients needed, convert logits to probabilities.
    model.eval()
    with torch.no_grad():
        # OOF: store probabilities at the validation rows' original positions.
        val_probs = torch.sigmoid(model(X_val.to(device))).cpu().numpy()
        oof_preds[val_idx] = val_probs.flatten()

        # Test: each fold contributes an equal share of the final average.
        test_out = model(X_test_device)
        test_preds_accum += torch.sigmoid(test_out).cpu().numpy().flatten() / N_SPLITS

# ============================================================================
# 5. THRESHOLD OPTIMIZATION & SUBMISSION
# ============================================================================
# Sweep decision thresholds over the out-of-fold probabilities and keep the
# first one that strictly improves macro F1; 0.5 stays as the fallback when
# nothing beats the initial score of 0.
best_f1, best_thresh = 0, 0.5
for candidate in np.arange(0.3, 0.7, 0.001):
    hard_labels = (oof_preds >= candidate).astype(int)
    score = f1_score(y, hard_labels, average='macro')
    if score <= best_f1:
        continue
    best_f1, best_thresh = score, candidate

print(f"\nFinal Macro F1: {best_f1:.4f} at Threshold: {best_thresh:.3f}")

# Apply the chosen threshold to the fold-averaged test probabilities.
final_preds = (test_preds_accum >= best_thresh).astype(int)

# Translate 1/0 back to the original label strings and write the submission.
status_labels = np.where(final_preds == 1, 'Left', 'Stayed').tolist()
sub = pd.DataFrame({
    'founder_id': test_ids,
    'retention_status': status_labels
})
sub.to_csv('submission_logreg_gpu.csv', index=False)
print("Saved 'submission_logreg_gpu.csv'")

ðŸš€ Using Device: cuda
Loading Data...
Feature Engineering...
Starting 5-Fold Training with params: {'lr': 0.005, 'weight_decay': 0.0001, 'batch_size': 2048, 'epochs': 50}
  Fold 1/5...
  Fold 2/5...
  Fold 3/5...
  Fold 4/5...
  Fold 5/5...

Final Macro F1: 0.7472 at Threshold: 0.500
Saved 'submission_logreg_gpu.csv'
