# Implementation: The "All-In" Supervised Pipeline

**Scenario**: Credit Risk Assessment.
**Goal**: Benchmark **7 Classifiers** to find the absolute best predictor.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# The Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# 1. Data Generation: synthetic credit-risk applicants
np.random.seed(42)  # fixed seed so every run draws the same rows
n = 500  # small sample so the SVM / grid-search demo stays fast

# The four draws below consume the global RNG in sequence; their order
# determines the generated values, so it must not be rearranged.
age = np.random.randint(18, 70, n)
income = np.random.normal(50000, 15000, n)
debt = np.random.normal(5000, 3000, n)
employment = np.random.choice(['Employed', 'Self-Employed', 'Unemployed'], n)

# Target Logic: debt relative to (income + 1000), scaled by 50.
# Age and employment status do not enter the label.
debt_to_income = debt / (income + 1000)
risk_score = debt_to_income * 50
# A score strictly above 1.5 is labelled as a default (1), otherwise 0.
y = np.where(risk_score > 1.5, 1, 0).tolist()

df = pd.DataFrame({
    'Age': age,
    'Income': income,
    'Debt': debt,
    'Employment': employment,
    'Default': y,
})

# 2. Preprocessing
numeric_features = ['Age', 'Income', 'Debt']  # imputed, then standardized
categorical_features = ['Employment']  # imputed with the mode, then one-hot encoded

# Note: Standardization is CRITICAL for KNN, SVM, and Logistic Regression
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': a category that was absent when the encoder
    # was fitted is encoded as all zeros instead of raising at predict time.
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    # Always emit a dense array: the one-hot branch is sparse by default and
    # GaussianNB (one of the benchmarked models) cannot consume sparse input.
    sparse_threshold=0,
)

# The classifier step is only a placeholder; the grid search swaps in each
# candidate estimator through the 'classifier' parameter.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression())])

features = df.drop('Default', axis=1)
target = df['Default']
# stratify keeps the default / non-default ratio the same in both splits,
# so the 20% hold-out stays representative of the training data.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42, stratify=target
)

# 3. The Grand Grid Search (7 Algorithms)
# Each dict is an independent sub-grid: the 'classifier' entry replaces the
# pipeline's placeholder estimator and the 'classifier__*' entries tune it.
# Estimators with internal randomness get a fixed random_state so that the
# ranking is reproducible from run to run (the global NumPy seed is not a
# reliable substitute once n_jobs=-1 spreads the fits over worker processes).
param_grid = [
    # 1. Logistic Regression
    {'classifier': [LogisticRegression()], 'classifier__C': [0.1, 1, 10]},

    # 2. KNN (Must scale!)
    {'classifier': [KNeighborsClassifier()], 'classifier__n_neighbors': [3, 5, 7]},

    # 3. SVM (Requires Scaling)
    # NOTE(review): probability=True is not needed for accuracy scoring and
    # slows fitting; kept in case predict_proba is used on the winner later.
    {'classifier': [SVC(probability=True, random_state=42)],
     'classifier__kernel': ['linear', 'rbf'], 'classifier__C': [1, 10]},

    # 4. Naive Bayes (Fast baseline, nothing to tune here)
    {'classifier': [GaussianNB()]},

    # 5. Decision Tree
    {'classifier': [DecisionTreeClassifier(random_state=42)],
     'classifier__max_depth': [3, 5, 10]},

    # 6. Random Forest (Robust)
    {'classifier': [RandomForestClassifier(random_state=42)],
     'classifier__n_estimators': [50, 100]},

    # 7. XGBoost (Performance king)
    {'classifier': [XGBClassifier(eval_metric='logloss', random_state=42)],
     'classifier__learning_rate': [0.1]}
]

print("Training 7 algorithms with Hyperparameter Tuning... (This might take a moment)")
# 3-fold cross-validation on the training split only; every candidate is
# scored by accuracy and all available cores are used.
grid = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)

# 4. Results
# One row per (estimator, hyperparameter combination) tried by the search.
results_df = pd.DataFrame(grid.cv_results_)
# .copy() turns the column subset into an independent frame, so adding the
# 'Algorithm' column below cannot trigger pandas' SettingWithCopyWarning.
results_df = results_df[['param_classifier', 'mean_test_score', 'rank_test_score']].copy()
# The estimator's repr looks like "Name(...)"; keep only the class name.
results_df['Algorithm'] = results_df['param_classifier'].astype(str).str.split('(').str[0]
display(results_df.sort_values('rank_test_score').head(10))

print(f"\nWinner: {grid.best_params_['classifier']}")
# The cross-validated scores above were used to pick the winner, so they are
# optimistic; report accuracy on the untouched hold-out split as well.
print(f"Held-out test accuracy: {grid.score(X_test, y_test):.3f}")