In [3]:
# Block 1: Data Loading and Initial Setup
import pandas as pd
from sklearn.model_selection import train_test_split

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num', 
    'marital_status', 'occupation', 'relationship', 'race', 
    'sex', 'capital_gain', 'capital_loss', 
    'hours_per_week', 'native_country', 'income'
]
data = pd.read_csv(url, header=None, names=column_names, na_values='?')

print("Initial Data Shape:", data.shape)
data.dropna(inplace=True)
print("After NA removal:", data.shape)

# Block 2: Feature-Target Split
X = data.drop('income', axis=1)
y = (data['income'] == ' >50K').astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print("Train/Test Split Complete")

# Block 3: Preprocessing Pipeline Creation
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Categorical and numerical columns
cat_cols = ['workclass', 'education', 'marital_status', 
            'occupation', 'relationship', 'race', 'sex', 
            'native_country']
num_cols = ['age', 'fnlwgt', 'education_num', 'capital_gain', 
            'capital_loss', 'hours_per_week']

# Preprocessing for numerical data: use SimpleImputer (if needed)
# Numerical imputation (uncomment if data has nans)
numerical_transformer = SimpleImputer(strategy='median')

# Preprocessing for categorical data: one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

# Block 4: Model Pipeline with AdaBoost
# Modern version (scikit-learn >=0.24)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=2),
        n_estimators=100,
        learning_rate=0.1,
        random_state=42
    ))
])

# Training Phase
print("Training Model...")
model.fit(X_train, y_train)
print("Training Complete")

# Block 5: Prediction and Metrics
from sklearn.metrics import (
    accuracy_score, roc_auc_score, f1_score, confusion_matrix,
    roc_curve, classification_report
)

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Performance Reports
print("Accuracy:", accuracy_score(y_test, y_pred))
print("AUC-ROC:", roc_auc_score(y_test, y_proba))
print("F1 Score:", f1_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Block 6: Confusion Matrix Visualization
import matplotlib.pyplot as plt
import seaborn as sns

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['<=50K', '>50K'],
            yticklabels=['<=50K', '>50K'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Block 7: ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr, color='darkorange', lw=2,
         label=f'ROC curve (AUC = {roc_auc_score(y_test, y_proba):.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

# Block 8: Feature Importance Visualization
# Extract from base estimator
estimator = model.named_steps['classifier']
importances = estimator.feature_importances_
features = preprocessor.get_feature_names_out()

# Sort features by importance
sorted_idx = importances.argsort()
plt.figure(figsize=(12, 8))
plt.barh(range(len(importances)), importances[sorted_idx])
plt.yticks(range(len(importances)), features[sorted_idx])
plt.xlabel('Feature Importance Score')
plt.title('Feature Importance for Income Prediction')
plt.tight_layout()
plt.show()

# Block 9: Learning Curve for Model Complexity
# Alternative learning curve (requires manually tuning parameters)
# This shows how model performance changes with training size
from sklearn.model_selection import learning_curve

train_sizes, train_scores, val_scores = learning_curve(
    model,
    X, y,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring='f1'
)

plt.figure(figsize=(10, 6))
plt.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', color='yellowgreen',
         label='Training Score')
plt.plot(train_sizes, np.mean(val_scores, axis=1), 'o-', color='navy',
         label='Validation Score')
plt.xlabel('Training Examples')
plt.ylabel('F1 Score')
plt.title('Learning Curve')
plt.legend()
plt.grid(True)
plt.show()

print("All visualizations complete.")

Initial Data Shape: (32561, 15)
After NA removal: (32561, 15)
Train/Test Split Complete


TypeError: AdaBoostClassifier.__init__() got an unexpected keyword argument 'base_estimator'