In [5]:
import numpy as np
from sklearn.datasets import fetch_lfw_people
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load the Labeled Faces in the Wild (LFW) people dataset, restricted to
# people with at least 70 images, with every image rescaled by a factor of 0.4.
# The loader returns a Bunch of NumPy arrays (not a DataFrame).
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
X = lfw_people.data  # feature matrix, one row per image
y = lfw_people.target  # integer class label per image
target_names = lfw_people.target_names  # person names, used to label the reports

# Hold out 25% of the samples as the test set; the fixed seed makes the
# partition reproducible across runs.
# NOTE(review): the split is not stratified (no `stratify=y`), so class
# proportions may differ between train and test - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Reduce dimensionality with PCA: keep 150 whitened components, computed with
# the randomized SVD solver under a fixed seed.
pca = PCA(
    n_components=150,
    whiten=True,
    svd_solver='randomized',
    random_state=42,
)
# The projection is learned from the training split only and then applied
# unchanged to the test split, so the test data never influences the fit.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Every model below is fed the PCA-transformed features directly, so none of
# them is wrapped in a Pipeline.

# Class-balanced SVC used as the base estimator of BOTH the bagging and the
# boosting ensembles (despite the "for_boosting" suffix in its name).
# probability=True makes the SVC expose predict_proba.
base_svm_for_boosting = SVC(class_weight='balanced', random_state=42, probability=True)

# Bagging ensemble of 10 SVCs.
bagging_model = BaggingClassifier(
    estimator=base_svm_for_boosting,
    n_estimators=10,
    random_state=42,
)

# AdaBoost ensemble of up to 10 SVCs; SVC is usable here because its fit()
# accepts sample_weight.
boosting_model = AdaBoostClassifier(
    estimator=base_svm_for_boosting,
    n_estimators=10,
    random_state=42,
)

# Stacking ensemble: a 10-tree random forest and a logistic regression are the
# first-level learners; a class-balanced SVC combines their outputs.
estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('lr', LogisticRegression(random_state=42)),
]
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=SVC(class_weight='balanced', random_state=42),
)


# Train and evaluate each model on the same PCA features.

# Baseline: a standalone SVC with the same settings as the ensembles' base
# estimator, kept as a separate instance that is not passed to any ensemble.
# NOTE(review): probability=True adds an internal cross-validated calibration
# step to fit(); only predict() is called in this cell, so it could be dropped
# if predict_proba is not needed elsewhere - confirm before changing.
base_svm_standalone = SVC(class_weight='balanced', random_state=42, probability=True)

# Display name -> estimator, in the order the reports are printed.
models = {
    'Base SVM': base_svm_standalone,
    'Bagging': bagging_model,
    'Boosting': boosting_model,
    'Stacking': stacking_model,
}

for name, model in models.items():
    print(f"\n--- {name} Results ---")
    # Fit on the PCA-transformed training data, then predict the held-out set
    # (fit() returns the estimator itself, so the calls can be chained).
    y_pred = model.fit(X_train_pca, y_train).predict(X_test_pca)
    # zero_division=0 reports 0 for metrics whose denominator is zero.
    report = classification_report(
        y_test,
        y_pred,
        target_names=target_names,
        zero_division=0,
    )
    print(report)


--- Base SVM Results ---
                   precision    recall  f1-score   support

     Ariel Sharon       1.00      0.54      0.70        13
     Colin Powell       0.66      0.98      0.79        60
  Donald Rumsfeld       0.89      0.59      0.71        27
    George W Bush       0.89      0.92      0.91       146
Gerhard Schroeder       0.95      0.76      0.84        25
      Hugo Chavez       1.00      0.47      0.64        15
       Tony Blair       0.97      0.81      0.88        36

         accuracy                           0.84       322
        macro avg       0.91      0.72      0.78       322
     weighted avg       0.87      0.84      0.84       322


--- Bagging Results ---
                   precision    recall  f1-score   support

     Ariel Sharon       0.67      0.77      0.71        13
     Colin Powell       0.82      0.90      0.86        60
  Donald Rumsfeld       0.86      0.70      0.78        27
    George W Bush       0.93      0.94      0.93       146
G