<a href="https://colab.research.google.com/github/MikeManzi/multimodel-data-preprocessing/blob/main/Face_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss, classification_report
from imblearn.over_sampling import SMOTE


In [47]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder # Import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss, classification_report
from imblearn.over_sampling import SMOTE

class FacialRecognitionModel:
    """
    Improved Facial Recognition Model for user authentication
    with PCA, feature selection, and hybrid model selection.
    """

    def __init__(self, model_type='hybrid'):
        self.model_type = model_type
        self.model = None
        self.scaler = StandardScaler()
        self.pca = None
        self.selector = None
        self.label_encoder = LabelEncoder() # Initialize LabelEncoder

    def preprocess_data(self, X, y):
        print(f"\nLoaded dataset with {X.shape[0]} rows and {X.shape[1]} columns.")

        # Encode string labels to numerical labels
        y_encoded = self.label_encoder.fit_transform(y)

        # Handle class imbalance
        smote = SMOTE(random_state=42)
        X_res, y_res = smote.fit_resample(X, y_encoded) # Use encoded y
        print(f"Balanced dataset: {X_res.shape[0]} samples after SMOTE.")

        # Scale features
        X_scaled = self.scaler.fit_transform(X_res)

        # Select top K features
        print("Applying SelectKBest feature selection...")
        self.selector = SelectKBest(score_func=f_classif, k=min(300, X_scaled.shape[1]))
        X_selected = self.selector.fit_transform(X_scaled, y_res) # Use encoded y

        # Apply PCA to reduce noise and redundancy
        print("Applying PCA dimensionality reduction...")
        self.pca = PCA(n_components=0.95, random_state=42)
        X_reduced = self.pca.fit_transform(X_selected)

        return X_reduced, y_res

    def train(self, X, y):
        X_proc, y_proc = self.preprocess_data(X, y)
        X_train, X_test, y_train, y_test = train_test_split(
            X_proc, y_proc, test_size=0.2, random_state=42, stratify=y_proc
        )

        print("\n" + "="*60)
        print(f"Training Facial Recognition Model ({self.model_type})")
        print("="*60)

        if self.model_type == 'hybrid':
            # Compare RandomForest and XGBoost
            rf = RandomForestClassifier(random_state=42)
            xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

            rf_cv = np.mean(cross_val_score(rf, X_train, y_train, cv=5))
            xgb_cv = np.mean(cross_val_score(xgb, X_train, y_train, cv=5))

            print(f"RF CV Accuracy: {rf_cv:.4f} | XGB CV Accuracy: {xgb_cv:.4f}")
            best_model = rf if rf_cv >= xgb_cv else xgb

            # Fine-tune RandomForest if chosen
            if isinstance(best_model, RandomForestClassifier):
                print("Running GridSearchCV for RandomForest tuning...")
                param_grid = {
                    'n_estimators': [100, 200, 300],
                    'max_depth': [10, 20, None],
                    'min_samples_split': [2, 5, 10]
                }
                grid = GridSearchCV(best_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
                grid.fit(X_train, y_train)
                self.model = grid.best_estimator_
                print(f"Best RF Params: {grid.best_params_}")
            else:
                self.model = best_model.fit(X_train, y_train)

        else:
            if self.model_type == 'random_forest':
                self.model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
            elif self.model_type == 'xgboost':
                self.model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42).fit(X_train, y_train)
            else:
                raise ValueError("Unsupported model type")

        y_pred = self.model.predict(X_test)
        y_prob = self.model.predict_proba(X_test)
        self.evaluate_model(y_test, y_pred, y_prob)
        joblib.dump(self, 'facial_recognition_model.pkl')
        print("Model saved to facial_recognition_model.pkl")

    def evaluate_model(self, y_true, y_pred, y_prob):
        print("\n" + "="*60)
        print("MODEL EVALUATION RESULTS")
        print("="*60)
        # Convert numerical labels back to original strings for report
        y_true_labels = self.label_encoder.inverse_transform(y_true)
        y_pred_labels = self.label_encoder.inverse_transform(y_pred)

        print("Accuracy: ", round(accuracy_score(y_true_labels, y_pred_labels), 4))
        print("F1-Score: ", round(f1_score(y_true_labels, y_pred_labels, average='weighted'), 4))
        print("Log Loss: ", round(log_loss(y_true_labels, y_prob), 4))
        try:
            roc_auc = roc_auc_score(pd.get_dummies(y_true_labels), y_prob, multi_class='ovr')
            print("ROC-AUC:  ", round(roc_auc, 4))
        except:
            print("ROC-AUC:  (not available for single-class test set)")
        print("\nClassification Report:\n", classification_report(y_true_labels, y_pred_labels))

    def predict(self, X_new):
        X_scaled = self.scaler.transform(X_new)
        X_selected = self.selector.transform(X_scaled)
        X_reduced = self.pca.transform(X_selected)
        probs = self.model.predict_proba(X_reduced)
        preds = self.model.predict(X_reduced)
        conf = np.max(probs, axis=1)

        # Convert numerical predictions back to original string labels
        predicted_labels = self.label_encoder.inverse_transform(preds)
        return predicted_labels, conf

In [42]:
data = pd.read_csv("image_features.csv")
print(data.columns)


Index(['filename', 'augmentation', 'width', 'height', 'feature_0', 'feature_1',
       'feature_2', 'feature_3', 'feature_4', 'feature_5',
       ...
       'feature_1802', 'feature_1803', 'feature_1804', 'feature_1805',
       'feature_1806', 'feature_1807', 'feature_1808', 'feature_1809',
       'feature_1810', 'feature_1811'],
      dtype='object', length=1816)


In [44]:
if __name__ == "__main__":
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    # Load dataset
    data = pd.read_csv("image_features.csv")

    # Extract user identity from filename
    data['user_id'] = data['filename'].apply(lambda x: x.split('-')[0])

    # Features
    feature_cols = [col for col in data.columns if col.startswith('feature_')]
    X = data[feature_cols]

    y = data['user_id']
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Initialize and train model
    model = FacialRecognitionModel(model_type='hybrid')
    model.train(X, y_encoded)

    # Predict on first 5 samples
    print("\nMaking predictions on first 5 samples...")
    sample = X.head(5)
    preds, confs = model.predict(sample)

    # Decode predictions back to original user names
    preds_labels = le.inverse_transform(preds)

    for i, (p, c) in enumerate(zip(preds_labels, confs)):
        print(f"Sample {i+1}: Predicted User ‚Üí {p} | Confidence: {c:.2f}")



Loaded dataset with 165 rows and 1812 columns.
Balanced dataset: 180 samples after SMOTE.
Applying SelectKBest feature selection...
Applying PCA dimensionality reduction...

Training Facial Recognition Model (hybrid)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


RF CV Accuracy: 0.9443 | XGB CV Accuracy: 0.9443


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



MODEL EVALUATION RESULTS
Accuracy:  0.9167
F1-Score:  0.9166
Log Loss:  0.1976
ROC-AUC:   0.9928

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.89      0.89         9
           1       1.00      0.89      0.94         9
           2       0.90      1.00      0.95         9
           3       0.89      0.89      0.89         9

    accuracy                           0.92        36
   macro avg       0.92      0.92      0.92        36
weighted avg       0.92      0.92      0.92        36

‚úÖ Model saved to facial_recognition_model.pkl

üîç Making predictions on first 5 samples...
Sample 1: Predicted User ‚Üí Best | Confidence: 1.00
Sample 2: Predicted User ‚Üí Best | Confidence: 0.99
Sample 3: Predicted User ‚Üí Best | Confidence: 0.99
Sample 4: Predicted User ‚Üí Best | Confidence: 0.99
Sample 5: Predicted User ‚Üí Best | Confidence: 0.96
