# Part 2: Facial Recognition System
## Formative 2 - Data Preprocessing Assignment

This notebook implements facial recognition for user authentication.

### Tasks:
1. Load and display facial images (neutral, smiling, surprised)
2. Apply image augmentations (rotation, flipping, grayscale, etc.)
3. Extract image features (embeddings, histograms)
4. Train facial recognition model
5. Evaluate model performance (Accuracy, F1-Score)
6. Demonstrate authentication system

In [None]:
# Import Required Libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from PIL import Image
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib
import warnings
# Suppress every warning for the rest of the session. Note that this also
# hides deprecation warnings raised by the libraries used in this notebook.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# (this seeds NumPy's global random number generator)
np.random.seed(42)

## 1. Image Data Collection and Display

Each group member should provide 3 images:
- Neutral expression
- Smiling expression
- Surprised expression

In [None]:
# Directory structure
# A notebook has no __file__, so every path is anchored at the current working
# directory (which is also what the quoted '__file__' literal resolved to).
BASE_DIR = os.getcwd()
IMAGE_DIR = os.path.join(BASE_DIR, 'images')
MODEL_DIR = os.path.join(BASE_DIR, 'models')
FEATURE_DIR = os.path.join(BASE_DIR, 'features')

# Create subdirectories for each member and expression
# Structure: images/member_name/expression/image.jpg
members = ['member1', 'member2', 'member3']  # Replace with actual names
expressions = ['neutral', 'smiling', 'surprised']

for member in members:
    for expression in expressions:
        path = os.path.join(IMAGE_DIR, member, expression)
        os.makedirs(path, exist_ok=True)

# The output folders must exist too: later cells write the feature CSV into
# FEATURE_DIR and the pickled models into MODEL_DIR.
for output_dir in (MODEL_DIR, FEATURE_DIR):
    os.makedirs(output_dir, exist_ok=True)

print("Directory structure created!")
print(f"Please place images in: {IMAGE_DIR}")
print("Format: images/member_name/expression/image.jpg")

In [None]:
def load_images(image_dir):
    """
    Load every readable image under ``image_dir/<member>/<expression>/``.

    Only files whose name ends in .png, .jpg or .jpeg (case-insensitive) are
    considered, and files that ``cv2.imread`` cannot decode are skipped.
    Directory entries are visited in sorted order so that the row order of the
    result is the same on every run.

    Args:
        image_dir: Root folder holding one sub-folder per member, each of
            which holds one sub-folder per expression.

    Returns:
        DataFrame with the columns 'path', 'member', 'expression', 'image'
        (pixel array converted from BGR to RGB) and 'filename'. The columns
        are present even when no image is found, so callers can safely index
        them on an empty result.
    """
    columns = ['path', 'member', 'expression', 'image', 'filename']
    images_data = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility
    for member in sorted(os.listdir(image_dir)):
        member_path = os.path.join(image_dir, member)
        if not os.path.isdir(member_path):
            continue

        for expression in sorted(os.listdir(member_path)):
            expression_path = os.path.join(member_path, expression)
            if not os.path.isdir(expression_path):
                continue

            for img_file in sorted(os.listdir(expression_path)):
                if not img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                    continue

                img_path = os.path.join(expression_path, img_file)
                img = cv2.imread(img_path)
                if img is None:
                    # Unreadable or corrupt file: leave it out of the dataset
                    continue

                images_data.append({
                    'path': img_path,
                    'member': member,
                    'expression': expression,
                    'image': cv2.cvtColor(img, cv2.COLOR_BGR2RGB),
                    'filename': img_file
                })

    # Explicit columns keep the schema stable when images_data is empty
    return pd.DataFrame(images_data, columns=columns)

# Load images
df_images = load_images(IMAGE_DIR)
print(f"Loaded {len(df_images)} images")

# Only summarise the labels when something was loaded: an empty result may
# not carry the 'member' / 'expression' columns at all.
if len(df_images) > 0:
    print(f"\nMembers: {df_images['member'].unique()}")
    print(f"Expressions: {df_images['expression'].unique()}")

In [None]:
def display_sample_images(df, num_samples=3):
    """
    Show a grid with one image per (member, expression) pair.

    Rows are members and columns are expressions, both in order of first
    appearance in ``df``. For each pair the first matching row is shown;
    pairs without any image get a 'No Image' placeholder.

    Args:
        df: DataFrame with 'member', 'expression' and 'image' columns.
        num_samples: Kept for backward compatibility; currently not used.
    """
    members = df['member'].unique()
    expressions = df['expression'].unique()

    # squeeze=False keeps `axes` two-dimensional for every grid size, so the
    # axes[i, j] indexing below also works with a single member and/or a
    # single expression.
    fig, axes = plt.subplots(len(members), len(expressions),
                             figsize=(15, 5*len(members)),
                             squeeze=False)

    for i, member in enumerate(members):
        for j, expression in enumerate(expressions):
            ax = axes[i, j]
            subset = df[(df['member'] == member) & (df['expression'] == expression)]

            if len(subset) > 0:
                ax.imshow(subset.iloc[0]['image'])
                ax.set_title(f"{member} - {expression}")
            else:
                ax.text(0.5, 0.5, 'No Image', ha='center', va='center')
            ax.axis('off')

    plt.tight_layout()
    plt.show()

# Show the sample grid, or tell the user where images are expected
if len(df_images) == 0:
    print("No images found. Please add images to the 'images' directory.")
else:
    display_sample_images(df_images)

## 2. Image Augmentation

Apply at least 2 augmentations per image:
- Rotation
- Horizontal flip
- Grayscale conversion
- Brightness adjustment
- Gaussian blur

In [None]:
def augment_image(img):
    """
    Produce the augmented variants of one image.

    The input is treated as a three-channel RGB array (the grayscale variant
    is computed with an RGB-to-gray conversion and expanded back to three
    channels).

    Returns: List of (label, image) tuples, in this order: 'original',
    'rotated_15', 'rotated_-15', 'flipped', 'grayscale', 'brightness_up',
    'brightness_down', 'blurred'. The 'original' entry is the input itself.
    """
    rows, cols = img.shape[:2]
    pivot = (cols // 2, rows // 2)

    def _rotate(angle):
        # Rotate around the image centre, keeping the original canvas size
        matrix = cv2.getRotationMatrix2D(pivot, angle, 1.0)
        return cv2.warpAffine(img, matrix, (cols, rows))

    mono = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    return [
        ('original', img),
        ('rotated_15', _rotate(15)),
        ('rotated_-15', _rotate(-15)),
        # flip code 1 mirrors around the vertical axis (horizontal flip)
        ('flipped', cv2.flip(img, 1)),
        ('grayscale', cv2.cvtColor(mono, cv2.COLOR_GRAY2RGB)),
        ('brightness_up', cv2.convertScaleAbs(img, alpha=1.2, beta=30)),
        ('brightness_down', cv2.convertScaleAbs(img, alpha=0.8, beta=-30)),
        ('blurred', cv2.GaussianBlur(img, (5, 5), 0)),
    ]

# Expand every loaded image into its augmented variants (one row per variant)
augmented_data = [
    {
        'member': source['member'],
        'expression': source['expression'],
        'augmentation': aug_type,
        'image': aug_img,
        'original_path': source['path']
    }
    for _, source in df_images.iterrows()
    for aug_type, aug_img in augment_image(source['image'])
]

df_augmented = pd.DataFrame(augmented_data)

n_original = len(df_images)
n_augmented = len(df_augmented)
# Guard the division so an empty image folder reports 0 instead of failing
per_image = n_augmented // n_original if n_original > 0 else 0

print(f"Total augmented images: {n_augmented}")
print(f"Original images: {n_original}")
print(f"Augmentations per image: {per_image}")

In [None]:
# Display augmentation examples
def display_augmentations(df, member_idx=0, expression='neutral'):
    """
    Plot the augmented variants of one member/expression on a 2x4 grid.

    ``member_idx`` selects a member by position among the unique 'member'
    values; an index past the end falls back to the first member. For each
    augmentation type the first matching row is drawn, and at most eight
    types are shown. Prints a message and returns without plotting when
    ``df`` is empty or nothing matches the selection.
    """
    if len(df) == 0:
        print("No images to display")
        return

    all_members = df['member'].unique()
    chosen_idx = member_idx if member_idx < len(all_members) else 0
    member = all_members[chosen_idx]

    matches = df[(df['member'] == member) & (df['expression'] == expression)]
    if len(matches) == 0:
        print(f"No images found for {member} - {expression}")
        return

    kinds = matches['augmentation'].unique()

    fig, grid = plt.subplots(2, 4, figsize=(16, 8))
    panels = grid.flatten()

    # zip stops at the shorter sequence, so no more than 8 variants are drawn
    for panel, kind in zip(panels, kinds):
        first_match = matches[matches['augmentation'] == kind].iloc[0]
        panel.imshow(first_match['image'])
        panel.set_title(f"{kind}")
        panel.axis('off')

    # Blank out whatever panels were not needed
    for panel in panels[len(kinds):]:
        panel.axis('off')

    plt.suptitle(f"Augmentations for {member} - {expression}", fontsize=16)
    plt.tight_layout()
    plt.show()

# Show the augmented variants of the first member's neutral image
if len(df_augmented) == 0:
    print("No augmented images to display")
else:
    display_augmentations(df_augmented, member_idx=0, expression='neutral')

## 3. Feature Extraction

Extract features from images:
- Color histograms (RGB channels)
- HOG (Histogram of Oriented Gradients)
- Statistical features (mean, std, etc.)
- Edge detection features

In [None]:
from skimage.feature import hog
from skimage import exposure

def extract_features(img, resize_shape=(128, 128)):
    """
    Build a fixed-length descriptor for one three-channel image.

    The image is resized to ``resize_shape`` and the vector is assembled in
    this order:
      1. a 32-bin histogram per channel, each normalised to sum to 1
      2. mean, std, median, min and max of each channel
      3. the fraction of pixels flagged by Canny edge detection (100/200)
      4. the first 100 values of the HOG descriptor of the grayscale image
      5. the variance of the Laplacian of the grayscale image

    Returns: Feature vector as a 1-D numpy array
    """
    resized = cv2.resize(img, resize_shape)
    vector = []

    # 1. Per-channel colour histograms, normalised by their total count
    for ch in range(3):
        counts = cv2.calcHist([resized], [ch], None, [32], [0, 256])
        vector.extend(counts.flatten() / counts.sum())

    # 2. Per-channel summary statistics
    for ch in range(3):
        plane = resized[:, :, ch]
        vector.extend(
            stat(plane) for stat in (np.mean, np.std, np.median, np.min, np.max)
        )

    # The remaining features work on a single-channel version of the image
    gray = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY)

    # 3. Edge density: share of pixels that Canny marks as an edge
    edge_map = cv2.Canny(gray, 100, 200)
    vector.append(np.sum(edge_map > 0) / edge_map.size)

    # 4. HOG descriptor, truncated to its first 100 values to keep the
    #    vector short
    descriptor = hog(gray, orientations=9, pixels_per_cell=(8, 8),
                     cells_per_block=(2, 2), visualize=False)
    vector.extend(descriptor[:100])

    # 5. Texture: variance of the Laplacian response
    vector.append(np.var(cv2.Laplacian(gray, cv2.CV_64F)))

    return np.array(vector)

# Turn every augmented image into one row of labels plus numbered features
print("Extracting features from images...")
feature_list = []

for _, sample in df_augmented.iterrows():
    record = {key: sample[key] for key in ('member', 'expression', 'augmentation')}

    # One column per feature value: feature_0, feature_1, ...
    record.update(
        (f'feature_{pos}', value)
        for pos, value in enumerate(extract_features(sample['image']))
    )

    feature_list.append(record)

df_features = pd.DataFrame(feature_list)
feature_count = sum('feature_' in col for col in df_features.columns)

print(f"\nFeature extraction complete!")
print(f"Shape: {df_features.shape}")
print(f"Features per image: {feature_count}")

In [None]:
# Save features to CSV
# Make sure the target folder exists first; to_csv does not create missing
# directories and would fail on a fresh checkout otherwise.
os.makedirs(FEATURE_DIR, exist_ok=True)
feature_csv_path = os.path.join(FEATURE_DIR, 'image_features.csv')
df_features.to_csv(feature_csv_path, index=False)
print(f"Features saved to: {feature_csv_path}")

# Display first few rows
print("\nFirst few rows of features:")
df_features.head()

## 4. Facial Recognition Model

Train a model to recognize different team members based on their facial features.

In [None]:
# Build the train/test matrices used by the models below
if len(df_features) == 0:
    print("No features available for training")
else:
    # Feature matrix: every 'feature_*' column; target: the member name
    feature_columns = [name for name in df_features.columns if 'feature_' in name]
    X = df_features[feature_columns].to_numpy()
    y = df_features['member'].to_numpy()

    # Map member names to integer class ids
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # 80/20 split, stratified so each member keeps its share in both parts.
    # NOTE(review): rows are split individually, so augmented variants of the
    # same photo can land on both sides of the split and make the test scores
    # optimistic - consider a group-aware split.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y_encoded,
        test_size=0.2,
        random_state=42,
        stratify=y_encoded,
    )

    # Fit the scaler on the training part only, then apply it to both parts
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")
    print(f"Number of features: {X.shape[1]}")
    print(f"Number of members: {len(label_encoder.classes_)}")
    print(f"Members: {label_encoder.classes_}")

In [None]:
# Fit and evaluate the Random Forest classifier
if len(df_features) > 0:
    print("Training Random Forest Classifier...")
    rf_model = RandomForestClassifier(n_estimators=100, max_depth=10,
                                      random_state=42, n_jobs=-1)
    rf_model.fit(X_train_scaled, y_train)

    # Predict on both splits so train and test scores can be compared
    y_pred_train = rf_model.predict(X_train_scaled)
    y_pred_test = rf_model.predict(X_test_scaled)

    train_accuracy, test_accuracy = (
        accuracy_score(y_train, y_pred_train),
        accuracy_score(y_test, y_pred_test),
    )
    train_f1, test_f1 = (
        f1_score(y_train, y_pred_train, average='weighted'),
        f1_score(y_test, y_pred_test, average='weighted'),
    )

    print(f"\n--- Random Forest Results ---")
    for caption, score in (
        ("Training Accuracy", train_accuracy),
        ("Test Accuracy", test_accuracy),
        ("Training F1-Score", train_f1),
        ("Test F1-Score", test_f1),
    ):
        print(f"{caption}: {score:.4f}")

    # Per-member precision / recall / F1 on the test split
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_test,
                                target_names=label_encoder.classes_))

In [None]:
# Train Logistic Regression Model
if len(df_features) > 0:
    print("Training Logistic Regression...")
    # `multi_class` is intentionally not passed: it is deprecated since
    # scikit-learn 1.5 and releases that dropped it reject the argument.
    # With the default solver, more than two classes are already fitted
    # with a multinomial model.
    lr_model = LogisticRegression(
        max_iter=1000,
        random_state=42
    )

    lr_model.fit(X_train_scaled, y_train)

    # Predictions on both splits so train and test scores can be compared
    y_pred_train_lr = lr_model.predict(X_train_scaled)
    y_pred_test_lr = lr_model.predict(X_test_scaled)

    # Evaluate (F1 is averaged over classes, weighted by class support)
    train_accuracy_lr = accuracy_score(y_train, y_pred_train_lr)
    test_accuracy_lr = accuracy_score(y_test, y_pred_test_lr)
    train_f1_lr = f1_score(y_train, y_pred_train_lr, average='weighted')
    test_f1_lr = f1_score(y_test, y_pred_test_lr, average='weighted')

    print(f"\n--- Logistic Regression Results ---")
    print(f"Training Accuracy: {train_accuracy_lr:.4f}")
    print(f"Test Accuracy: {test_accuracy_lr:.4f}")
    print(f"Training F1-Score: {train_f1_lr:.4f}")
    print(f"Test F1-Score: {test_f1_lr:.4f}")

    # Classification Report (per-member precision / recall / F1 on the test split)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_test_lr,
                                target_names=label_encoder.classes_))

In [None]:
# Side-by-side confusion matrices for both models on the test split
if len(df_features) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    cm_rf = confusion_matrix(y_test, y_pred_test)
    cm_lr = confusion_matrix(y_test, y_pred_test_lr)
    class_names = label_encoder.classes_

    # (matrix, colour map, title) for the left and right panel
    panels = [
        (cm_rf, 'Blues', 'Random Forest - Confusion Matrix'),
        (cm_lr, 'Greens', 'Logistic Regression - Confusion Matrix'),
    ]

    for ax, (matrix, palette, heading) in zip(axes, panels):
        sns.heatmap(matrix, annot=True, fmt='d', cmap=palette,
                    xticklabels=class_names,
                    yticklabels=class_names,
                    ax=ax)
        ax.set_title(heading)
        ax.set_ylabel('True Label')
        ax.set_xlabel('Predicted Label')

    plt.tight_layout()
    plt.show()

In [None]:
# Tabulate and chart the train/test scores of both models
if len(df_features) > 0:
    comparison_df = pd.DataFrame({
        'Model': ['Random Forest', 'Logistic Regression'],
        'Train Accuracy': [train_accuracy, train_accuracy_lr],
        'Test Accuracy': [test_accuracy, test_accuracy_lr],
        'Train F1-Score': [train_f1, train_f1_lr],
        'Test F1-Score': [test_f1, test_f1_lr]
    })

    print("\n=== Model Performance Comparison ===")
    print(comparison_df.to_string(index=False))

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # (title, plotted columns, extra plot options) for the left and right chart;
    # only the F1 chart overrides the default bar colours
    charts = [
        ('Accuracy Comparison',
         ['Train Accuracy', 'Test Accuracy'],
         {}),
        ('F1-Score Comparison',
         ['Train F1-Score', 'Test F1-Score'],
         {'color': ['green', 'orange']}),
    ]

    for ax, (heading, series, extra) in zip(axes, charts):
        comparison_df.plot(x='Model', y=series, kind='bar', ax=ax, rot=0, **extra)
        ax.set_title(heading)
        ax.set_ylabel('Score')
        # Leave headroom above 1.0 so full-height bars are not clipped
        ax.set_ylim([0, 1.1])
        ax.legend(series)

    plt.tight_layout()
    plt.show()

## 5. Save Models and Preprocessing Objects

In [None]:
# Save models and preprocessing objects
if len(df_features) > 0:
    # joblib.dump does not create missing folders, so make sure the target
    # directory exists before writing anything into it.
    os.makedirs(MODEL_DIR, exist_ok=True)

    # Save Random Forest model
    rf_model_path = os.path.join(MODEL_DIR, 'face_recognition_rf.pkl')
    joblib.dump(rf_model, rf_model_path)
    print(f"Random Forest model saved to: {rf_model_path}")

    # Save Logistic Regression model
    lr_model_path = os.path.join(MODEL_DIR, 'face_recognition_lr.pkl')
    joblib.dump(lr_model, lr_model_path)
    print(f"Logistic Regression model saved to: {lr_model_path}")

    # Save scaler (needed to transform features the same way at prediction time)
    scaler_path = os.path.join(MODEL_DIR, 'scaler.pkl')
    joblib.dump(scaler, scaler_path)
    print(f"Scaler saved to: {scaler_path}")

    # Save label encoder (needed to map predicted class ids back to member names)
    encoder_path = os.path.join(MODEL_DIR, 'label_encoder.pkl')
    joblib.dump(label_encoder, encoder_path)
    print(f"Label encoder saved to: {encoder_path}")

    print("\n✓ All models and preprocessing objects saved successfully!")

## 6. Facial Recognition Demo

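Test the facial recognition system on an image. The demo below runs on the first loaded image; pass the path of a new image to `recognize_face` to test unseen data.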

In [None]:
def recognize_face(image_path, model, scaler, label_encoder, threshold=0.5):
    """
    Identify the member shown in an image file.

    Args:
        image_path: Path to the image file
        model: Trained classifier exposing predict and predict_proba
        scaler: Fitted feature scaler applied before prediction
        label_encoder: Fitted label encoder used to recover member names
        threshold: Minimum top-class probability required to accept a match

    Returns:
        Dictionary with recognition results. It always has 'recognized';
        a match adds 'member', 'confidence' and 'all_probabilities', a
        rejected match adds 'reason' and 'confidence', and an unreadable
        file adds 'error'.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        return {'recognized': False, 'error': 'Could not load image'}

    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    # Same feature pipeline as in training, shaped as a single-row matrix
    sample = scaler.transform(extract_features(rgb).reshape(1, -1))

    prediction = model.predict(sample)[0]
    probabilities = model.predict_proba(sample)[0]
    confidence = probabilities.max()

    # Reject the match when the classifier is not sure enough
    if not confidence >= threshold:
        return {
            'recognized': False,
            'reason': 'Low confidence',
            'confidence': confidence
        }

    return {
        'recognized': True,
        'member': label_encoder.inverse_transform([prediction])[0],
        'confidence': confidence,
        'all_probabilities': dict(zip(label_encoder.classes_, probabilities))
    }

# Run the demo on the first loaded image. This image comes from the dataset
# the models were built from, so it is not guaranteed to be unseen.
if len(df_features) > 0 and len(df_images) > 0:
    test_image_path = df_images.iloc[0]['path']

    print("Testing facial recognition...")
    result = recognize_face(test_image_path, rf_model, scaler, label_encoder)

    # Reload the file for display (OpenCV reads BGR, matplotlib expects RGB)
    img = cv2.imread(test_image_path)
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(8, 6))
    plt.imshow(img_rgb)

    # Green title for a match, red title otherwise
    if result['recognized']:
        title = f"✓ Recognized: {result['member']}\nConfidence: {result['confidence']:.2%}"
        title_color = 'green'
    else:
        title = f"✗ Not Recognized\n{result.get('reason', 'Unknown')}"
        title_color = 'red'
    plt.title(title, color=title_color, fontsize=14, fontweight='bold')

    plt.axis('off')
    plt.tight_layout()
    plt.show()

    print("\nRecognition Result:")
    print(result)

## 7. Feature Importance Analysis

In [None]:
# Bar chart of the features the Random Forest relies on most
if len(df_features) > 0:
    feature_importance = rf_model.feature_importances_

    # argsort is ascending, so the last top_n positions are the largest values
    top_n = 20
    indices = np.argsort(feature_importance)[-top_n:]

    bar_positions = range(top_n)
    bar_labels = [f'Feature {i}' for i in indices]

    plt.figure(figsize=(10, 8))
    plt.barh(bar_positions, feature_importance[indices])
    plt.yticks(bar_positions, bar_labels)
    plt.xlabel('Importance')
    plt.title(f'Top {top_n} Most Important Features')
    plt.tight_layout()
    plt.show()

## Summary

### Completed Tasks:
1. ✓ Image data collection and display (3 expressions per member)
2. ✓ Image augmentation (7 augmentations plus the original, i.e. 8 variants per image)
3. ✓ Feature extraction (histograms, HOG, statistical features)
4. ✓ Saved features to `image_features.csv`
5. ✓ Trained facial recognition models (Random Forest & Logistic Regression)
6. ✓ Model evaluation (Accuracy, F1-Score)
7. ✓ Saved trained models for deployment
8. ✓ Created recognition demo function

### Next Steps:
- Add your team members' facial images to the `images` folder
- Re-run the notebook to train on actual data
- Integrate with voice verification and product recommendation models
- Create command-line demo application