# Imports

In [2]:
import os
import cv2
import numpy as np
import pywt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import svm
from sklearn.pipeline import make_pipeline
import pandas as pd

#Haar Cascade Classifiers

In [3]:
face_cascade = cv2.CascadeClassifier('/content/drive/MyDrive/opencv/haarcascades/haarcascade_frontalface_default.xml')
eye_cascade = cv2.CascadeClassifier('/content/drive/MyDrive/opencv/haarcascades/haarcascade_eye.xml')

#Wavelet Transform Function

In [4]:
def w2d(img, mode='haar', level=1):
    imArray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    imArray = np.float32(imArray)
    imArray /= 255
    coeffs = pywt.wavedec2(imArray, mode, level=level)
    coeffs_H = list(coeffs)
    coeffs_H[0] *= 0
    imArray_H = pywt.waverec2(coeffs_H, mode)
    imArray_H *= 255
    imArray_H = np.uint8(imArray_H)
    return imArray_H

# Face and Eye Detector

In [5]:
def get_cropped_image_if_2_eyes(image_path):
    img = cv2.imread(image_path)
    if img is None:
        return None
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)
    for (x, y, w, h) in faces:
        roi_gray = gray[y:y + h, x:x + w]
        roi_color = img[y:y + h, x:x + w]
        eyes = eye_cascade.detectMultiScale(roi_gray)
        if len(eyes) >= 2:
            return roi_color
    return None


#Crop and Save Valid Face Images


In [6]:
path_to_data = "/content/drive/MyDrive/celebrity_dataset_final"
path_to_cr_data = os.path.join(path_to_data, "cropped")

#Collect image directories
img_dirs = [entry.path for entry in os.scandir(path_to_data) if entry.is_dir()]

#Clear cropped directory if it exists
import shutil
if os.path.exists(path_to_cr_data):
    shutil.rmtree(path_to_cr_data)
os.makedirs(path_to_cr_data)

In [7]:
cropped_image_dirs = []
celebrity_file_names_dict = {}

for img_dir in img_dirs:
    count = 1
    celebrity_name = os.path.basename(img_dir)
    celebrity_file_names_dict[celebrity_name] = []

    for entry in os.scandir(img_dir):
        if entry.is_file() and entry.name.lower().endswith(('.jpg', '.jpeg', '.png')):
            roi_color = get_cropped_image_if_2_eyes(entry.path)

            if roi_color is not None:
                cropped_folder = os.path.join(path_to_cr_data, celebrity_name)

                if not os.path.exists(cropped_folder):
                    os.makedirs(cropped_folder)
                    cropped_image_dirs.append(cropped_folder)
                    print(" Generating cropped images in folder:", cropped_folder)

                cropped_file_name = f"{celebrity_name}{count}.png"
                cropped_file_path = os.path.join(cropped_folder, cropped_file_name)

                cv2.imwrite(cropped_file_path, roi_color)
                celebrity_file_names_dict[celebrity_name].append(cropped_file_path)
                count += 1


 Generating cropped images in folder: /content/drive/MyDrive/celebrity_dataset_final/cropped/Allyson_Felix_athletics
 Generating cropped images in folder: /content/drive/MyDrive/celebrity_dataset_final/cropped/Anthony_Davis_basketball
 Generating cropped images in folder: /content/drive/MyDrive/celebrity_dataset_final/cropped/Antoine_Griezmann_football
 Generating cropped images in folder: /content/drive/MyDrive/celebrity_dataset_final/cropped/Aryna_Sabalenka_tennis
 Generating cropped images in folder: /content/drive/MyDrive/celebrity_dataset_final/cropped/Babar_Azam_cricket
 Generating cropped images in folder: /content/drive/MyDrive/celebrity_dataset_final/cropped/Ben_Stokes_cricket
 Generating cropped images in folder: /content/drive/MyDrive/celebrity_dataset_final/cropped/Carlos_Alcaraz_tennis
 Generating cropped images in folder: /content/drive/MyDrive/celebrity_dataset_final/cropped/chris evans - Google Search
 Generating cropped images in folder: /content/drive/MyDrive/celebrit

In [8]:
#Check loaded file lists from cropped folders
print("\n Verifying files per celebrity:")
for celeb, files in celebrity_file_names_dict.items():
    print(f"{celeb}: {len(files)} files")



 Verifying files per celebrity:
Allyson_Felix_athletics: 19 files
Anthony_Davis_basketball: 17 files
Antoine_Griezmann_football: 10 files
Aryna_Sabalenka_tennis: 25 files
Babar_Azam_cricket: 9 files
Ben_Stokes_cricket: 5 files
Carlos_Alcaraz_tennis: 15 files
chris evans - Google Search: 48 files
Carl_Lewis_athletics: 6 files
chris hemsworth - Google Search: 38 files
Coco_Gauff_tennis: 10 files
Cristiano_Ronaldo_football: 32 files
Daniil_Medvedev_tennis: 17 files
Elaine_Thompson-Herah_athletics: 18 files
Florence_Griffith-Joyner_athletics: 9 files
Hardik_Pandya_cricket: 20 files
Harry_Kane_football: 31 files
Erling_Haaland_football: 16 files
Giannis_Antetokounmpo_basketball: 27 files
Iga_Świątek_tennis: 3 files
James_Harden_basketball: 10 files
Jannik_Sinner_tennis: 18 files
Jayson_Tatum_basketball: 26 files
Jasprit_Bumrah_cricket: 16 files
jennifer lawrence - Google Search: 17 files
Joe_Root_cricket: 12 files
Kane_Williamson_cricket: 10 files
Karim_Benzema_football: 14 files
Kevin_D

In [9]:
# Filter out classes with fewer than N(min_images) images
min_images = 40
filtered_celebrity_file_names_dict = {
    celeb: files
    for celeb, files in celebrity_file_names_dict.items()
    if len(files) >= min_images
}

print(f"\nAfter filtering, {len(filtered_celebrity_file_names_dict)} classes kept out of {len(celebrity_file_names_dict)}")

# Update downstream variables
celebrity_file_names_dict = filtered_celebrity_file_names_dict
class_dict = {name: idx for idx, name in enumerate(celebrity_file_names_dict.keys())}



After filtering, 3 classes kept out of 70


In [10]:

# Feature Matrix (X) and Labels (y)
X, y = [], []
for celebrity_name, training_files in celebrity_file_names_dict.items():
    for training_image in training_files:
        img = cv2.imread(training_image)
        scalled_raw_img = cv2.resize(img, (32, 32))
        img_har = w2d(img, 'db1', 5)
        scalled_img_har = cv2.resize(img_har, (32, 32))
        combined_img = np.vstack((
            scalled_raw_img.reshape(32 * 32 * 3, 1),
            scalled_img_har.reshape(32 * 32, 1)
        ))
        X.append(combined_img)
        y.append(class_dict[celebrity_name])

X = np.array(X).reshape(len(X), 4096).astype(float)
y = np.array(y)


In [11]:
# Train/Test Split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


# Train and Test different Models

## GridSearchCV for Best Model

In [12]:
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto', probability=True),
        'params': {
            'svc__C': [1, 10, 100],
            'svc__kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'randomforestclassifier__n_estimators': [5, 10]
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': {
            'logisticregression__C': [1, 5, 10]
        }
    }
}



In [13]:
scores = []
best_estimators = {}

for algo, mp in model_params.items():
    pipe = make_pipeline(StandardScaler(), mp['model'])
    clf = GridSearchCV(pipe, mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': algo,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    best_estimators[algo] = clf.best_estimator_

df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
print("\n Model Comparison:\n", df)





 Model Comparison:
                  model  best_score  \
0                  svm    0.843333   
1        random_forest    0.695333   
2  logistic_regression    0.860333   

                                    best_params  
0        {'svc__C': 1, 'svc__kernel': 'linear'}  
1  {'randomforestclassifier__n_estimators': 10}  
2                  {'logisticregression__C': 1}  


## GridSearchCV for KNN

In [14]:
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

param_grid = {
    'knn__n_neighbors': [3, 4, 5,6, 7, 8, 9, 11, 12, 13, 14, 15],
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2]  # 1=Manhattan, 2=Euclidean
}

grid_search = GridSearchCV(
    knn_pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("\nBest Parameters:", grid_search.best_params_)
print(f"Best Cross Validation Accuracy: {grid_search.best_score_:.4f}")

#FINAL EVALUATION

best_knn = grid_search.best_estimator_

y_pred = best_knn.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\n Test Accuracy: {acc*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 48 candidates, totalling 240 fits

Best Parameters: {'knn__n_neighbors': 11, 'knn__p': 1, 'knn__weights': 'distance'}
Best Cross Validation Accuracy: 0.7360

 Test Accuracy: 70.73%

Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.45      0.53        11
           1       0.79      0.85      0.81        13
           2       0.68      0.76      0.72        17

    accuracy                           0.71        41
   macro avg       0.70      0.69      0.69        41
weighted avg       0.70      0.71      0.70        41



## XGBOOST Classifier

In [15]:
xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.01,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

xgb_model.fit(X_train, y_train)

# EVALUATE

y_pred = xgb_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\n Accuracy: {acc*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



 Accuracy: 78.05%

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.55      0.63        11
           1       0.79      0.85      0.81        13
           2       0.79      0.88      0.83        17

    accuracy                           0.78        41
   macro avg       0.78      0.76      0.76        41
weighted avg       0.78      0.78      0.77        41



# Stack Classifier

In [16]:
base_models = [
    ('svm', SVC(kernel='rbf', probability=True, C=5)),
    ('rf', RandomForestClassifier(n_estimators=150, random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=5, weights='distance')),
    ('xgb', XGBClassifier(
        n_estimators=150,
        learning_rate=0.1,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        use_label_encoder=False,
        eval_metric='mlogloss'
    ))
]

meta_model = LogisticRegression(max_iter=1000)

# STACKING CLASSIFIER PIPELINE

stack_model = Pipeline([
    ('scaler', StandardScaler()),
    ('stack', StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        cv=5,
        n_jobs=-1,
        passthrough=False
    ))
])


stack_model.fit(X_train, y_train)

# EVALUATION
y_pred = stack_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\n Stacking Model Accuracy: {acc*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


 Stacking Model Accuracy: 82.93%

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.64      0.74        11
           1       0.85      0.85      0.85        13
           2       0.80      0.94      0.86        17

    accuracy                           0.83        41
   macro avg       0.84      0.81      0.82        41
weighted avg       0.83      0.83      0.82        41



## GridSearchCV + Stack Classifier


In [17]:
#Tuning SVM
svm_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(probability=True))
])
svm_params = {'svm__C': [1, 5, 10], 'svm__kernel': ['linear', 'rbf']}
svm_grid = GridSearchCV(svm_pipe, svm_params, cv=3, n_jobs=-1, verbose=1)
svm_grid.fit(X_train, y_train)
svm_best = svm_grid.best_estimator_
print(" Best SVM:", svm_grid.best_params_)
#Tuning KNN
knn_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])
knn_params = {'knn__n_neighbors': [3, 5, 7], 'knn__weights': ['uniform', 'distance']}
knn_grid = GridSearchCV(knn_pipe, knn_params, cv=3, n_jobs=-1, verbose=1)
knn_grid.fit(X_train, y_train)
knn_best = knn_grid.best_estimator_
print(" Best KNN:", knn_grid.best_params_)

# Tuning RandomForest
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    {'n_estimators': [100, 150], 'max_depth': [None, 10, 20]},
    cv=3, n_jobs=-1, verbose=1
)
rf_grid.fit(X_train, y_train)
rf_best = rf_grid.best_estimator_
print(" Best RF:", rf_grid.best_params_)

# Tuning XGBoost
xgb_grid = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    {'n_estimators': [100, 200], 'max_depth': [3, 5], 'learning_rate': [0.05, 0.1]},
    cv=3, n_jobs=-1, verbose=1
)
xgb_grid.fit(X_train, y_train)
xgb_best = xgb_grid.best_estimator_
print(" Best XGB:", xgb_grid.best_params_)


# STACKING CLASSIFIER
stack_model = StackingClassifier(
    estimators=[
        ('svm', svm_best),
        ('rf', rf_best),
        ('knn', knn_best),
        ('xgb', xgb_best)
    ],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,
    n_jobs=-1
)

# Training final stacking model
stack_model.fit(X_train, y_train)


# EVALUATE
y_pred = stack_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\n Final Stacking Accuracy: {acc*100:.2f}%")

# SAVE MODEL
os.makedirs("models", exist_ok=True)
joblib.dump(stack_model, "models/stacking_tuned_model.pkl")
joblib.dump(class_dict, "models/class_dict.pkl")
print("\n Saved final stacked model with tuned base learners.")


Fitting 3 folds for each of 6 candidates, totalling 18 fits
 Best SVM: {'svm__C': 1, 'svm__kernel': 'linear'}
Fitting 3 folds for each of 6 candidates, totalling 18 fits
 Best KNN: {'knn__n_neighbors': 5, 'knn__weights': 'distance'}
Fitting 3 folds for each of 6 candidates, totalling 18 fits
 Best RF: {'max_depth': None, 'n_estimators': 150}
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Best XGB: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}

 Final Stacking Accuracy: 87.80%

 Saved final stacked model with tuned base learners.
