In [1]:
import cv2 as cv
import os
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import joblib
from transformers import ViTModel, ViTFeatureExtractor
import torch
import sys
sys.path.append('/Users/nadunsenarathne/Downloads/Documents/IIT/4th Year/FYP/CricXpert/Face_Recognition/yolov5')
from yolov5 import YOLOv5
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# import yolov5

# Import YOLOv5 for face detection
from yolov5.models.experimental import attempt_load
# from yolov5.utils.datasets import LoadImages
from yolov5.utils.general import non_max_suppression
from yolov5.utils.torch_utils import select_device

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

class FACELOADING:
    def __init__(self, directory):
        self.directory = directory
        self.target_size = (224, 224)
        self.model = YOLOv5('yolov5s.pt', device='cpu')

    def extract_face(self, filename):
        img = cv.imread(filename)
        if img is None:
            print(f"Failed to load image {filename}")
            return None
        img = cv.cvtColor(img, cv.COLOR_BGR2RGB)

        # Predict using the model
        results = self.model.predict(img, size=640)  # Adjust size as necessary for your model

        # Process results
        if results.xyxy[0].shape[0]:  # Check if any detections were made
            x1, y1, x2, y2 = results.xyxy[0][0][:4]  # Coordinates of the first detection
            face = img[int(y1):int(y2), int(x1):int(x2)]
            face_arr = cv.resize(face, self.target_size)
            return face_arr
        else:
            print(f"No faces detected in {filename}")
            return None

    def load_faces(self, dir):
        faces = []
        # Define accepted image extensions
        valid_extensions = ('.jpg', '.jpeg')
        for im_name in os.listdir(dir):
            # Check for valid image extensions before processing
            if im_name.lower().endswith(valid_extensions):
                path = os.path.join(dir, im_name)
                if os.path.isfile(path):
                    try:
                        single_face = self.extract_face(path)
                        if single_face is not None:
                            faces.append(single_face)
                    except Exception as e:
                        print(f"Error processing {path}: {e}")
        return faces

    def load_classes(self):
        self.X = []  # List to hold image data
        self.Y = []  # List to hold labels
        for sub_dir in os.listdir(self.directory):
            path = os.path.join(self.directory, sub_dir)
            if os.path.isdir(path):
                faces = self.load_faces(path)
                self.X.extend(faces)
                self.Y.extend([sub_dir] * len(faces))
        return np.asarray(self.X), np.asarray(self.Y)
    
def get_embedding(face_img):
    inputs = feature_extractor(images=face_img, return_tensors="pt")
    outputs = model_vit(**inputs)
    return outputs.last_hidden_state[:, 0, :].detach().numpy().flatten()  # Flatten the embedding

# Initialize the Vision Transformer and its feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
model_vit = ViTModel.from_pretrained('google/vit-base-patch16-224')

# Load dataset
faceloading = FACELOADING("Dataset")
X, Y = faceloading.load_classes()

print(f"Number of images loaded: {len(X)}")
print(f"Number of labels: {len(Y)}")

# Process and embed faces using Vision Transformer
EMBEDDED_X = [get_embedding(img).flatten() for img in X]
EMBEDDED_X = np.asarray(EMBEDDED_X)

# Encode labels
encoder = LabelEncoder()
Y_enc = encoder.fit_transform(Y)

# Train test split
X_train, X_test, Y_train, Y_test = train_test_split(EMBEDDED_X, Y_enc, shuffle=True, random_state=17)
# Check shapes before training
print("Training data shape:", X_train.shape)
print("Test data shape:", X_test.shape)

# Train model
model = SVC(kernel='linear', probability=True)
model.fit(X_train, Y_train)

# Evaluate training accuracy
train_accuracy = model.score(X_train, Y_train)
print(f"Training Accuracy: {train_accuracy:.4f}")

# Evaluate test accuracy
test_accuracy = model.score(X_test, Y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Generate classification report
Y_pred = model.predict(X_test)
print("\nClassification Report:")
print(classification_report(Y_test, Y_pred, target_names=encoder.classes_))

# Generate confusion matrix
cm = confusion_matrix(Y_test, Y_pred)

plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=encoder.classes_, yticklabels=encoder.classes_)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# Define the function to plot learning curves
def plot_learning_curves(model, X, y, title='Learning Curves'):
    from sklearn.model_selection import learning_curve

    train_sizes, train_scores, test_scores = learning_curve(
        model, X, y, cv=5, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 10), scoring='accuracy')

    # Calculate mean and standard deviation for training set scores
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)

    # Calculate mean and standard deviation for test set scores
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.figure(figsize=(8, 6))
    plt.plot(train_sizes, train_mean, 'o-', color="r",  label="Training score")
    plt.plot(train_sizes, test_mean, 'o-', color="g", label="Cross-validation score")

    # Draw bands for standard deviation
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color="g")

    plt.title(title)
    plt.xlabel("Training Set Size")
    plt.ylabel("Accuracy Score")
    plt.legend(loc="best")
    plt.grid()
    plt.show()

# Plot learning curves using the defined function
plot_learning_curves(model, X_train, Y_train, title='Learning Curve for Face Recognition Model')


# Save model and encoder
# joblib.dump(model, 'trained_model/face_recognition_model.pkl')
# joblib.dump(encoder, 'trained_model/label_encoder.pkl')

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


No faces detected in Dataset/Kuldeep_Yadav/download (7).jpeg
No faces detected in Dataset/Jasprit_Bumrah/download (10).jpeg
No faces detected in Dataset/Ravindra_Jadeja/download.jpeg
No faces detected in Dataset/Ravindra_Jadeja/download (1).jpeg
Number of images loaded: 117
Number of labels: 117
Training data shape: (87, 768)
Test data shape: (30, 768)
Training Accuracy: 1.0000
Test Accuracy: 0.6667

Classification Report:
                 precision    recall  f1-score   support

 Arshdeep_Singh       1.00      1.00      1.00         5
     Axar_Patel       0.56      0.83      0.67         6
 Jasprit_Bumrah       0.75      0.50      0.60         6
  Kuldeep_Yadav       0.60      0.60      0.60         5
Ravindra_Jadeja       0.40      0.50      0.44         4
    Virat_Kohli       1.00      0.50      0.67         4

       accuracy                           0.67        30
      macro avg       0.72      0.66      0.66        30
   weighted avg       0.71      0.67      0.67        30



In [2]:
import cv2 as cv
import numpy as np
import os
import torch
import joblib
from transformers import ViTModel, ViTFeatureExtractor
from yolov5 import YOLOv5

# Ensure TensorFlow logging is controlled
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Load the Vision Transformer model for embeddings and the feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
model_vit = ViTModel.from_pretrained('google/vit-base-patch16-224')

# Load the SVM model and label encoder
model = joblib.load('trained_model/face_recognition_model.pkl')
encoder = joblib.load('trained_model/label_encoder.pkl')

# Initialize YOLOv5 for face detection
detector = YOLOv5('yolov5s.pt', device='cpu')

# Function to get embeddings using Vision Transformer
def get_embedding(face_img):
    inputs = feature_extractor(images=face_img, return_tensors="pt")
    outputs = model_vit(**inputs)
    return outputs.last_hidden_state[:, 0, :].detach().numpy().flatten()  # Flatten the embedding


# Select webcam
cap = cv.VideoCapture(0)

while cap.isOpened():
    _, frame = cap.read()
    rgb_img = cv.cvtColor(frame, cv.COLOR_BGR2RGB)

    # Detect faces in the frame
    results = detector.predict(rgb_img, size=640)
    for det in results.xyxy[0]:
        x1, y1, x2, y2 = map(int, det[:4])
        face = rgb_img[y1:y2, x1:x2]
        face = cv.resize(face, (224, 224))  # Resize to match ViT input

        # Generate embedding for the detected face
        embedding = get_embedding(face)
        embedding = np.expand_dims(embedding, axis=0)

        # Predict the identity of the face
        ypred = model.predict(embedding)
        if model.predict_proba(embedding).max() > 0.5:  # Confidence threshold
            final_name = encoder.inverse_transform(ypred)[0]
        else:
            final_name = "Unknown"

        # Display the identity and a bounding box around the face
        cv.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 255), 2)
        cv.putText(frame, str(final_name), (x1, y1-10), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv.LINE_AA)

    cv.imshow("Face Recognition", frame)
    if cv.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv.destroyAllWindows()

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 