In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob

from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import normalize
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold

from skimage.feature import local_binary_pattern # Local Binary Pattern function
from skimage import io

import cv2
from natsort import natsorted, ns
import os


In [None]:
# Create list of image names and corresponding gender classifications
image_dic = pd.read_excel('../Datasets/labels_A.xlsx')
image_dic = image_dic[['img_name.jpg', 'gender']] # Choose columns which are of importance
df = pd.DataFrame(image_dic)
df.to_excel('../Datasets/source_genders/labels_A1.xlsx',index=False)

In [None]:
# Separate male and female images and corresponding labels into folders

source_genders = pd.read_excel('../Datasets/source_genders/labels_A1.xlsx')
source_images_file_paths = glob.glob ("../Datasets/source_images/*.jpg") #find all paths which match the given path
source_images_file_paths = natsorted(source_images_file_paths) #sort the list of file names such that the image list will be in the correct order

male_images = []
female_images = []

male_directory = "../Datasets/sorted_sets_A1/male/"
female_directory = "../Datasets/sorted_sets_A1/female/"

for file_path in source_images_file_paths:
    image = cv2.imread(file_path, cv2.COLOR_RGB2BGR) #read the image
    image_name = os.path.basename(file_path)
    image_label = source_genders[source_genders['img_name.jpg']==image_name]['gender'].iloc[0]
    if(image_label == 1):
        male_images.append(image)
        directory = ''.join([male_directory,os.path.basename(image_name)])
        cv2.imwrite(directory, image)
    else:
        female_images.append(image)
        directory = ''.join([female_directory,os.path.basename(image_name)])
        cv2.imwrite(directory, image)


In [None]:
# Detecting faces and cropping

genders = ["male", "female"] #Define genders

faceDet = cv2.CascadeClassifier("../OpenCV_FaceCascade/haarcascade_frontalface_default.xml")
faceDet_two = cv2.CascadeClassifier("../OpenCV_FaceCascade/haarcascade_frontalface_alt2.xml")
faceDet_three = cv2.CascadeClassifier("../OpenCV_FaceCascade/haarcascade_frontalface_alt.xml")
faceDet_four = cv2.CascadeClassifier("../OpenCV_FaceCascade/haarcascade_frontalface_alt_tree.xml")

net = cv2.dnn.readNetFromCaffe("../deep-learning-face-detection/deploy.prototxt.txt",
                               "../deep-learning-face-detection/res10_300x300_ssd_iter_140000.caffemodel")
 
def DNN_Face_Detection(gender):
    files = glob.glob("../Datasets/sorted_sets_A1/%s/*.jpg" %gender) #Get list of all images with gender
    filenumber = 0
    for f in files:
        # load the input image and construct an input blob for the image
        # by resizing to a fixed 300x300 pixels and then normalizing it
        image = cv2.imread(f)
        (h, w) = image.shape[:2]
        
        BGR_ave = image.mean(axis=(0,1))
        blob = cv2.dnn.blobFromImage(cv2.resize(image, (300, 300)), 1.0, (300, 300), (BGR_ave[2], BGR_ave[1], BGR_ave[0]))
        #blob = cv2.dnn.blobFromImage(cv2.resize(image, (300, 300)), 1.0, (300, 300), (104.0, 177.0, 123.0))
        net.setInput(blob)
        detections = net.forward()

        actual_detections = detections[0][0]
        detection_accuracies = detections[0][0][:,2]

        index_values = np.argsort(detection_accuracies)
        index_values = index_values[::-1]

        sorted_detections = [actual_detections[i] for i in index_values]
        sorted_detections = np.asarray(sorted_detections)

        confidence = sorted_detections[0, 2]

        # filter out weak detections by ensuring the `confidence` is greater than the minimum confidence
        if confidence > 0.9:
            # compute the (x, y)-coordinates of the bounding box for the object
            box = sorted_detections[0, 3:7] * np.array([w, h, w, h])
            (startX, startY, endX, endY) = box.astype("int")
            cropped = image[startY:endY, startX:endX] #Cut the frame to size
            try:
                out = cv2.resize(cropped, (300, 300)) #Resize face so all images have same size
                cv2.imwrite("../Datasets/A1_dataset_DNN/%s/%s.jpg" %(gender, filenumber), out) #Write image - don' need to worry about keeping track of labels associated because already sorted
                filenumber += 1 #Increment image number
            except:
                pass #If error, pass file
    
def detect_face_DNN(gender):
    files = glob.glob("../Datasets/sorted_sets_A1/%s/*.jpg" %gender) #Get list of all images with gender
    filenumber = 0
    for f in files:
        # load the input image and construct an input blob for the image
        # by resizing to a fixed 300x300 pixels and then normalizing it
        image = cv2.imread(f)
        (h, w) = image.shape[:2]
        BGR_ave = image.mean(axis=(0,1))
        blob = cv2.dnn.blobFromImage(cv2.resize(image, (300, 300)), 1.0, (300, 300), (BGR_ave[2], BGR_ave[1], BGR_ave[0]))
        net.setInput(blob)
        detections = net.forward()
        print(detections.shape[2])
        # loop over the detections
        for i in range(0, detections.shape[2]):
            # extract the confidence (i.e., probability) associated with the
            # prediction
            confidence = detections[0, 0, i, 2]

            # filter out weak detections by ensuring the `confidence` is greater than the minimum confidence
            if confidence > 0.999:
                # compute the (x, y)-coordinates of the bounding box for the object
                box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
                (startX, startY, endX, endY) = box.astype("int")
                cropped = image[startY:endY, startX:endX] #Cut the frame to size
                try:
                    out = cv2.resize(cropped, (300, 300)) #Resize face so all images have same size
                    cv2.imwrite("../Datasets/A1_dataset_DNN/%s/%s.jpg" %(gender, filenumber), out) #Write image - don' need to worry about keeping track of labels associated because already sorted
                    filenumber += 1 #Increment image number
                except:
                    pass #If error, pass file


def detect_faces(gender):
    files = glob.glob("../Datasets/sorted_sets_A1/%s/*.jpg" %gender) #Get list of all images with gender
    filenumber = 0
    for f in files:
        frame = cv2.imread(f) #Open image
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) #Convert image to grayscale
        #Detect face using 4 different classifiers
        face = faceDet.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=10, minSize=(5, 5), flags=cv2.CASCADE_SCALE_IMAGE)
        face_two = faceDet_two.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=10, minSize=(5, 5), flags=cv2.CASCADE_SCALE_IMAGE)
        face_three = faceDet_three.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=10, minSize=(5, 5), flags=cv2.CASCADE_SCALE_IMAGE)
        face_four = faceDet_four.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=10, minSize=(5, 5), flags=cv2.CASCADE_SCALE_IMAGE)
        #Go over detected faces, stop at first detected face, return empty if no face.
        if len(face) == 1:
            facefeatures = face
        elif len(face_two) == 1:
            facefeatures = face_two
        elif len(face_three) == 1:
            facefeatures = face_three
        elif len(face_four) == 1:
            facefeatures = face_four
        else:
            facefeatures = ""
        #Cut and save face
        for (x, y, w, h) in facefeatures: #get coordinates and size of rectangle containing face
            #print ("face found in file: %s" %f)
            gray = gray[y:y+h, x:x+w] #Cut the frame to size
            try:
                out = cv2.resize(gray, (350, 350)) #Resize face so all images have same size
                cv2.imwrite("../Datasets/A1_dataset/%s/%s.jpg" %(gender, filenumber), out) #Write image - don' need to worry about keeping track of labels associated because already sorted
            except:
                pass #If error, pass file
        filenumber += 1 #Increment image number
for gender in genders:
    DNN_Face_Detection(gender) #Call function

In [24]:
# gender detection using LBPH  
#{'C': 10.0, 'decision_function_shape': 'OVO', 'gamma': 0.1, 'kernel': 'rbf'}

genders = ["male", "female"] #Define genders

c= 1
gamma = 0.1
tol = 0.1
method = 'OVO'

clf = SVC(kernel='rbf',C = c , gamma = gamma,decision_function_shape = method, tol = tol)

radius = 1    
no_neighbours = 8
LBPH = cv2.face.LBPHFaceRecognizer_create(radius, no_neighbours)

def get_files(gender): #Define function to get file list, randomly shuffle it and split 80/20
    files = glob.glob("../Datasets/A1_dataset_DNN/%s/*.jpg" %gender)
    return files

def get_features(image, r = 3):
    M = 25
    N = 25
    
    tiles = [image[x:x+M,y:y+N] for x in range(0,image.shape[0],M) for y in range(0,image.shape[1],N)]
    radius = r    
    no_points = 8*radius  # Number of points to be considered as neighbourers
    eps=1e-7    
    global_hist_norm = []
    for tile in tiles:
        lbp = local_binary_pattern(tile, no_points, radius, method='uniform') # Uniform LBP is used
        (hist, _) = np.histogram(lbp.ravel(), bins=np.arange(0, no_points + 2), range=(0, no_points + 2))
        hist = hist.astype("float")
        hist /= (hist.sum() + eps)
        hist_norm = hist
        global_hist_norm.append(hist_norm)
    s = np.asarray(global_hist_norm).shape
    global_hist_norm = np.reshape(global_hist_norm,(s[0]*s[1]))
    return global_hist_norm

def SVM_paramter_tuning(X_train, y_train, X_test, y_test):
    
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [0.001, 0.01, 0.1, 1,10,'scale'],
                        'C': [0.001, 0.01, 0.1, 1, 10,100,1000],
                        'decision_function_shape':['OVO']}]    
  
    scores = ['precision', 'recall']

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()
        clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='%s_macro' % score)
        clf.fit(X_train, y_train)
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()
        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print()

def make_sets(r):
    image_inputs = []
    image_labels = []
    for gender in genders:
        image_files = get_files(gender)
        #Append data and generate labels 0-1
        for item in image_files:
            image = cv2.imread(item) #open image
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) #convert to grayscale
            features = get_features(gray,r)
            image_inputs.append(features) #append image array to training data list
            image_labels.append(genders.index(gender))
            
    training_data, prediction_data, training_labels, prediction_labels = train_test_split(image_inputs,image_labels)
    print(len(training_data))
    print(len(training_labels))
    print(len(prediction_data))
    print(len(prediction_labels))
    return training_data, training_labels, prediction_data, prediction_labels,image_inputs,image_labels

def train_SVM_model(xTrain, yTrain):
    print ("training SVM classifier")
    print ("size of training set is:", len(yTrain), "images")
    clf.fit(xTrain, yTrain)

def run_SVM_classifier(xTest,yTest):
    yPredict = clf.predict(xTest)
    return accuracy_score(yTest,yPredict)

In [None]:
# Parameter Tuning
training_data_SVM, training_labels_SVM, prediction_data_SVM, prediction_labels_SVM,image_inputs,image_labels = make_sets(3)
SVM_paramter_tuning(training_data_SVM, training_labels_SVM, prediction_data_SVM, prediction_labels_SVM)


In [25]:
# Main

training_data_SVM, training_labels_SVM, prediction_data_SVM, prediction_labels_SVM,image_inputs,image_labels = make_sets(1)
train_SVM_model(training_data_SVM, training_labels_SVM)
correct_SVM = run_SVM_classifier(prediction_data_SVM, prediction_labels_SVM)

print ("SVM got", correct_SVM, "percent correct!")


3738
3738
1247
1247
training SVM classifier
size of training set is: 3738 images
SVM got 0.8917401764234162 percent correct!


In [26]:
print(np.asarray(training_data_SVM).shape)
print(len(training_data_SVM))


(3738, 1296)
3738


In [None]:
#LBP parameter tuning
acc1 = 0
acc2 = 0
r = 1
while (True):
    training_data_SVM, training_labels_SVM, prediction_data_SVM, prediction_labels_SVM = make_sets(r)
    train_SVM_model(training_data_SVM, training_labels_SVM)
    acc1 = run_SVM_classifier(prediction_data_SVM, prediction_labels_SVM)
    print(acc1)
    print(r)
    if(acc1 > acc2):
        acc2 = acc1
        r +=1
    else:
        break



In [None]:
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 10)):
    """
    Generate 3 plots: the test and training learning curve, the training
    samples vs fit times curve, the fit times vs score curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    axes : array of 3 axes, optional (default=None)
        Axes to use for plotting the curves.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 5-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, optional (default=None)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as a
        fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e. it has to be within (0, 1].
        Otherwise it is interpreted as absolute sizes of the training sets.
        Note that for classification the number of samples usually have to
        be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))
    """
    plt.title(title)
    plt.ylim(ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes)
    
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    print(train_sizes)
    # Plot learning curve
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    plt.legend(loc="best")

    return plt


X, y = training_data_SVM, training_labels_SVM

print(X[:10])
print(y)

title = "Learning Curves (SVM, RBF kernel, $\gamma=0.01, C = 10, decision function shape = OVO$)"

# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
# SVC is more expensive so we do a lower number of CV iterations:
kf = KFold(n_splits=10)

c= 10
gamma = 0.1
tol = 0.1
method = 'OVO'

estimator = SVC(kernel='rbf',C = c , gamma = gamma,decision_function_shape = method, tol = tol)
plt = plot_learning_curve(estimator, title, X, y, ylim=(0.75, 1.01),
                    cv=kf, n_jobs=4)
plt.savefig('../plots_A1/10_learningcurve.png')
plt.show()


In [18]:
# Test Model Using Test Set 

def sort_data(testlabels_dir): 
    # Create list of image names and corresponding gender classifications
    image_dic = pd.read_excel(testlabels_dir)
    print(image_dic)
    image_dic = image_dic[['img_name', 'gender']] # Choose columns which are of importance
    df = pd.DataFrame(image_dic)
    df.to_excel('../Datasets/test_source_genders_A1/test_labels_A1.xlsx',index=False)
    
    # Separate male and female images and corresponding labels into folders

    source_genders = pd.read_excel('../Datasets/test_source_genders_A1/test_labels_A1.xlsx')
    source_images_file_paths = glob.glob ("../Datasets/test_source_images_A1/*.jpg") #find all paths which match the given path
    source_images_file_paths = natsorted(source_images_file_paths) #sort the list of file names such that the image list will be in the correct order

    male_images = []
    female_images = []

    male_directory = "../Datasets/test_sorted_sets_A1/male/"
    female_directory = "../Datasets/test_sorted_sets_A1/female/"

    for file_path in source_images_file_paths:
        image = cv2.imread(file_path, cv2.COLOR_RGB2BGR) #read the image
        image_name = os.path.basename(file_path)
        image_label = source_genders[source_genders['img_name']==image_name]['gender'].iloc[0]
        if(image_label == 1):
            male_images.append(image)
            directory = ''.join([male_directory,os.path.basename(image_name)])
            cv2.imwrite(directory, image)
        else:
            female_images.append(image)
            directory = ''.join([female_directory,os.path.basename(image_name)])
            cv2.imwrite(directory, image)

            
def test_DNN_Face_Detection(gender):
        net = cv2.dnn.readNetFromCaffe("../deep-learning-face-detection/deploy.prototxt.txt",
                                   "../deep-learning-face-detection/res10_300x300_ssd_iter_140000.caffemodel")
        files = glob.glob("../Datasets/test_sorted_sets_A1/%s/*.jpg" %gender) #Get list of all images with gender
        filenumber = 0
        for f in files:
            # load the input image and construct an input blob for the image
            # by resizing to a fixed 300x300 pixels and then normalizing it
            image = cv2.imread(f)
            (h, w) = image.shape[:2]

            BGR_ave = image.mean(axis=(0,1))
            blob = cv2.dnn.blobFromImage(cv2.resize(image, (300, 300)), 1.0, (300, 300), (BGR_ave[2], BGR_ave[1], BGR_ave[0]))
            net.setInput(blob)
            detections = net.forward()

            actual_detections = detections[0][0]
            detection_accuracies = detections[0][0][:,2]

            index_values = np.argsort(detection_accuracies)
            index_values = index_values[::-1]

            sorted_detections = [actual_detections[i] for i in index_values]
            sorted_detections = np.asarray(sorted_detections)

            confidence = sorted_detections[0, 2]

            # filter out weak detections by ensuring the `confidence` is greater than the minimum confidence
            if confidence > 0.9:
                # compute the (x, y)-coordinates of the bounding box for the object
                box = sorted_detections[0, 3:7] * np.array([w, h, w, h])
                (startX, startY, endX, endY) = box.astype("int")
                cropped = image[startY:endY, startX:endX] #Cut the frame to size
                try:
                    out = cv2.resize(cropped, (300, 300)) #Resize face so all images have same size
                    cv2.imwrite("../Datasets/test_A1_dataset/%s/%s.jpg" %(gender, filenumber), out) #Write image - don' need to worry about keeping track of labels associated because already sorted
                    filenumber += 1 #Increment image number
                except:
                    print("oops")
                    pass #If error, pass file            
            
def face_detecion():
    genders = ["male", "female"] #Define genders
    for gender in genders:
        test_DNN_Face_Detection(gender)    

                
def make_test_sets(r):
    image_inputs = []
    image_labels = []
    for gender in genders:
        image_files = get_test_files(gender)
        #Append data and generate labels 0-1
        for item in image_files:
            image = cv2.imread(item) #open image
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) #convert to grayscale
            features = get_test_features(gray,r)
            image_inputs.append(features) #append image array to training data list
            image_labels.append(genders.index(gender))
            
    training_data, prediction_data, training_labels, prediction_labels = train_test_split(image_inputs,image_labels)
    print(len(training_data))
    print(len(training_labels))
    print(len(prediction_data))
    print(len(prediction_labels))
    return training_data, training_labels, prediction_data, prediction_labels,image_inputs,image_labels


def get_test_files(gender): #Define function to get file list, randomly shuffle it and split 80/20
    files = glob.glob("../Datasets/test_A1_dataset/%s/*.jpg" %gender)
    return files

def get_test_features(image, r = 3):
    M = 25
    N = 25
    tiles = [image[x:x+M,y:y+N] for x in range(0,image.shape[0],M) for y in range(0,image.shape[1],N)]
    radius = r    
    no_points = 8*radius  # Number of points to be considered as neighbourers
    eps=1e-7    
    global_hist_norm = []
    for tile in tiles:
        lbp = local_binary_pattern(tile, no_points, radius, method='uniform') # Uniform LBP is used
        (hist, _) = np.histogram(lbp.ravel(), bins=np.arange(0, no_points + 2), range=(0, no_points + 2))
        hist = hist.astype("float")
        hist /= (hist.sum() + eps)
        hist_norm = hist
        global_hist_norm.append(hist_norm)
    s = np.asarray(global_hist_norm).shape
    global_hist_norm = np.reshape(global_hist_norm,(s[0]*s[1]))
    return global_hist_norm


#Main

test_dir = '../Datasets/test_labels_A.xlsx'
sort_data(test_dir)
face_detecion()
_, _, _, _, test_image_inputs, test_image_labels = make_test_sets(1)


     Unnamed: 0 img_name  gender  smiling
0             0    0.jpg      -1       -1
1             1    1.jpg      -1        1
2             2    2.jpg       1        1
3             3    3.jpg       1        1
4             4    4.jpg      -1       -1
..          ...      ...     ...      ...
995         995  995.jpg       1        1
996         996  996.jpg       1        1
997         997  997.jpg       1        1
998         998  998.jpg       1        1
999         999  999.jpg       1        1

[1000 rows x 4 columns]
747
747
249
249


In [19]:
def run_SVM_classifier(xTest,yTest):
    yPredict = clf.predict(xTest)
    return accuracy_score(yTest,yPredict)

accuracy = run_SVM_classifier(test_image_inputs,test_image_labels)
print(accuracy)

0.8504016064257028
