#Final Project on MBA 

#Gender Detection on Celeb Faces 


##Stage 1: Import the libraries


In [None]:
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from keras.utils import np_utils
from keras.models import Sequential, Model
from keras.layers import Conv2D, BatchNormalization, MaxPooling2D, Dropout, Flatten, Dense, Activation, GlobalAveragePooling2D
from keras import backend as K
from tensorflow.keras.optimizers import Adam, SGD
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import f1_score
from keras.applications.inception_v3 import InceptionV3, preprocess_input

## Stage 2: Import dataset

In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [None]:
! kaggle datasets download jessicali9530/celeba-dataset
! unzip celeba-dataset.zip

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.7/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.7/dist-packages/kaggle/api/kaggle_api_extended.py", line 166, in authenticate
    self.config_file, self.config_dir))
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.
unzip:  cannot find or open celeba-dataset.zip, celeba-dataset.zip.zip or celeba-dataset.zip.ZIP.


###Stage2i: Unzip the dataset imported

In [None]:

! unzip celeba-dataset.zip

unzip:  cannot find or open celeba-dataset.zip, celeba-dataset.zip.zip or celeba-dataset.zip.ZIP.


##Stage 3: Data Exploration

### 3i: Set variables and replace all -1 attribute values with 0


In [None]:
# --- Dataset locations (relative to the working directory) ---
DATA_PATH = 'img_align_celeba/img_align_celeba/'
ATTRIBUTE_PATH = 'list_attr_celeba.csv'
PARTITION_PATH = 'list_eval_partition.csv'
EXAMPLE_PIC = f'{DATA_PATH}000001.jpg'

# --- Image geometry; the model input is (height, width, channels) ---
IMG_WIDTH, IMG_HEIGHT = 178, 218
INPUT_SHAPE = (IMG_HEIGHT, IMG_WIDTH, 3)

# --- Number of images drawn for each split ---
TRAINING_SAMPLES = 10000
VALIDATION_SAMPLES, TEST_SAMPLES = 1000, 1000

# --- Training hyper-parameters ---
BATCH_SIZE = 16
NUM_EPOCHS = 20

In [None]:
# list_attr_celeba.csv
# Read the attribute table from ATTRIBUTE_PATH, using the 'image_id' column
# as the row index.
attr_df = pd.read_csv(ATTRIBUTE_PATH, index_col='image_id')
# Preview the first rows as the cell output.
attr_df.head()

ValueError: ignored

In [None]:
# Recode every -1 entry of the attribute table as 0.
attr_df = attr_df.replace(-1, 0)
attr_df.head()

### 3ii: Print out the attributes in the dataset

In [None]:
# Attributes
# List every attribute column with its zero-padded position.
print('Attributes:')
for position, column_name in enumerate(attr_df.columns):
    print(f'    {position:02d}: {column_name}')

In [None]:
# Gender distribution
# Horizontal bar chart of how many rows carry each value of 'Male'.
sns.countplot(data=attr_df, y='Male')
plt.title('Gender Distribution')
plt.show()

In [None]:
# Plotting an example
# Load the example picture referenced by EXAMPLE_PIC.
img = load_img(EXAMPLE_PIC)

# Show it inline as a visual check of the dataset.
plt.imshow(img)
plt.show()

#Stage 4: Training, Testing and Validation

#### list_eval_partition.csv

In [None]:
# Read the partition table from PARTITION_PATH, using the 'image_id' column
# as the row index.
parti_df = pd.read_csv(PARTITION_PATH, index_col='image_id')
# Preview the first rows as the cell output.
parti_df.head()

#### Partitions

In [None]:
# Count the occurrences of each distinct row of parti_df.
parti_df.value_counts()

#### Joining csv

In [None]:
# Pair each image's 'Male' label with its partition row, joined on the index.
gender_labels = attr_df[['Male']]
sampling_df = gender_labels.join(parti_df)
sampling_df.head()

### Sampling of the dataset

In [None]:
def load_reshape_img(filename):
    """Load an image file as a scaled array with a leading batch axis.

    The pixel array is divided by 255 and reshaped to ``(1,) + shape``,
    where ``shape`` is the shape of the converted image array.
    """
    pixels = img_to_array(load_img(filename)) / 255
    return pixels.reshape((1, *pixels.shape))
    

def sampling(partition, sample_size, sampling_df):
    """Draw a gender-balanced random sample of images from one partition.

    Parameters
    ----------
    partition : int
        Value of the ``partition`` column to sample from.
    sample_size : int
        Total number of images; ``sample_size // 2`` rows are drawn with
        ``Male == 1`` and the same number with ``Male == 0``.
    sampling_df : pandas.DataFrame
        Frame indexed by image file name with ``Male`` and ``partition``
        columns.

    Returns
    -------
    tuple
        ``(x, y)``. For ``partition != 2``, ``x`` is an array of shape
        ``(n, IMG_HEIGHT, IMG_WIDTH, 3)`` with pixels divided by 255 and
        ``y`` is the two-column one-hot encoding of ``Male``.
        For ``partition == 2``, ``x`` is a list of float32 arrays of shape
        ``(1, IMG_HEIGHT, IMG_WIDTH, 3)`` and ``y`` is a list of the raw
        ``Male`` values.

    Raises
    ------
    FileNotFoundError
        If a partition-2 image cannot be read from disk.
    """
    in_partition = sampling_df['partition'] == partition
    is_male = sampling_df['Male'] == 1
    is_female = sampling_df['Male'] == 0
    half = sample_size // 2
    sampled_df = pd.concat([
        sampling_df[in_partition & is_male].sample(half),
        sampling_df[in_partition & is_female].sample(half),
    ])

    if partition != 2:
        x = np.array([load_reshape_img(DATA_PATH + filename) for filename in sampled_df.index])
        # Drop the per-image batch axis added by load_reshape_img.
        x = x.reshape(x.shape[0], IMG_HEIGHT, IMG_WIDTH, 3)
        y = np_utils.to_categorical(sampled_df['Male'], 2)
    else:
        x = []
        y = []
        for index, target in sampled_df.iterrows():
            path = DATA_PATH + index
            im = cv2.imread(path)  # BGR array, or None when the file cannot be read
            if im is None:
                raise FileNotFoundError('Could not read image: {}'.format(path))
            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)  # RGB, consistent with load_img
            im = cv2.resize(im, (IMG_WIDTH, IMG_HEIGHT)).astype(np.float32) / 255.0
            x.append(np.expand_dims(im, axis=0))
            y.append(target['Male'])

    return x, y

#### Splitting of the dataset

In [None]:

# Partition ids 0, 1 and 2 supply the training, validation and test samples.
# sampling() returns arrays for partitions 0 and 1, but Python lists of
# single-image batches and raw labels for partition 2.
# NOTE(review): sampling() draws without a fixed random seed, so each run
# selects different images.
x_train, y_train = sampling(0, TRAINING_SAMPLES, sampling_df)
x_valid, y_valid = sampling(1, VALIDATION_SAMPLES, sampling_df)
x_test, y_test = sampling(2, TEST_SAMPLES, sampling_df)

## Data augmentation to reduce overfitting

In [None]:
# Random geometric augmentation applied to the training images only.
train_datagen = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
)

# train_datagen.fit(...) is intentionally not called: it only computes the
# statistics used by featurewise_center, featurewise_std_normalization and
# zca_whitening, none of which are enabled above.

# Yields augmented (images, labels) batches of BATCH_SIZE from the training sample.
train_generator = train_datagen.flow(x_train, y_train, batch_size=BATCH_SIZE)

#Stage 5: Modelling


##Build a Model

In [None]:
# Four convolution blocks followed by a small dense classifier with two
# softmax outputs.
model = Sequential([
    # Convolution block 1
    Conv2D(16, (3, 3), input_shape=INPUT_SHAPE, activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    # Convolution block 2
    Conv2D(32, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    # Convolution block 3 (with dropout)
    Conv2D(64, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(rate=0.5),

    # Convolution block 4 (with dropout)
    Conv2D(64, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(rate=0.5),

    Flatten(),

    # Dense head
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(rate=0.5),

    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(rate=0.5),

    # Two-way output
    Dense(2),
    Activation('softmax'),
])

model.summary()

##Compile the Model

In [None]:
# Adam optimizer with explicitly spelled-out hyper-parameters.
adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

# Categorical cross-entropy matches the two-column one-hot labels.
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

In [None]:
# Write the model to 'model.h5' during training; with save_best_only=True the
# file is only overwritten when the monitored quantity improves (Keras
# default monitor: val_loss). The evaluation cell reloads this file.
checkpointer = ModelCheckpoint(filepath='model.h5', verbose=1, save_best_only=True)

##Fit the Model

In [None]:
# Train on the augmented generator and validate on the fixed validation sample.
hist = model.fit(
    train_generator,
    validation_data=(x_valid, y_valid),
    # Integer division: steps_per_epoch must be a whole number of batches.
    steps_per_epoch=TRAINING_SAMPLES // BATCH_SIZE,
    # Use the notebook-level constant instead of a second hard-coded 20.
    epochs=NUM_EPOCHS,
    callbacks=[checkpointer],
    verbose=1,
)

#### Plot loss function value using epochs

In [None]:
# Training and validation loss per epoch on one wide figure.
plt.figure(figsize=(18, 4))
for history_key, legend_label in (('loss', 'train'), ('val_loss', 'valid')):
    plt.plot(hist.history[history_key], label=legend_label)
plt.title('Loss Function')
plt.legend()
plt.show()

#### Plot accuracy using epochs

In [None]:
# Training and validation accuracy per epoch on one wide figure.
plt.figure(figsize=(18, 4))
for history_key, legend_label in (('accuracy', 'train'), ('val_accuracy', 'valid')):
    plt.plot(hist.history[history_key], label=legend_label)
plt.title('Accuracy')
plt.legend()
plt.show()

##Model Evaluation 

In [None]:
# Restore the checkpoint written during training.
model.load_weights('model.h5')

# x_test is a list of single-image batches; stack them so the model is called
# once instead of once per image.
test_batch = np.concatenate(x_test, axis=0)
class_scores = model.predict(test_batch, batch_size=BATCH_SIZE)
predictions = np.argmax(class_scores, axis=1)

true_labels = np.asarray(y_test)
# Accuracy is expressed as a percentage.
accuracy = 100 * np.mean(predictions == true_labels)
f1 = f1_score(true_labels, predictions)

print('Accuracy: {:.4f}'.format(accuracy))
print('F1 score: {:.4f}'.format(f1))

# Dictionary mapping the class index to a gender label

In [None]:
# Display name for each integer class label.
gender_target = {
    0: 'Female',
    1: 'Male',
}

def img_to_display(filename):
    """Return a base64-encoded JPEG thumbnail of an image file.

    The image is shrunk in place to fit within 200x200 pixels, encoded as
    JPEG in memory and returned as a base64 text string that can be embedded
    in an HTML ``data:`` URI.
    """
    # inspired on this kernel:
    # https://www.kaggle.com/stassl/displaying-inline-images-in-pandas-dataframe
    # credits to stassl :)

    # The context manager closes the underlying file once encoding is done.
    with Image.open(filename) as picture:
        picture.thumbnail((200, 200), Image.LANCZOS)

        with BytesIO() as buffer:
            picture.save(buffer, 'jpeg')
            return base64.b64encode(buffer.getvalue()).decode()
    

def display_result(filename, prediction, target):
    """Render one prediction as an HTML card in the notebook output.

    Parameters
    ----------
    filename : str
        Path of the image to show as a thumbnail.
    prediction : sequence of float
        Two class scores; index 1 is the 'Male' score. The card shows
        'Female' when that score is <= 0.5 and 'Male' otherwise.
    target : int
        True label, used as a key into ``gender_target``.
    """
    if prediction[1] <= 0.5:
        gender = 'Female'
        gender_icon = "https://png.pngtree.com/png-clipart/20190614/original/pngtree-female-avatar-vector-icon-png-image_3725439.jpg"
    else:
        gender = 'Male'
        gender_icon = "https://png.pngtree.com/png-clipart/20190705/original/pngtree-man-avatar-icon-professional-man-character-png-image_4356027.jpg"

    # Score of the winning class, shown as a percentage.
    confidence = "{0:.2f}%".format(round(max(prediction)*100,2))

    # The template has exactly five placeholders; the file name that used to
    # be passed as a sixth argument was never rendered and has been dropped.
    display_html = '''
    <div style="overflow: auto;  border: 2px solid #D8D8D8;
        padding: 5px; width: 480px;" >
        <img src="data:image/jpeg;base64,{}" style="float: left;" width="200" height="200">
        <div style="padding: 10px 0px 0px 20px; overflow: auto;">
            <img src="{}" style="float: left;" width="40" height="40">
            <h3 style="margin-left: 50px; margin-top: 2px;">Prediction: {}</h3>
            <p style="margin-left: 50px; margin-top: -6px; font-size: 12px">{} probability</p>
            <p style="margin-left: 50px; margin-top: -16px; font-size: 12px">Real gender: {}</p>
        </div>
    </div>
    '''.format(
        img_to_display(filename),
        gender_icon,
        gender,
        confidence,
        gender_target[target],
    )

    display(HTML(display_html))

In [None]:
def gender_prediction(filename):
    """Run the trained model on a single image file.

    The image is read with OpenCV, converted from BGR to RGB, resized to
    ``(IMG_WIDTH, IMG_HEIGHT)``, scaled by 1/255 and given a leading batch
    axis before being passed to ``model.predict``.

    Returns the raw ``model.predict`` output for that one-image batch.

    Raises
    ------
    FileNotFoundError
        If the image cannot be read from disk.
    """
    im = cv2.imread(filename)  # BGR array, or None when the file cannot be read
    if im is None:
        raise FileNotFoundError('Could not read image: {}'.format(filename))

    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    # Same geometry and scaling as the test images prepared in sampling().
    im = cv2.resize(im, (IMG_WIDTH, IMG_HEIGHT)).astype(np.float32) / 255.0
    im = np.expand_dims(im, axis=0)

    return model.predict(im)

##Display prediction result

In [None]:
from IPython.core.display import display, HTML
from PIL import Image
from io import BytesIO
import base64

plt.style.use('ggplot')

%matplotlib inline
#select random images of the test partition
df_to_test = sampling_df[(sampling_df['partition'] == 2)].sample(10)

for index, target in df_to_test.iterrows():
    result = gender_prediction(DATA_PATH + index)
    
    #display result
    display_result(DATA_PATH + index, result[0], target['Male'])

# **Report Analysis**

#Image identification is one of Machine Learning's many uses; it may help with security, item detection, face detection, healthcare, and entertainment, among other things. Because this application has such a large potential to benefit our society, it's critical to identify new applications for it, improve present approaches, and gain more accurate and meaningful insights from it.

##Data Description
1.   202,599 face images of various celebrities
2.   10,177 unique identities, but names of identities are not given
3.   40 binary attribute annotations per image




# In this study, we used a CNN-based machine learning model to determine whether a celebrity is male or female based on the given dataset. The dataset is also useful for recognising other facial attributes, such as dark or wavy hair or whether a person is smiling.

##Accuracy Result

# On the training dataset, we ran 20 epochs and reached a best gender-prediction accuracy of 89.6%. On the validation dataset, accuracy peaked at 95.1% in the 19th epoch and then dropped to 92.6% in the 20th epoch.


#The model performed well on the training dataset in general, with the accuracy rising and the loss falling at each epoch. On the training dataset, higher accuracy went together with lower loss.

#On the validation set, accuracy started at about 80% and was unstable until the 11th epoch, after which it stayed above 90%. The validation loss also fluctuated, ranging from about 0.83 down to about 0.197.
