# Dependencies

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!pip install -q kaggle

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from PIL import Image

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Activation, Dropout, Flatten, Dense, Dropout, LayerNormalization
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import img_to_array, load_img
from tensorflow import keras
%load_ext tensorboard

import os

# Data Loading

The dataset is represented by 5 tables, each representing a disjuntive subset of the dataset

For our purpose we won't need this partition of the data, so we'll just merge them in a single table representing the dataset as a whole

In [4]:
def create_kaggle_dir(kaggle_dir="/root/.kaggle/"):
    if not os.path.exists(kaggle_dir):
        !mkdir $kaggle_dir

def retrieve_kaggle_json(source_dir="/content/drive/MyDrive/", kaggle_dir="/root/.kaggle/", file_name="kaggle.json"):
    source_file = os.path.join(source_dir, file_name)
    target_file = os.path.join(kaggle_dir, file_name)

    if not os.path.exists(target_file):
        !cp $source_file $kaggle_dir
        !chmod 600 $target_file

def download_adience_dataset():
    if not os.path.exists("/content/adience-benchmark-gender-and-age-classification.zip"):
        !kaggle datasets download -d ttungl/adience-benchmark-gender-and-age-classification

def unzip_adience_dataset():
    if not os.path.exists("/content/adience-benchmark-gender-and-age-classification"):
        !unzip adience-benchmark-gender-and-age-classification.zip -d adience-benchmark-gender-and-age-classification > /dev/null 2>&1

def dataset_pipeline():
    create_kaggle_dir()
    retrieve_kaggle_json()
    download_adience_dataset()
    unzip_adience_dataset()

dataset_pipeline()

Downloading adience-benchmark-gender-and-age-classification.zip to /content
100% 1.38G/1.39G [00:12<00:00, 122MB/s]
100% 1.39G/1.39G [00:12<00:00, 122MB/s]


In [5]:
fold_0=pd.read_table('/content/adience-benchmark-gender-and-age-classification/AdienceBenchmarkGenderAndAgeClassification/fold_0_data.txt')
fold_1=pd.read_table('/content/adience-benchmark-gender-and-age-classification/AdienceBenchmarkGenderAndAgeClassification/fold_1_data.txt')
fold_2=pd.read_table('/content/adience-benchmark-gender-and-age-classification/AdienceBenchmarkGenderAndAgeClassification/fold_2_data.txt')
fold_3=pd.read_table('/content/adience-benchmark-gender-and-age-classification/AdienceBenchmarkGenderAndAgeClassification/fold_3_data.txt')
fold_4=pd.read_table('/content/adience-benchmark-gender-and-age-classification/AdienceBenchmarkGenderAndAgeClassification/fold_4_data.txt')

total_data = pd.concat([fold_0, fold_1, fold_2, fold_3, fold_4], ignore_index=True)
print(fold_0.shape)
print(total_data.shape)
print()
print(total_data.info())

(4484, 12)
(19370, 12)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19370 entries, 0 to 19369
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   user_id             19370 non-null  object
 1   original_image      19370 non-null  object
 2   face_id             19370 non-null  int64 
 3   age                 19370 non-null  object
 4   gender              18591 non-null  object
 5   x                   19370 non-null  int64 
 6   y                   19370 non-null  int64 
 7   dx                  19370 non-null  int64 
 8   dy                  19370 non-null  int64 
 9   tilt_ang            19370 non-null  int64 
 10  fiducial_yaw_angle  19370 non-null  int64 
 11  fiducial_score      19370 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 1.8+ MB
None


In [6]:
total_data.head()

Unnamed: 0,user_id,original_image,face_id,age,gender,x,y,dx,dy,tilt_ang,fiducial_yaw_angle,fiducial_score
0,30601258@N03,10399646885_67c7d20df9_o.jpg,1,"(25, 32)",f,0,414,1086,1383,-115,30,17
1,30601258@N03,10424815813_e94629b1ec_o.jpg,2,"(25, 32)",m,301,105,640,641,0,0,94
2,30601258@N03,10437979845_5985be4b26_o.jpg,1,"(25, 32)",f,2395,876,771,771,175,-30,74
3,30601258@N03,10437979845_5985be4b26_o.jpg,3,"(25, 32)",m,752,1255,484,485,180,0,47
4,30601258@N03,11816644924_075c3d8d59_o.jpg,2,"(25, 32)",m,175,80,769,768,-75,0,34


# Data Processing

In [None]:
df = total_data[['age', 'gender', 'x', 'y', 'dx', 'dy']].copy()
df.info()

# Adding Path of each Image
img_path = []
for row in total_data.iterrows():
    path = "/content/adience-benchmark-gender-and-age-classification/AdienceBenchmarkGenderAndAgeClassification/faces/"+row[1].user_id+"/coarse_tilt_aligned_face."+str(row[1].face_id)+"."+row[1].original_image
    img_path.append(path)
df['img_path'] = img_path

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19370 entries, 0 to 19369
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   age     19370 non-null  object
 1   gender  18591 non-null  object
 2   x       19370 non-null  int64 
 3   y       19370 non-null  int64 
 4   dx      19370 non-null  int64 
 5   dy      19370 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 908.1+ KB


In [None]:
df.head()

Unnamed: 0,age,gender,x,y,dx,dy,img_path
0,"(25, 32)",f,0,414,1086,1383,/content/adience-benchmark-gender-and-age-clas...
1,"(25, 32)",m,301,105,640,641,/content/adience-benchmark-gender-and-age-clas...
2,"(25, 32)",f,2395,876,771,771,/content/adience-benchmark-gender-and-age-clas...
3,"(25, 32)",m,752,1255,484,485,/content/adience-benchmark-gender-and-age-clas...
4,"(25, 32)",m,175,80,769,768,/content/adience-benchmark-gender-and-age-clas...


In [None]:
# How many unique Age are there
df.age.unique()

array(['(25, 32)', '(38, 43)', '(4, 6)', '(60, 100)', '(15, 20)',
       '(48, 53)', '(8, 12)', '(0, 2)', 'None', '(38, 48)', '35', '3',
       '55', '58', '22', '13', '45', '36', '23', '(38, 42)', '(8, 23)',
       '(27, 32)', '57', '56', '2', '29', '34', '42', '46', '32'],
      dtype=object)

In [None]:
# Mapping Ages to bring All the Ages into there respective Range
age_mapping = [('(0, 2)', '0-3'), ('2', '0-3'), ('3', '0-3'), 
               ('(4, 6)', '4-7'), 
               ('(8, 12)', '8-14'), ('13', '8-14'), 
               ('22', '15-24'), ('(8, 23)','15-24'), ('(15, 20)', '15-24'), ('23', '15-24'),
               ('(25, 32)', '25-34'), ('(27, 32)', '25-34'), ('32', '25-34'), ('34', '25-34'), ('29', '25-34'), 
               ('(38, 42)', '35-44'), ('35', '35-44'), ('36', '35-44'), ('(38, 43)', '35-44'), ('(38, 42)', '35-44'), ('42', '35-44'),('(38, 48)', '45-54'),
               ('45', '45-54'), ('46', '45-54'), ('(48, 53)', '45-54'), 
               ('55', '55+'), ('56', '55+'), ('57', '55+'), ('58', '55+'), ('(60, 100)', '55+')]
age_mapping_dict = {each[0]: each[1] for each in age_mapping}
drop_labels = []
for idx, each in enumerate(df.age):
    if each == 'None':
        drop_labels.append(idx)
    else:
        df.age.loc[idx] = age_mapping_dict[each]
df = df.drop(labels=drop_labels, axis=0) #droped None values
df.age.value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.age.loc[idx] = age_mapping_dict[each]


25-34    5200
35-44    2689
0-3      2509
8-14     2292
4-7      2140
15-24    1888
55+       979
45-54     925
Name: age, dtype: int64

In [None]:
# Dropping Null Values
df = df.dropna()
unbiased_data = df[df.gender != 'u'].copy()
unbiased_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17452 entries, 0 to 19345
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   age       17452 non-null  object
 1   gender    17452 non-null  object
 2   x         17452 non-null  int64 
 3   y         17452 non-null  int64 
 4   dx        17452 non-null  int64 
 5   dy        17452 non-null  int64 
 6   img_path  17452 non-null  object
dtypes: int64(4), object(3)
memory usage: 1.1+ MB


In [None]:
gender_to_label_map = {
    'f' : 0,
    'm' : 1
}

age_to_label_map = {
    '0-3'  :0,
    '4-7'  :1,
    '8-14' :2,
    '15-24':3,
    '25-34':4,
    '35-44':5,
    '45-54':6,
    '55+'  :7
}

label_to_age_map = {v: k for k, v in age_to_label_map.items()}

# label_to_age_map = {value: key for key, value in age_to_label_map.items()}
# label_to_gender_map = {value: key for key, value in gender_to_label_map.items()}

unbiased_data['age'] = unbiased_data['age'].apply(lambda age: age_to_label_map[age])
unbiased_data['gender'] = unbiased_data['gender'].apply(lambda g: gender_to_label_map[g])
unbiased_data.head()

Unnamed: 0,age,gender,x,y,dx,dy,img_path
0,4,0,0,414,1086,1383,/content/adience-benchmark-gender-and-age-clas...
1,4,1,301,105,640,641,/content/adience-benchmark-gender-and-age-clas...
2,4,0,2395,876,771,771,/content/adience-benchmark-gender-and-age-clas...
3,4,1,752,1255,484,485,/content/adience-benchmark-gender-and-age-clas...
4,4,1,175,80,769,768,/content/adience-benchmark-gender-and-age-clas...


In [None]:
# Unique Ages
unbiased_data.age.unique()

array([4, 5, 1, 7, 3, 6, 2, 0])

In [None]:
# Splitting the Data into Train and test set
X = unbiased_data[['img_path']]
y = unbiased_data[['gender']]
yy=keras.utils.to_categorical(y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print('Train data shape {}'.format(X_train.shape))
print('Test data shape {}'.format(X_test.shape))

Train data shape (12216, 1)
Test data shape (5236, 1)


In [None]:
# Remove the Index from X_train
X_train=X_train.reset_index()
X_train=X_train.drop('index',axis=1)
X_train

Unnamed: 0,img_path
0,/content/adience-benchmark-gender-and-age-clas...
1,/content/adience-benchmark-gender-and-age-clas...
2,/content/adience-benchmark-gender-and-age-clas...
3,/content/adience-benchmark-gender-and-age-clas...
4,/content/adience-benchmark-gender-and-age-clas...
...,...
12211,/content/adience-benchmark-gender-and-age-clas...
12212,/content/adience-benchmark-gender-and-age-clas...
12213,/content/adience-benchmark-gender-and-age-clas...
12214,/content/adience-benchmark-gender-and-age-clas...


In [None]:
# Remove the index from y_train
y_train=y_train.reset_index()
y_train=y_train.drop('index',axis=1)
y_train

Unnamed: 0,gender
0,1
1,0
2,0
3,1
4,0
...,...
12211,1
12212,0
12213,0
12214,0


In [None]:
# Creating The Data Frame for train images and 
train=pd.DataFrame({'Image_path':X_train.iloc[:,0],'Gender':y_train.iloc[:,0]},index=X_train.index)
train

Unnamed: 0,Image_path,Gender
0,/content/adience-benchmark-gender-and-age-clas...,1
1,/content/adience-benchmark-gender-and-age-clas...,0
2,/content/adience-benchmark-gender-and-age-clas...,0
3,/content/adience-benchmark-gender-and-age-clas...,1
4,/content/adience-benchmark-gender-and-age-clas...,0
...,...,...
12211,/content/adience-benchmark-gender-and-age-clas...,1
12212,/content/adience-benchmark-gender-and-age-clas...,0
12213,/content/adience-benchmark-gender-and-age-clas...,0
12214,/content/adience-benchmark-gender-and-age-clas...,0


In [None]:
# Doing the Same for test Data 
X_test=X_test.reset_index()
X_test=X_test.drop('index',axis=1)
y_test=y_test.reset_index()
y_test=y_test.drop('index',axis=1)
X_test,y_test

(                                               img_path
 0     /content/adience-benchmark-gender-and-age-clas...
 1     /content/adience-benchmark-gender-and-age-clas...
 2     /content/adience-benchmark-gender-and-age-clas...
 3     /content/adience-benchmark-gender-and-age-clas...
 4     /content/adience-benchmark-gender-and-age-clas...
 ...                                                 ...
 5231  /content/adience-benchmark-gender-and-age-clas...
 5232  /content/adience-benchmark-gender-and-age-clas...
 5233  /content/adience-benchmark-gender-and-age-clas...
 5234  /content/adience-benchmark-gender-and-age-clas...
 5235  /content/adience-benchmark-gender-and-age-clas...
 
 [5236 rows x 1 columns],       gender
 0          1
 1          1
 2          1
 3          1
 4          1
 ...      ...
 5231       0
 5232       0
 5233       0
 5234       1
 5235       1
 
 [5236 rows x 1 columns])

In [None]:
# Create the Data Frame for test Data or Validation Data 
test=pd.DataFrame({'Image_path':X_test.iloc[:,0],'Label':y_test.iloc[:,0]},index=X_test.index)
test

Unnamed: 0,Image_path,Label
0,/content/adience-benchmark-gender-and-age-clas...,1
1,/content/adience-benchmark-gender-and-age-clas...,1
2,/content/adience-benchmark-gender-and-age-clas...,1
3,/content/adience-benchmark-gender-and-age-clas...,1
4,/content/adience-benchmark-gender-and-age-clas...,1
...,...,...
5231,/content/adience-benchmark-gender-and-age-clas...,0
5232,/content/adience-benchmark-gender-and-age-clas...,0
5233,/content/adience-benchmark-gender-and-age-clas...,0
5234,/content/adience-benchmark-gender-and-age-clas...,1


# Data Split

In [None]:
from keras.utils import to_categorical

In [None]:
n_classes = 8
column_labels_name = [label_to_age_map[x] for x in range(n_classes)]
column_labels_name

['0-3', '4-7', '8-14', '15-24', '25-34', '35-44', '45-54', '55+']

In [None]:
x = unbiased_data[['img_path']]
y = unbiased_data[['age']]
#y=keras.utils.to_categorical(y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
print('Train data shape {}'.format(X_train.shape))
print('test data shape {}'.format(X_test.shape))

Train data shape (12216, 1)
test data shape (5236, 1)


In [None]:
one_hot = pd.DataFrame(to_categorical(y_train["age"], n_classes))
  
y_train=y_train.reset_index()
y_train=y_train.drop('index',axis=1)

one_hot.columns = column_labels_name # change labels name
y_train = y_train.drop("age", axis = 1) # drop old not hot-encoded columns 
y_train = pd.concat([y_train, one_hot], axis=1) # add new hot-encoded columns 


y_train

Unnamed: 0,0-3,4-7,8-14,15-24,25-34,35-44,45-54,55+
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
12211,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
12212,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
12213,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
12214,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
y_test=y_test.reset_index()
y_test=y_test.drop('index',axis=1)

one_hot = pd.DataFrame(to_categorical(y_test["age"], n_classes))

one_hot.columns = column_labels_name # change labels name
y_test = y_test.drop("age", axis = 1) # drop old not hot-encoded columns 
y_test = pd.concat([y_test, one_hot], axis=1) # add new hot-encoded columns 


y_test

Unnamed: 0,0-3,4-7,8-14,15-24,25-34,35-44,45-54,55+
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
5231,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5232,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5233,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5234,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
test.columns=['Image_path','Gender_label']
train.columns=['Image_path','Gender_label']
# test.columns=['Image_path','Gender_label']

In [None]:
# train_t=train.copy()
# test_t=test.copy()

In [None]:
train[column_labels_name]=y_train
test[column_labels_name]=y_test

train.to_csv('train.csv')
test.to_csv('test.csv')

# Data Augmentation

The Data Augmentation phase provides a categorization of the labels into one-hot encoded representation.

Then we apply an Image Data Generator to the Training set and Test set to apply the Preprocessing function specific for the Convolutional Neural Network that we are going to exploit in the next phase.

In this phase, the Training set is also split into the 80-20 partition previously described.

In [None]:
TOTAL_LABEL=["Image_path"]+column_labels_name
TOTAL_LABEL

['Image_path', '0-3', '4-7', '8-14', '15-24', '25-34', '35-44', '45-54', '55+']

In [None]:
train_age_df = train[TOTAL_LABEL]
train_gender_df = train[["Image_path", "Gender_label"]]

test_age_df = test[TOTAL_LABEL]
test_gender_df = test[["Image_path", "Gender_label"]]

train_age_df.head()

Unnamed: 0,Image_path,0-3,4-7,8-14,15-24,25-34,35-44,45-54,55+
0,/content/adience-benchmark-gender-and-age-clas...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,/content/adience-benchmark-gender-and-age-clas...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,/content/adience-benchmark-gender-and-age-clas...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,/content/adience-benchmark-gender-and-age-clas...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,/content/adience-benchmark-gender-and-age-clas...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# converting back to string because CNNs need a categorical input
train_age_df[column_labels_name] = train_age_df[column_labels_name].astype(int)
test_age_df[column_labels_name] = test_age_df[column_labels_name].astype(int)

train_gen = tf.keras.preprocessing.image.ImageDataGenerator(preprocessing_function = tf.keras.applications.efficientnet_v2.preprocess_input,
                                                            validation_split=0.2)
test_gen = tf.keras.preprocessing.image.ImageDataGenerator(preprocessing_function = tf.keras.applications.efficientnet_v2.preprocess_input)
BATCH_SIZE=64
IMG_SIZE = 224
RANDOM_SEED = 42

train_generator = train_gen.flow_from_dataframe(
    dataframe=train_age_df,
    directory="",
    x_col="Image_path",
    y_col=column_labels_name,
    subset="training",
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE, # 32 default
    class_mode='raw',
    shuffle=True, 
    seed=RANDOM_SEED,
    )

val_generator = train_gen.flow_from_dataframe(
    dataframe=train_age_df,
    directory="",
    x_col="Image_path",
    y_col=column_labels_name,
    subset="validation",
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE, # 32 default
    class_mode='raw',
    shuffle=True, 
    seed=RANDOM_SEED,
    )

test_generator = test_gen.flow_from_dataframe(
    dataframe=test_age_df,
    directory="",
    x_col="Image_path",
    y_col=column_labels_name,
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='raw',
    shuffle=True, 
    seed=RANDOM_SEED,
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_age_df[column_labels_name] = train_age_df[column_labels_name].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_age_df[column_labels_name] = test_age_df[column_labels_name].astype(int)


Found 9773 validated image filenames.
Found 2443 validated image filenames.
Found 5236 validated image filenames.
