# DATA PREPARATION

In [None]:
#Importing the libraries
import os, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import keras
import shutil, glob
from zipfile import ZipFile
import imgaug.augmenters as iaa

In [None]:
# Colab-only import: mount Google Drive at /content/drive so that the dataset
# archives stored under /content/drive/MyDrive can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Unzip the Kaggle facial-age dataset stored on Google Drive.
facial_age_zip_path = "/content/drive/MyDrive/Data/KaggleFacialAge.zip"

# extractall() is called without a target, so the archive is extracted into the
# current working directory.
# NOTE(review): later cells read "../content/KaggleFacialAge", which presumably
# assumes the working directory is /content -- confirm.
with ZipFile(facial_age_zip_path, 'r') as myzip:
    myzip.extractall()
    print('Done unzipping KaggleFacialAge.zip')

Done unzipping KaggleFacialAge.zip


In [None]:
# Collect the name of every entry of the Kaggle dataset folder in a NumPy array
kaggle_faces_path = "../content/KaggleFacialAge"
kaggle_faces_folders = np.array(os.listdir(kaggle_faces_path))

In [None]:
# Map each age (the folder name read as an integer) to the number of entries
# found inside that folder of the Kaggle dataset
kaggle_images = {}

for folder in kaggle_faces_folders:
    temp_path = os.path.join(kaggle_faces_path, folder)
    kaggle_images[int(folder)] = len(os.listdir(temp_path))

In [None]:
# Unzip the UTK faces dataset stored on Google Drive.
# Note: this reuses (and overwrites) the `facial_age_zip_path` variable.
facial_age_zip_path = "/content/drive/MyDrive/Data/UTKFaces.zip"

# extractall() is called without a target, so the archive is extracted into the
# current working directory.
# NOTE(review): later cells read "../content/UTKFaces", which presumably
# assumes the working directory is /content -- confirm.
with ZipFile(facial_age_zip_path, 'r') as myzip:
    myzip.extractall()
    print('Done unzipping UTKFaces.zip')

Done unzipping UTKFaces.zip


In [None]:
# Build a dictionary holding the number of images per age for the UTK dataset.
utk_images = {}  # age -> number of UTK images; filled by the loop further down
utk_faces_path = "../content/UTKFaces"
# Every entry of the UTK folder; the age is parsed from each name below.
utkface_image_names = os.listdir(utk_faces_path)

def finding_age(image_name):
    """Return the age encoded at the start of an image file name.

    The age is the text preceding the first underscore (or the whole name
    when there is no underscore), converted to ``int``.

    Raises:
        ValueError: if that leading text is not a valid integer.
    """
    leading_field = image_name.partition('_')[0]
    return int(leading_field)

# Tally how many UTK images exist for each age
for image in utkface_image_names:
    age = finding_age(image)
    utk_images[age] = utk_images.get(age, 0) + 1

# Re-order the tally by ascending age
utk_images = dict(sorted(utk_images.items(), key=lambda item: item[0]))

In [None]:
# Merging the dictionaries
def mergeDict(dict_1, dict_2):
    """Merge two dictionaries, summing the values of the keys they share.

    Keys present in only one dictionary keep their value. Neither input is
    modified. Keys appear in the order of ``dict_1`` followed by the keys
    that exist only in ``dict_2``.

    Args:
        dict_1: first mapping.
        dict_2: second mapping.

    Returns:
        A new ``dict`` with the merged content.
    """
    merged_dict = dict(dict_1)
    for key, value in dict_2.items():
        if key in dict_1:
            merged_dict[key] = dict_1[key] + value
        else:
            merged_dict[key] = value
    return merged_dict

# Merge the per-age counts of both datasets, then order them by ascending age
combined_images = mergeDict(kaggle_images , utk_images)
combined_images = dict(sorted(combined_images.items(), key=lambda x: x[0]))  
# TODO: create a function that shows the first 10 entries of a dictionary, to display all 3 of them

In [None]:
# Showing the total number of images
total_img = sum(combined_images.values())

total_img

33884

In [None]:
# Create a function to divide the images into n groups with a similar number of images

def groups(diz, n_groups):
    """Greedily split the keys of ``diz`` into consecutive groups of similar weight.

    Keys are visited in the dictionary's iteration order. A running sum of the
    values is kept; a key joins the current group while that sum stays below
    1.2 times ``int(total / n_groups)``. Otherwise the current group is closed
    and the key opens a new one. The number of groups produced is therefore
    not guaranteed to equal ``n_groups``.

    Args:
        diz: mapping of key (age) -> number of images.
        n_groups: desired number of groups, used only to derive the threshold.

    Returns:
        A list of non-empty lists of keys. An empty ``diz`` yields ``[]``.

    Raises:
        ZeroDivisionError: if ``n_groups`` is 0.
    """
    total_img = sum(diz.values())
    threshold = int(total_img / n_groups)
    limit = threshold * 1.2  # 20% tolerance above the ideal group size

    result = []
    current = []
    running = 0

    for key, value in diz.items():
        running += value
        if running < limit:
            current.append(key)
        else:
            # Close the current group, but never emit an empty one: this
            # happens when the very first key already reaches the limit.
            if current:
                result.append(current)
            running = value
            current = [key]

    if current:
        result.append(current)
    return result

In [None]:
# Function to divide images in defined groups

def defined_groups(diz, *age_ranges):
    """Split the keys of ``diz`` into groups delimited by upper age bounds.

    Each value in ``age_ranges`` is the inclusive upper bound of one group.
    Keys greater than the last bound all fall into one final group. Bounds
    that no key falls under produce no group, so the result never contains an
    empty list.

    The keys of ``diz`` and the bounds are both expected in ascending order.

    Args:
        diz: mapping whose keys are ages; the values are not used.
        *age_ranges: ascending inclusive upper bounds of the groups.

    Returns:
        A list of non-empty lists of keys, in iteration order.
    """
    result = []
    current = []
    index = 0  # position of the bound the current group belongs to

    for age in diz:
        crossed = False
        # Skip every bound this age exceeds, so a jump over several bounds
        # still opens exactly one new group.
        while index < len(age_ranges) and age > age_ranges[index]:
            index += 1
            crossed = True
        if crossed and current:
            result.append(current)
            current = []
        current.append(age)

    if current:
        result.append(current)
    return result
    
#grs = groups(combined_images, 2, 9, 18, 25, 40, 55, 65)   

In [None]:
# Three candidate partitions of the ages:
#   grs1        -> built by groups(), aiming at 7 groups of similar size
#   grs2 / grs3 -> built by defined_groups() from explicit upper age bounds
grs1 = groups(combined_images, 7)
grs2 = defined_groups(combined_images, 2, 9, 20, 27, 45, 65)   
grs3 = defined_groups(combined_images, 9, 19, 29, 45, 59, 75)   

In [None]:
# Build the list holding the summary information of each age group
# (useful when discussing class imbalance)
def info_ranges(grs, images_per_age=None):
    """Summarise each age group: label, age range, image count and share.

    Args:
        grs: list of groups, each a non-empty list of ages.
        images_per_age: mapping age -> number of images. When omitted, the
            notebook-level ``combined_images`` dictionary is used.

    Returns:
        One ``[label, 'first-last', n_images, 'xx.xxx%']`` list per group,
        where ``label`` is the position of the group in ``grs`` and the
        percentage is relative to the sum of all counts in the mapping.

    Raises:
        IndexError: if a group is empty.
        KeyError: if a group contains an age missing from the mapping.
    """
    if images_per_age is None:
        # Backward-compatible default: the merged counts built earlier.
        images_per_age = combined_images
    # Derive the total from the same mapping rather than from a separate global.
    total = sum(images_per_age.values())

    res = []
    for index, ages in enumerate(grs):
        n_images = sum(images_per_age[age] for age in ages)
        # Guard against a mapping whose counts are all zero.
        share = round(n_images / total * 100, 3) if total else 0.0
        res.append([index, str(ages[0]) + '-' + str(ages[-1]), n_images, str(share) + '%'])

    return res

In [None]:
# Summary rows ([label, age range, image count, share of total]) for each of
# the three candidate partitions
res1 = info_ranges(grs1)
res2 = info_ranges(grs2)
res3 = info_ranges(grs3)

In [None]:
# Print the summary of each candidate partition under its own banner
for banner, group_info in (
    ('--- GROUP 1 INFO ---', res1),
    ('--- GROUP 2 INFO ---', res2),
    ('--- GROUP 3 INFO ---', res3),
):
    print(banner)
    print(group_info)

--- GROUP 1 INFO ---
[[0, '1-7', 5441, '16.058%'], [1, '8-23', 5538, '16.344%'], [2, '24-27', 5170, '15.258%'], [3, '28-35', 5564, '16.421%'], [4, '36-52', 5776, '17.046%'], [5, '53-84', 5650, '16.675%'], [6, '85-116', 745, '2.199%']]
--- GROUP 2 INFO ---
[[0, '1-2', 3400, '10.034%'], [1, '3-9', 2880, '8.5%'], [2, '10-20', 3161, '9.329%'], [3, '21-27', 6708, '19.797%'], [4, '28-45', 9368, '27.647%'], [5, '46-65', 5624, '16.598%'], [6, '66-116', 2743, '8.095%']]
--- GROUP 3 INFO ---
[[0, '1-9', 6280, '18.534%'], [1, '10-19', 2773, '8.184%'], [2, '20-29', 8883, '26.216%'], [3, '30-45', 7581, '22.373%'], [4, '46-59', 4168, '12.301%'], [5, '60-75', 2801, '8.266%'], [6, '76-116', 1398, '4.126%']]


In [None]:
# Turn each summary list into a DataFrame (the CSV export below is disabled)
stats_columns = ['Label', 'Age-Range', '#Images', '%Total']
df1, df2, df3 = (
    pd.DataFrame(rows, columns=stats_columns) for rows in (res1, res2, res3)
)
#os.makedirs('Stats', exist_ok=True) 
#df.to_csv('Stats/data_stats.csv')
separator = '---------------'
print(df1)
for frame in (df2, df3):
    print(separator)
    print(frame)

   Label Age-Range  #Images   %Total
0      0       1-7     5441  16.058%
1      1      8-23     5538  16.344%
2      2     24-27     5170  15.258%
3      3     28-35     5564  16.421%
4      4     36-52     5776  17.046%
5      5     53-84     5650  16.675%
6      6    85-116      745   2.199%
---------------
   Label Age-Range  #Images   %Total
0      0       1-2     3400  10.034%
1      1       3-9     2880     8.5%
2      2     10-20     3161   9.329%
3      3     21-27     6708  19.797%
4      4     28-45     9368  27.647%
5      5     46-65     5624  16.598%
6      6    66-116     2743   8.095%
---------------
   Label Age-Range  #Images   %Total
0      0       1-9     6280  18.534%
1      1     10-19     2773   8.184%
2      2     20-29     8883  26.216%
3      3     30-45     7581  22.373%
4      4     46-59     4168  12.301%
5      5     60-75     2801   8.266%
6      6    76-116     1398   4.126%


In [None]:
# Gather the images of both datasets in a single folder, normalising file names

# Create the destination folder
comb_path = "../content/Combined_Images"
if not os.path.exists(comb_path):
    os.makedirs(comb_path)

# Copy the UTK dataset images unchanged
src_dir = utk_faces_path
utk_pattern = os.path.join(src_dir, "*.jpg")
print('--- Copying UTK dataset images into combined images folder ---\n')
# `counter` is kept as a notebook-level variable: the Kaggle copy cell
# continues counting from the value it holds after this loop.
counter = 0
for counter, jpgfile in enumerate(glob.iglob(utk_pattern), start=1):
    shutil.copy(jpgfile, comb_path)
    if not counter % 3000:
        print('Images copied to combined images folder = {} of 33884'.format(counter))

--- Copying UTK dataset images into combined images folder ---

Images copied to combined images folder = 3000 of 33884
Images copied to combined images folder = 6000 of 33884
Images copied to combined images folder = 9000 of 33884
Images copied to combined images folder = 12000 of 33884
Images copied to combined images folder = 15000 of 33884
Images copied to combined images folder = 18000 of 33884
Images copied to combined images folder = 21000 of 33884
Images copied to combined images folder = 24000 of 33884


In [None]:
# Normalise and copy the images of the Kaggle dataset:
# every .png is resized to 50x50, re-encoded as .jpg and saved as
# "<age>_<original name>.jpg" in the combined images folder.

src_dir = kaggle_faces_path
print('--- Copying Kaggle dataset images into combined images folder ---\n')
# NOTE: `counter` is not reset here on purpose; it continues from the value
# left by the UTK copy cell, so re-running only this cell inflates the count.
for folder in os.listdir(kaggle_faces_path):
    for pngfile in glob.iglob(os.path.join(src_dir, folder, "*.png")):
        # File name without directory and extension
        png_stem = os.path.splitext(os.path.basename(pngfile))[0]
        # int() drops any zero padding of the folder name before it is used as age prefix
        destpath = os.path.join(comb_path, str(int(folder)) + '_' + png_stem + '.jpg')

        # Loading the .png image; cv2.imread returns None (it does not raise)
        # when the file cannot be read, which would make cv2.resize fail with
        # an unhelpful error.
        png_img = cv2.imread(pngfile)
        if png_img is None:
            print('WARNING: could not read {}, image skipped'.format(pngfile))
            continue
        png_img = cv2.resize(png_img, dsize=(50, 50))

        # Converting to jpg and saving; cv2.imwrite reports failure through
        # its boolean return value.
        if not cv2.imwrite(destpath, png_img, [int(cv2.IMWRITE_JPEG_QUALITY), 100]):
            print('WARNING: could not write {}, image skipped'.format(destpath))
            continue

        counter += 1
        if counter % 3000 == 0:
            print('Images copied to combined images folder = {} of 33884'.format(counter))

print('Images copied to combined images folder = {} of 33884'.format(counter))

--- Copying Kaggle dataset images into combined images folder ---

Images copied to combined images folder = 27000 of 33884
Images copied to combined images folder = 30000 of 33884
Images copied to combined images folder = 33000 of 33884
Images copied to combined images folder = 33884 of 33884


In [None]:
# Helper used to build a dataframe with file name, age and label
def label_map(age, res):  # `res` is one of the lists returned by info_ranges
    """Return the label of the age group that contains ``age``.

    Works for any number of groups; the label is the position of the matching
    group inside ``res``.

    Args:
        age: the age, as an ``int`` or anything ``int()`` accepts.
        res: list of rows whose element at index 1 is an inclusive age range
            formatted as ``'low-high'``.

    Returns:
        The integer label of the first group whose range contains ``age``, or
        the string ``'Age out of bound!'`` when no group matches.
    """
    age = int(age)
    for label, group in enumerate(res):
        bounds = group[1].split('-')
        if int(bounds[0]) <= age <= int(bounds[1]):
            return label
    return 'Age out of bound!'

In [None]:
# One [file name, age, label] row per combined image; labels follow partition 1
res = []
for image in os.listdir(comb_path):
    age = int(image.partition('_')[0])
    res.append([image, age, label_map(age, res1)])

# Turn the rows into a DataFrame (the CSV export below is disabled)
df1 = pd.DataFrame(res, columns=['FileName', 'Age', 'Label'])
#df.to_csv('Stats/filenames_label.csv')
print(df1)

                           FileName  Age  Label
0                       28_2706.jpg   28      3
1       1_0_2_20161219161919870.jpg    1      0
2      64_0_0_20170117194627251.jpg   64      5
3      29_1_1_20170112235118232.jpg   29      3
4      15_0_0_20170117134820249.jpg   15      1
...                             ...  ...    ...
33879                    4_3549.jpg    4      0
33880  26_1_3_20170119192406609.jpg   26      2
33881                   24_1227.jpg   24      2
33882  58_0_1_20170111200022771.jpg   58      5
33883                   72_6632.jpg   72      5

[33884 rows x 3 columns]


In [None]:
# Function that derives the label from a file name

def get_label(filename, res):
    """Return, as a string, the label of the image called ``filename``.

    The age is the text before the first underscore of ``filename``; it is
    mapped to a label through ``label_map`` using the group rows in ``res``.
    """
    age_field = filename.partition('_')[0]
    label = label_map(age_field, res)
    return str(label)

In [None]:
# Create one sub-folder per label, so that the TensorFlow function that reads
# the label of an image from the name of its sub-folder can be used later

# CREATION OF THE 3 FOLDERS
# ----- FOLDER 1 -----
comb_path1 = comb_path + '1'
# Create the folder
if not os.path.exists(comb_path1):
    os.makedirs(comb_path1)
for image in os.listdir(comb_path):
    # Destination sub-folder named after the label of this image
    label_dir = os.path.join(comb_path1, get_label(image, res1))
    if not os.path.exists(label_dir):
        os.makedirs(label_dir)
    shutil.copy(os.path.join(comb_path, image), os.path.join(label_dir, image))

In [None]:
# Create the Combined_Images1.zip archive and store it on Google Drive
output_filename = '../content/drive/MyDrive/Data/Combined_Images1'
shutil.make_archive(output_filename, 'zip', comb_path1)

'/content/drive/MyDrive/Data/Combined_Images1.zip'

In [None]:
# ----- FOLDER 2 -----
comb_path2 = comb_path + '2'
# Create the folder
if not os.path.exists(comb_path2):
    os.makedirs(comb_path2)
for image in os.listdir(comb_path):
    # Destination sub-folder named after the label of this image
    label_dir = os.path.join(comb_path2, get_label(image, res2))
    if not os.path.exists(label_dir):
        os.makedirs(label_dir)
    shutil.copy(os.path.join(comb_path, image), os.path.join(label_dir, image))

In [None]:
# Create the Combined_Images2.zip archive and store it on Google Drive
output_filename = '../content/drive/MyDrive/Data/Combined_Images2'
shutil.make_archive(output_filename, 'zip', comb_path2)    

'/content/drive/MyDrive/Data/Combined_Images2.zip'

In [None]:
# ----- FOLDER 3 -----
comb_path3 = comb_path + '3'
# Create the folder
if not os.path.exists(comb_path3):
    os.makedirs(comb_path3)
for image in os.listdir(comb_path):
    # Destination sub-folder named after the label of this image
    label_dir = os.path.join(comb_path3, get_label(image, res3))
    if not os.path.exists(label_dir):
        os.makedirs(label_dir)
    shutil.copy(os.path.join(comb_path, image), os.path.join(label_dir, image))

In [None]:
# Create the Combined_Images3.zip archive and store it on Google Drive
output_filename = '../content/drive/MyDrive/Data/Combined_Images3'
shutil.make_archive(output_filename, 'zip', comb_path3)

'/content/drive/MyDrive/Data/Combined_Images3.zip'