In [1]:
import numpy as np
from scipy.io import loadmat
import pandas as pd
import os
import datetime as date
from dateutil.relativedelta import relativedelta
from concurrent.futures import ProcessPoolExecutor
import cv2


In [2]:
datasetsFolder = './new'

In [3]:
# https://raw.githubusercontent.com/imdeepmind/processed-imdb-wiki-dataset/master/mat.py
# https://github.com/imdeepmind/processed-imdb-wiki-dataset

cols = ['age', 'gender', 'path', 'face_score1', 'face_score2']



imdb_mat = os.path.join(datasetsFolder,'imdb_crop', 'imdb.mat')
wiki_mat = os.path.join(datasetsFolder,'wiki_crop', 'wiki.mat')

imdb_data = loadmat(imdb_mat)
wiki_data = loadmat(wiki_mat)

del imdb_mat, wiki_mat

imdb = imdb_data['imdb']
wiki = wiki_data['wiki']

imdb_photo_taken = imdb[0][0][1][0]
imdb_full_path = imdb[0][0][2][0]
imdb_gender = imdb[0][0][3][0]
imdb_face_score1 = imdb[0][0][6][0]
imdb_face_score2 = imdb[0][0][7][0]

wiki_photo_taken = wiki[0][0][1][0]
wiki_full_path = wiki[0][0][2][0]
wiki_gender = wiki[0][0][3][0]
wiki_face_score1 = wiki[0][0][6][0]
wiki_face_score2 = wiki[0][0][7][0]

imdb_path = []
wiki_path = []

for path in imdb_full_path:
    imdb_path.append(os.path.join(datasetsFolder,'imdb_crop', path[0]))

for path in wiki_full_path:
    wiki_path.append(os.path.join(datasetsFolder,'wiki_crop', path[0]))

imdb_genders = []
wiki_genders = []

for n in range(len(imdb_gender)):
    if imdb_gender[n] == 1:
        imdb_genders.append('male')
    else:
        imdb_genders.append('female')

for n in range(len(wiki_gender)):
    if wiki_gender[n] == 1:
        wiki_genders.append('male')
    else:
        wiki_genders.append('female')

imdb_dob = []
wiki_dob = []

for file in imdb_path:
    temp = file.split('_')[3]
    temp = temp.split('-')
    if len(temp[1]) == 1:
        temp[1] = '0' + temp[1]
    if len(temp[2]) == 1:
        temp[2] = '0' + temp[2]

    if temp[1] == '00':
        temp[1] = '01'
    if temp[2] == '00':
        temp[2] = '01'

    imdb_dob.append('-'.join(temp))

for file in wiki_path:
    wiki_dob.append(file.split('_')[2])


imdb_age = []
wiki_age = []

for i in range(len(imdb_dob)):
    try:
        d1 = date.datetime.strptime(imdb_dob[i][0:10], '%Y-%m-%d')
        d2 = date.datetime.strptime(str(imdb_photo_taken[i]), '%Y')
        rdelta = relativedelta(d2, d1)
        diff = rdelta.years
    except Exception as ex:
        print(ex)
        diff = -1
    imdb_age.append(diff)

for i in range(len(wiki_dob)):
    try:
        d1 = date.datetime.strptime(wiki_dob[i][0:10], '%Y-%m-%d')
        d2 = date.datetime.strptime(str(wiki_photo_taken[i]), '%Y')
        rdelta = relativedelta(d2, d1)
        diff = rdelta.years
    except Exception as ex:
        print(ex)
        diff = -1
    wiki_age.append(diff)

final_imdb = np.vstack((imdb_age, imdb_genders, imdb_path,
                       imdb_face_score1, imdb_face_score2)).T
final_wiki = np.vstack((wiki_age, wiki_genders, wiki_path,
                       wiki_face_score1, wiki_face_score2)).T

final_imdb_df = pd.DataFrame(final_imdb)
final_wiki_df = pd.DataFrame(final_wiki)

final_imdb_df.columns = cols
final_wiki_df.columns = cols

meta = pd.concat((final_imdb_df, final_wiki_df))

meta = meta[meta['face_score1'] != '-inf']
meta = meta[meta['face_score2'] == 'nan']

meta = meta.drop(['face_score1', 'face_score2'], axis=1)

meta = meta.sample(frac=1)

meta.to_csv((os.path.join(datasetsFolder, 'meta.csv')), index=False) # 224841 plikow


time data '0-12-22' does not match format '%Y-%m-%d'
time data '0-05-21' does not match format '%Y-%m-%d'
time data '0-11-18' does not match format '%Y-%m-%d'
time data '0-11-18' does not match format '%Y-%m-%d'
time data '0-11-18' does not match format '%Y-%m-%d'
time data '0-07-31' does not match format '%Y-%m-%d'
time data '0-07-31' does not match format '%Y-%m-%d'
time data '0-07-31' does not match format '%Y-%m-%d'
time data '0-07-31' does not match format '%Y-%m-%d'
time data '0-07-31' does not match format '%Y-%m-%d'
time data '0-07-31' does not match format '%Y-%m-%d'
time data '0-07-31' does not match format '%Y-%m-%d'
time data '0-07-31' does not match format '%Y-%m-%d'
time data '0-07-31' does not match format '%Y-%m-%d'
time data '0-07-31' does not match format '%Y-%m-%d'
time data '0-07-31' does not match format '%Y-%m-%d'
time data '0-07-31' does not match format '%Y-%m-%d'
time data '0-07-31' does not match format '%Y-%m-%d'
time data '0-07-31' does not match format '%Y-

In [4]:
# Zaladowanie modelu wykrywania twarzy
modelFile = "../res10_300x300_ssd_iter_140000_fp16.caffemodel"
configFile = "../deploy.prototxt"
net = cv2.dnn.readNetFromCaffe(configFile, modelFile)

In [5]:
# Fragment z repozytorium processed-imdb-wiki-dataset
# https://github.com/imdeepmind/processed-imdb-wiki-dataset
# https://raw.githubusercontent.com/imdeepmind/processed-imdb-wiki-dataset/master/age.py

output_dir = os.path.join(datasetsFolder, 'imdbwiki')
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Loading dataset
meta = pd.read_csv(os.path.join(datasetsFolder, 'meta.csv'));

# Dropping gender column
meta=meta.drop(['gender'], axis=1)

# Filtaring dataset
meta=meta[meta['age'] >= 0]
meta=meta[meta['age'] <= 101]

# Converting into numpy array
meta=meta.values
# koniec fragmentu

def getDetections(f):
    h, w, _ = f.shape
    y = 0
    y2 = h
    x = 0
    x2 = w
    if(h > w):
        m = (h - w) // 2
        y = m - w // 2
        y2 = m + w // 2
    elif(h < w):
         m=(w - h) // 2
         y=m - h // 2
         y2=m + h // 2
    f = f[y:y2, x:x2]
    blob=cv2.dnn.blobFromImage(cv2.resize(f, (300, 300)), 1.0, (300, 300), (104.0, 177.0, 123.0))
    # (104.0, 177.0, 123.0) - mean subtraction values
    net.setInput(blob)
    detections=net.forward()
    return (f,detections)

def process_image(age_image_tuple):                 # wykrywanie twarzy i wycinanie zawierajacego ja, wysrodkowanego kwadratu przeskalowanego do 200x200
    try:                                            # zdjecia, na ktorych nie wykryto twarzy, albo wykryto wiecej niz jedna twarz, sa odrzucane
        age, path = age_image_tuple                 # odrzucane sa tez twarze w rozdzielczosci po wycieciu mniejszej niz 200x200(przed przeskalowaniem)
        newpath = os.path.basename(path)            
        newpath = os.path.join(output_dir, str(age) + '_' + newpath)
        if(os.path.exists(newpath)): # przydatne jak przerwiemy w trakcie; pominie juz przetworzone zdjecia
            return
        f=cv2.imread(path, 1)
        h, w, _ = f.shape
        if(min(w, h)<200):
            return
        f, detections = getDetections(f)
        h, w, _ = f.shape
        if(min(w, h) < 200):
            return
        face_count=0
        for i in range(detections.shape[2]):
            confidence=detections[0, 0, i, 2]
            if confidence > 0.5:
                face_count += 1
        if (face_count==1): # wykryto tylko jedna twarz
            for i in range(detections.shape[2]):
                confidence = detections[0, 0, i, 2]
                if confidence > 0.5:
                    box = detections[0, 0, i, 3:7] * np.array([f.shape[1], f.shape[0], f.shape[1], f.shape[0]])
                    (x, y, x2, y2) = box.astype("int") # prostokat zawierajacy wysrodkowana twarz
                    r = (x2-x)-(y2-y)
                    if(r>0): # obliczanie wspolzednych kawdratu zawierajacego ten prostokat w celu unikniecia rozciagania przy skalowaniu do 200x200 
                        m = (y+y2)//2
                        y = m - (x2-x)//2
                        y2 = m + (x2-x)//2
                    elif(r<0):
                        m = (x+x2)//2
                        x= m - (y2-y)//2
                        x2 = m + (y2-y)//2
                    if(x<0 or y<0 or x2 >= w or y2 >= h):
                        continue
                    extracted_face = f[y:y2, x:x2]
                    height, width, channels = extracted_face.shape
                    if(max(width, height)<200): # odrzucanie zdjec ktore byly by rozpikselowane
                        continue
                    extracted_face = cv2.resize(extracted_face, (200,200))
                    cv2.imwrite(newpath, extracted_face)
    except:
        return

In [6]:
import multiprocessing
import time
# 34 minuty
# for image_path in meta:
#     process_image(image_path)

pool = multiprocessing.Pool() # szybciej
pool.map(process_image, meta)
# 224576 wejsciowych zdjec
# po przetworzeniu 66832 zdjec

pool.close()
pool.join()


In [7]:
# pool = multiprocessing.Pool()


# start_time = time.time()
# pool.map(process_image, meta)
# end_time = time.time()

# pool.close()
# pool.join()

# print(f"The operation took {end_time - start_time} seconds")

In [8]:
# len(meta)