### 1. 데이터 로드

### 2. 데이터 확인

In [3]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from glob import glob
import seaborn as sns
import plotly.graph_objects as go

In [5]:
# 숫자로 표현되어 있는 인종, 성별 데이터를 문자열로 변환해주기 위한 dict
dataset_dict = {
    'race_id': {
        0: 'white',
        1: 'black',
        2: 'asian',
        3: 'indian',
        4: 'others'
    },
    'gender_id': {
        0: 'male',
        1: 'female'
    }
}

dataset_dict['gender_alias'] = dict((g, i) for i, g in dataset_dict['gender_id'].items())
dataset_dict['race_alias'] = dict((g, i) for i, g in dataset_dict['race_id'].items())

In [8]:
dataset_dict

{'race_id': {0: 'white', 1: 'black', 2: 'asian', 3: 'indian', 4: 'others'},
 'gender_id': {0: 'male', 1: 'female'},
 'gender_alias': {'male': 0, 'female': 1},
 'race_alias': {'white': 0, 'black': 1, 'asian': 2, 'indian': 3, 'others': 4}}

In [9]:
# 데이터 프레임으로 만들어주기 위한 함수 지정
def parse_dataset(dataset_path, ext='jpg'):

  def parse_info_from_file(path):
    try:
      filename = os.path.split(path)[1] # 폴더명[0]과 파일명[1]을 나누어 그 중 파일명을 가져옴
      filename = os.path.splitext(filename)[0] # 파일명[0]과 확장자[1]를 분리하여 그 중 파일명[0]만 가져옴
      age, gender, race, _ = filename.split('_') # 사진 파일명은 '나이_성별_인종_날짜' 정보로 이루어져있음 이 중 앞의 3개만 사용
      return int(age), dataset_dict['gender_id'][int(gender)], dataset_dict['race_id'][int(race)] # 나이, 성별(문자), 인종(문자)로 반환
    except Exception as ex:
      return None, None, None

  files = glob(os.path.join(dataset_path, "*.%s" % ext)) # 폴더명에 존재하는 jpg 확장자의 파일을 리스트로 반환

  records = []
  for file in files:
    info = parse_info_from_file(file)
    records.append(info)

  df = pd.DataFrame(records)
  df['file'] = files
  df.columns = ['age', 'gender', 'race','file']
  df = df.dropna()

  return df

folder_name = 'UTKFace'
df = parse_dataset(folder_name)


In [10]:
import cv2
import random

# opencv는 BGR 순서이지만, matplotlib은 RGB 순서임
# 따라서 BGR 순서를 RGB 순서로 바꿔주어야 함.

random_file = random.sample(files,10)

fig = plt.figure(figsize=(10, 10))

for i, file in enumerate(random_file):
  image = cv2.imread(file, cv2.COLOR_BGR2RGB)
  plt.subplot(1, 10, i+1)
  plt.imshow(image)

TypeError: Population must be a sequence.  For dicts or sets, use sorted(d).

### 3. CNN 모델 만들기

In [9]:
import os
import cv2
import numpy as np

from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import BatchNormalization, Conv2D, MaxPool2D, Activation, Dropout, Lambda, Dense, Flatten, Input
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras import backend as k

In [10]:
# 이미지 데이터와 클래스(나이, 성별) 불러오기
folder_name = 'UTKFace'

images = []
age = []
gender = []

for img in os.listdir(folder_name): # 폴더 내 이미지 파일들 불러오기
  ages = img.split("_")[0] # 나이 정보(숫자)
  genders = img.split("_")[1] # 성별 정보(숫자)
  img = cv2.imread(str(folder_name)+"/"+str(img)) # 폴더명/이미지명으로 이미지 load
  img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB) # BGR -> RGB
  img = img # ?
  images.append(np.array(img)) # 이미지 배열 추가
  age.append(np.array(ages)) # 나이 배열 추가
  gender.append(np.array(genders)) # 성별 배열 추가

age = np.array(age,dtype=np.int64)
images = np.array(images)
gender = np.array(gender,np.uint64)


In [11]:
# 학습 데이터셋 6 : 검증 데이터셋 2 : 테스트 데이터셋 2

# 나이
x_train_age, x_test_age, y_train_age, y_test_age = train_test_split(images, age, random_state=42, test_size=0.4)
x_valid_age, x_test_age, y_valid_age, y_test_age = train_test_split(x_test_age, y_test_age, random_state=42, test_size=0.4)

# 성별
x_train_gender, x_test_gender, y_train_gender, y_test_gender = train_test_split(images, gender, random_state=42, test_size=0.4)
x_valid_gender, x_test_gender, y_valid_gender, y_test_gender = train_test_split(x_test_gender, y_test_gender, random_state=42, test_size=0.4)


In [21]:
#learning rate 설정
init_lr = 1e-4
epochs = 50
opt = Adam(learning_rate=init_lr)


# 과적합 방지를 위한 callback 설정
callbacks = [EarlyStopping(monitor='val_loss',patience=5),
             ModelCheckpoint(filepath='./model-{epoch:02d}-{val_loss:.2f}.keras',
                                             monitor='val_loss',
                                             save_best_only=True)]

# age_model
age_model = Sequential([
    Conv2D(128, kernel_size =3, input_shape=(200,200,3)),
    BatchNormalization(),
    Activation('relu'),
    MaxPool2D(pool_size=3, strides=2),

    Conv2D(128, kernel_size =3),
    BatchNormalization(),
    Activation('relu'),
    MaxPool2D(pool_size=3, strides=2),

    Conv2D(256, kernel_size =3),
    BatchNormalization(),
    Activation('relu'),
    MaxPool2D(pool_size=3, strides=2),

    Conv2D(512, kernel_size =3),
    BatchNormalization(),
    Activation('relu'),
    MaxPool2D(pool_size=3, strides=2),

    Flatten(),
    Dropout(0.25),
    Dense(512, activation  = 'relu'),
    Dense(1, activation = 'linear', name = 'age') # 선형함수
    ])
age_model.compile(loss="mse", optimizer=opt, metrics = ['mae'])

# gender model

gender_model = Sequential([
    Conv2D(36, kernel_size=3,input_shape=(200,200,3)),
    BatchNormalization(),
    Activation('relu'),
    MaxPool2D(pool_size=3, strides=2),
    Conv2D(64, kernel_size=3),
    BatchNormalization(),
    Activation('relu'),
    MaxPool2D(pool_size=3, strides=2),
    Conv2D(128, kernel_size=3),
    BatchNormalization(),
    Activation('relu'),
    MaxPool2D(pool_size=3, strides=2),
    Conv2D(256, kernel_size=3),
    BatchNormalization(),
    Activation('relu'),
    MaxPool2D(pool_size=3, strides=2),
    Conv2D(512, kernel_size=3),
    BatchNormalization(),
    Activation('relu'),
    MaxPool2D(pool_size=3, strides=2),
    Flatten(),
    Dropout(0.25),
    Dense(512, activation  = 'relu'),
    Dense(1, activation = 'sigmoid', name = 'gender') # 0 ~ 1사이로 압축 (확률)
])

gender_model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy']) # 이진분류

#age_model 학습
history_age = age_model.fit(x_train_age, y_train_age,
                        validation_data=(x_valid_age, y_valid_age), batch_size = 32, epochs=50, callbacks = callbacks)
#gender_model 학습
history_gender = gender_model.fit(x_train_gender, y_train_gender,
                        validation_data=(x_valid_gender, y_valid_gender), batch_size=32, epochs=50, callbacks = callbacks)

  super().__init__(


Epoch 1/50
[1m 41/445[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m15:54[0m 2s/step - loss: 559.7196 - mae: 18.4201

KeyboardInterrupt: 