## Clova Face Recognition API를 이용한 데이터 점검 (성별, 나이) - (1)
> 작성자: 유승리_T3129 / 작성일: 22.02.22.화
- API를 이용하여 판단한 `age`와 `gender` (각각의 `confidence` 포함)
- 데이터에 라벨링 되어있는 `age`와 `gender`

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
from pathlib import Path
import requests
import json 
import re
import pickle

pd.set_option('display.max_colwidth', 100)

### 1. Data Pipeline for Learning
- 규범님 코드입니다.
- 경로가 다를 수 있으니 수정해서 사용하시면 됩니다.

In [3]:
train_df = pd.read_csv("../../input/data/train/train.csv")

In [4]:
def age_group(x):
    if x < 30:
        return 0
    elif x < 60:
        return 1
    else:
        return 2

def age_10(x):
    if x < 20:
        return 10
    elif x < 30:
        return 20
    elif x < 40:
        return 30
    elif x < 50:
        return 40
    elif x < 60:
        return 50
    else:
        return 60

def data_pipe(df: pd.DataFrame) -> pd.DataFrame:
    df["age_group"] = df["age"].apply(age_group)
    df["age_10"] = df["age"].apply(age_10)
    return df

In [5]:
train_df = data_pipe(train_df)

In [6]:
df = pd.DataFrame(None, columns = ['gender', 'age', 'age_group', 'age_10', 'mask', 'path', 'label'])
train_dir = '../../input/data/train/images'

In [7]:
for index, line in enumerate(train_df.iloc):
    for file in list(os.listdir(os.path.join(train_dir, line['path']))):
        if file[0] == '.':
            continue
        if file.split('.')[0] == 'normal':
            mask = 2
        elif file.split('.')[0] == 'incorrect_mask':
            mask = 1
        else:
            mask = 0
        gender = 0 if line['gender'] == 'male' else 1
        data = {
            'gender': gender,
            'age': line['age'],
            'age_group': line['age_group'],
            'age_10': line['age_10'],
            'mask': mask,
            'path': os.path.join(train_dir, line['path'], file),
            'label': mask * 6 + gender * 3 + line['age_group']
        }
        df = df.append(data, ignore_index=True)

In [8]:
df

Unnamed: 0,gender,age,age_group,age_10,mask,path,label
0,1,45,1,40,0,../../input/data/train/images/000001_female_Asian_45/mask1.jpg,4
1,1,45,1,40,0,../../input/data/train/images/000001_female_Asian_45/mask4.jpg,4
2,1,45,1,40,0,../../input/data/train/images/000001_female_Asian_45/mask3.jpg,4
3,1,45,1,40,0,../../input/data/train/images/000001_female_Asian_45/mask5.jpg,4
4,1,45,1,40,1,../../input/data/train/images/000001_female_Asian_45/incorrect_mask.jpg,10
...,...,...,...,...,...,...,...
18895,0,19,0,10,0,../../input/data/train/images/006959_male_Asian_19/mask3.jpg,0
18896,0,19,0,10,0,../../input/data/train/images/006959_male_Asian_19/mask5.jpg,0
18897,0,19,0,10,1,../../input/data/train/images/006959_male_Asian_19/incorrect_mask.jpg,6
18898,0,19,0,10,0,../../input/data/train/images/006959_male_Asian_19/mask2.jpg,0


In [11]:
# df.to_csv(os.path.join(train_dir, "../new_train.csv"))

df.to_csv("./new_train.csv") # 현재 위치에 저장

In [16]:
# 마스크를 쓰지 않은 데이터에 대해서만 dataframe 다시 만들기

is_not_mask = df['mask'] == 2

no_mask_df = df[is_not_mask]
no_mask_df.reset_index(drop=True, inplace=True)

no_mask_df

Unnamed: 0,gender,age,age_group,age_10,mask,path,label
0,1,45,1,40,2,../../input/data/train/images/000001_female_Asian_45/normal.jpg,16
1,1,52,1,50,2,../../input/data/train/images/000002_female_Asian_52/normal.jpg,16
2,0,54,1,50,2,../../input/data/train/images/000004_male_Asian_54/normal.jpg,13
3,1,58,1,50,2,../../input/data/train/images/000005_female_Asian_58/normal.jpg,16
4,1,59,1,50,2,../../input/data/train/images/000006_female_Asian_59/normal.jpg,16
...,...,...,...,...,...,...,...
2695,0,19,0,10,2,../../input/data/train/images/006954_male_Asian_19/normal.jpg,12
2696,0,19,0,10,2,../../input/data/train/images/006955_male_Asian_19/normal.jpg,12
2697,0,19,0,10,2,../../input/data/train/images/006956_male_Asian_19/normal.jpg,12
2698,0,20,0,20,2,../../input/data/train/images/006957_male_Asian_20/normal.jpg,12


In [17]:
# 현재 위치에 dataframe 저장

with open( "./no_mask_df.pkl", "wb" ) as file:
    pickle.dump(no_mask_df, file)

### 2. Clova Face Recognition API 적용해보기
https://developers.naver.com/docs/clova/api/CFR/API_Guide.md#Preparation
- 저장할 요소
    - API 관련
        - `age_api`
        - `age_conf_api`
        - `gender_api`
        - `gender_conf_api`
    - 데이터 라벨링 관련
        - `age_data`
        - `gender_data`
        - `age_group_data`
        - `label`
        - `path`
- 1000장 제한이 있으므로 다음과 같이 나눠 진행 예정 (총 0~2699, 2700장)
    - `0222` - 2300~2699
    - `0223` - 1300~2299
    - `0224` - 300~1299
    - `0225` - 0~299

In [37]:
# no_mask_df.pkl를 dataframe으로 불러오기 (1번에서 만들었으면 안해도 됨)

with open( "no_mask_df.pkl", "rb" ) as file:
    no_mask_df = pickle.load(file)
no_mask_df

Unnamed: 0,gender,age,age_group,age_10,mask,path,label
0,1,45,1,40,2,../../input/data/train/images/000001_female_Asian_45/normal.jpg,16
1,1,52,1,50,2,../../input/data/train/images/000002_female_Asian_52/normal.jpg,16
2,0,54,1,50,2,../../input/data/train/images/000004_male_Asian_54/normal.jpg,13
3,1,58,1,50,2,../../input/data/train/images/000005_female_Asian_58/normal.jpg,16
4,1,59,1,50,2,../../input/data/train/images/000006_female_Asian_59/normal.jpg,16
...,...,...,...,...,...,...,...
2695,0,19,0,10,2,../../input/data/train/images/006954_male_Asian_19/normal.jpg,12
2696,0,19,0,10,2,../../input/data/train/images/006955_male_Asian_19/normal.jpg,12
2697,0,19,0,10,2,../../input/data/train/images/006956_male_Asian_19/normal.jpg,12
2698,0,20,0,20,2,../../input/data/train/images/006957_male_Asian_20/normal.jpg,12


In [38]:
compare_df = pd.DataFrame(None, columns = ['gender_data', 'gender_api', 'gender_conf_api', 
                                           'age_data', 'age_group_data', 'age_api', 'age_conf_api',
                                          'label', 'path'])

train_dir = '../../input/data/train/images'  # 환경에 맞게 수정 필요

In [39]:
# 사진 데이터 전체 

# 기록
## 02.22.화 - index 2300~2699 (계정 1)
## 02.23.수 - index 1300~2299 (계정 1)
## 02.23.수 - index 0300~1299 (계정 2)
## 02.23.수 - index 0000~0299 (계정 3)


cnt = 0  # gender가 다르게 인식된 개수

for index, line in enumerate(no_mask_df.iloc):
#     if index < 867 or index >= 1300:
#         continue
    if index >= 300:
        break

    client_id = "" #본인의 client id 
    client_secret = "" #본인의 client secret 
    
    url = "https://openapi.naver.com/v1/vision/face" # 얼굴감지
    age = 0
    files = {'image': open(line["path"], 'rb')} # open(directory/filename)
    headers = {'X-Naver-Client-Id': client_id, 'X-Naver-Client-Secret': client_secret }
    response = requests.post(url,  files=files, headers=headers)
    rescode = response.status_code
    # print(rescode) # 혹시 작동을 안하면 하루 최대 할당량 1000장을 넘었을 수도.. 그때 rescode = 42
    
    if(rescode==200):
        data = json.loads(response.text)
        ### 추가 ###
        _age_range = re.findall(r"\d+", data['faces'][0]['age']['value'])
        age_api = ((int(_age_range[0]) + int(_age_range[1])) / 2)
        age_conf_api = data['faces'][0]['age']['confidence']
        _gender = data['faces'][0]['gender']
        gender_api = 1 if _gender['value'] == 'female' else 0
        gender_conf_api = _gender['confidence']  
        ###########
        
        result = {
            'gender_data': line['gender'],
            'gender_api': gender_api,
            'gender_conf_api': gender_conf_api,
            'age_data': line['age'],
            'age_group_data': line['age_group'],
            'age_api': age_api,
            'age_conf_api': age_conf_api,
            'label': mask * 6 + gender * 3 + line['age_group'],
            'path': line['path'] #os.path.join(train_dir, line['path'], file),
        }
        compare_df = compare_df.append(result, ignore_index=True)
        
        # 성별 다른 경우에만 print 출력
        if gender_api != line['gender']:
            print("path: ", line["path"])
            print("age_data: ", line["age"])
            print("age_api: ", age_api)
            print("age_conf_api: ", age_conf_api)
            print("gender_data: ", line["gender"])
            print("gender_api: ", gender_api)
            print("gender_conf_api: ", gender_conf_api, '\n')
            cnt += 1

    else:
        print("Error Code:", rescode)
        print("Index:", index)
        break

print(f'end: 총 {cnt} 개')

# print 결과를 혹시 몰라 txt 파일에 복사 + 붙여넣기 해서 저장도 했음

path:  ../../input/data/train/images/000005_female_Asian_58/normal.jpg
age_data:  58
age_api:  66.0
age_conf_api:  0.489103
gender_data:  1
gender_api:  0
gender_conf_api:  0.999967 

path:  ../../input/data/train/images/000006_female_Asian_59/normal.jpg
age_data:  59
age_api:  60.0
age_conf_api:  0.415534
gender_data:  1
gender_api:  0
gender_conf_api:  0.746899 

path:  ../../input/data/train/images/000007_female_Asian_58/normal.jpg
age_data:  58
age_api:  67.0
age_conf_api:  0.645071
gender_data:  1
gender_api:  0
gender_conf_api:  0.698803 

path:  ../../input/data/train/images/000008_female_Asian_58/normal.jpg
age_data:  58
age_api:  74.0
age_conf_api:  0.288556
gender_data:  1
gender_api:  0
gender_conf_api:  0.999971 

path:  ../../input/data/train/images/000009_female_Asian_56/normal.jpg
age_data:  56
age_api:  70.0
age_conf_api:  0.308341
gender_data:  1
gender_api:  0
gender_conf_api:  0.999783 

path:  ../../input/data/train/images/000010_female_Asian_58/normal.jpg
age_data:

In [40]:
compare_df

Unnamed: 0,gender_data,gender_api,gender_conf_api,age_data,age_group_data,age_api,age_conf_api,label,path
0,1,1,0.999806,45,1,53.0,0.763184,13,../../input/data/train/images/000001_female_Asian_45/normal.jpg
1,1,1,0.508885,52,1,52.0,0.987685,13,../../input/data/train/images/000002_female_Asian_52/normal.jpg
2,0,0,0.999874,54,1,45.0,0.110434,13,../../input/data/train/images/000004_male_Asian_54/normal.jpg
3,1,0,0.999967,58,1,66.0,0.489103,13,../../input/data/train/images/000005_female_Asian_58/normal.jpg
4,1,0,0.746899,59,1,60.0,0.415534,13,../../input/data/train/images/000006_female_Asian_59/normal.jpg
...,...,...,...,...,...,...,...,...,...
295,1,0,0.994398,51,1,69.0,0.793131,13,../../input/data/train/images/000717_female_Asian_51/normal.jpg
296,1,0,0.999453,56,1,83.0,0.493178,13,../../input/data/train/images/000718_female_Asian_56/normal.jpg
297,1,0,0.540524,53,1,72.0,0.128994,13,../../input/data/train/images/000719_female_Asian_53/normal.jpg
298,1,0,0.737737,56,1,71.0,0.552067,13,../../input/data/train/images/000720_female_Asian_56/normal.jpg


In [41]:
# 시각화를 하게 될 수도 있으니 dataframe을 pickle로 저장
# 파일 이름을 해당 인덱스로 바꾸기! ***

with open( "./0000-0299_naver-api.pkl", "wb" ) as file:
    pickle.dump(compare_df, file)

In [None]:
# 불러오기 실험
# with open( "./2300-2699_naver-api.pkl", "rb" ) as file:
#     loaded_data = pickle.load(file)
# loaded_data

### no mask 데이터에 대해 저장한 dataframe 모두 합치기

In [43]:
with open( "./0000-0299_naver-api.pkl", "rb" ) as file:
    df_1 = pickle.load(file)
with open( "./0300-1299_naver-api.pkl", "rb" ) as file:
    df_2 = pickle.load(file)
with open( "./1300-2299_naver-api.pkl", "rb" ) as file:
    df_3 = pickle.load(file)
with open( "./2300-2699_naver-api.pkl", "rb" ) as file:
    df_4 = pickle.load(file)

print(df_1.shape)
print(df_2.shape)
print(df_3.shape)
print(df_4.shape)

(300, 9)
(1000, 9)
(1000, 9)
(400, 9)


In [44]:
no_mask_api = pd.concat([df_1, df_2, df_3, df_4], ignore_index=True)
no_mask_api

Unnamed: 0,gender_data,gender_api,gender_conf_api,age_data,age_group_data,age_api,age_conf_api,label,path
0,1,1,0.999806,45,1,53.0,0.763184,13,../../input/data/train/images/000001_female_Asian_45/normal.jpg
1,1,1,0.508885,52,1,52.0,0.987685,13,../../input/data/train/images/000002_female_Asian_52/normal.jpg
2,0,0,0.999874,54,1,45.0,0.110434,13,../../input/data/train/images/000004_male_Asian_54/normal.jpg
3,1,0,0.999967,58,1,66.0,0.489103,13,../../input/data/train/images/000005_female_Asian_58/normal.jpg
4,1,0,0.746899,59,1,60.0,0.415534,13,../../input/data/train/images/000006_female_Asian_59/normal.jpg
...,...,...,...,...,...,...,...,...,...
2695,0,0,0.999997,19,0,22.0,0.774581,12,../../input/data/train/images/006954_male_Asian_19/normal.jpg
2696,0,0,0.999995,19,0,19.0,0.460716,12,../../input/data/train/images/006955_male_Asian_19/normal.jpg
2697,0,0,0.999997,19,0,17.0,0.927676,12,../../input/data/train/images/006956_male_Asian_19/normal.jpg
2698,0,0,0.995121,20,0,23.0,0.550162,12,../../input/data/train/images/006957_male_Asian_20/normal.jpg


In [45]:
with open( "./no_mask_api.pkl", "wb" ) as file:
    pickle.dump(no_mask_api, file)

### path 수정 & age_group_api 추가

In [46]:
no_mask_api

Unnamed: 0,gender_data,gender_api,gender_conf_api,age_data,age_group_data,age_api,age_conf_api,label,path
0,1,1,0.999806,45,1,53.0,0.763184,13,../../input/data/train/images/000001_female_Asian_45/normal.jpg
1,1,1,0.508885,52,1,52.0,0.987685,13,../../input/data/train/images/000002_female_Asian_52/normal.jpg
2,0,0,0.999874,54,1,45.0,0.110434,13,../../input/data/train/images/000004_male_Asian_54/normal.jpg
3,1,0,0.999967,58,1,66.0,0.489103,13,../../input/data/train/images/000005_female_Asian_58/normal.jpg
4,1,0,0.746899,59,1,60.0,0.415534,13,../../input/data/train/images/000006_female_Asian_59/normal.jpg
...,...,...,...,...,...,...,...,...,...
2695,0,0,0.999997,19,0,22.0,0.774581,12,../../input/data/train/images/006954_male_Asian_19/normal.jpg
2696,0,0,0.999995,19,0,19.0,0.460716,12,../../input/data/train/images/006955_male_Asian_19/normal.jpg
2697,0,0,0.999997,19,0,17.0,0.927676,12,../../input/data/train/images/006956_male_Asian_19/normal.jpg
2698,0,0,0.995121,20,0,23.0,0.550162,12,../../input/data/train/images/006957_male_Asian_20/normal.jpg


In [48]:
def cleaning(x):
    x = x.replace("../../input/data/train/images/", "")
    return x

no_mask_api["path"] = no_mask_api["path"].apply(cleaning)
no_mask_api

Unnamed: 0,gender_data,gender_api,gender_conf_api,age_data,age_group_data,age_api,age_conf_api,label,path
0,1,1,0.999806,45,1,53.0,0.763184,13,000001_female_Asian_45/normal.jpg
1,1,1,0.508885,52,1,52.0,0.987685,13,000002_female_Asian_52/normal.jpg
2,0,0,0.999874,54,1,45.0,0.110434,13,000004_male_Asian_54/normal.jpg
3,1,0,0.999967,58,1,66.0,0.489103,13,000005_female_Asian_58/normal.jpg
4,1,0,0.746899,59,1,60.0,0.415534,13,000006_female_Asian_59/normal.jpg
...,...,...,...,...,...,...,...,...,...
2695,0,0,0.999997,19,0,22.0,0.774581,12,006954_male_Asian_19/normal.jpg
2696,0,0,0.999995,19,0,19.0,0.460716,12,006955_male_Asian_19/normal.jpg
2697,0,0,0.999997,19,0,17.0,0.927676,12,006956_male_Asian_19/normal.jpg
2698,0,0,0.995121,20,0,23.0,0.550162,12,006957_male_Asian_20/normal.jpg


In [49]:
def age_group(x):
    if x < 30:
        return 0
    elif x < 60:
        return 1
    else:
        return 2

no_mask_api["age_group_api"] = no_mask_api["age_api"].apply(age_group)
no_mask_api

Unnamed: 0,gender_data,gender_api,gender_conf_api,age_data,age_group_data,age_api,age_conf_api,label,path,age_group_api
0,1,1,0.999806,45,1,53.0,0.763184,13,000001_female_Asian_45/normal.jpg,1
1,1,1,0.508885,52,1,52.0,0.987685,13,000002_female_Asian_52/normal.jpg,1
2,0,0,0.999874,54,1,45.0,0.110434,13,000004_male_Asian_54/normal.jpg,1
3,1,0,0.999967,58,1,66.0,0.489103,13,000005_female_Asian_58/normal.jpg,2
4,1,0,0.746899,59,1,60.0,0.415534,13,000006_female_Asian_59/normal.jpg,2
...,...,...,...,...,...,...,...,...,...,...
2695,0,0,0.999997,19,0,22.0,0.774581,12,006954_male_Asian_19/normal.jpg,0
2696,0,0,0.999995,19,0,19.0,0.460716,12,006955_male_Asian_19/normal.jpg,0
2697,0,0,0.999997,19,0,17.0,0.927676,12,006956_male_Asian_19/normal.jpg,0
2698,0,0,0.995121,20,0,23.0,0.550162,12,006957_male_Asian_20/normal.jpg,0


In [50]:
no_mask_api.columns.tolist()

['gender_data',
 'gender_api',
 'gender_conf_api',
 'age_data',
 'age_group_data',
 'age_api',
 'age_conf_api',
 'label',
 'path',
 'age_group_api']

In [53]:
no_mask_api_final = no_mask_api[['gender_data', 'gender_api', 'gender_conf_api',
 'age_data', 'age_group_data', 'age_api', 'age_group_api', 'age_conf_api',
 'label', 'path',
 ]]

In [54]:
no_mask_api_final

Unnamed: 0,gender_data,gender_api,gender_conf_api,age_data,age_group_data,age_api,age_group_api,age_conf_api,label,path
0,1,1,0.999806,45,1,53.0,1,0.763184,13,000001_female_Asian_45/normal.jpg
1,1,1,0.508885,52,1,52.0,1,0.987685,13,000002_female_Asian_52/normal.jpg
2,0,0,0.999874,54,1,45.0,1,0.110434,13,000004_male_Asian_54/normal.jpg
3,1,0,0.999967,58,1,66.0,2,0.489103,13,000005_female_Asian_58/normal.jpg
4,1,0,0.746899,59,1,60.0,2,0.415534,13,000006_female_Asian_59/normal.jpg
...,...,...,...,...,...,...,...,...,...,...
2695,0,0,0.999997,19,0,22.0,0,0.774581,12,006954_male_Asian_19/normal.jpg
2696,0,0,0.999995,19,0,19.0,0,0.460716,12,006955_male_Asian_19/normal.jpg
2697,0,0,0.999997,19,0,17.0,0,0.927676,12,006956_male_Asian_19/normal.jpg
2698,0,0,0.995121,20,0,23.0,0,0.550162,12,006957_male_Asian_20/normal.jpg


In [55]:
with open( "./no_mask_api_final.pkl", "wb" ) as file:
    pickle.dump(no_mask_api_final, file)