In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import cv2
import os
import re

## 说明：
本程序用于提取与保存人脸数据集的标签，并对人脸数据集的各种标签进行统计

读入训练集和测试集标签

In [2]:
# face_DR: full text of the training-set label file.
with open("data/faceDR", 'r') as label_file:
    face_DR = label_file.read()

# Preview characters 1..73 of the file (index 0 is skipped), i.e. the first record.
print(face_DR[1:74])

1223 (_sex  male) (_age  child) (_race white) (_face smiling) (_prop '())


In [4]:
# face_DS: full text of the test-set label file.
with open("data/faceDS", 'r') as label_file:
    face_DS = label_file.read()

### 提取数据集标签并将其数字化。

在训练集中有无法使用的图片，故应该将这些图片对应的标签从训练集标签中去除。

在训练集标签中有“missing descriptor”，即标签缺失。由于缺失样本数很少（只有三个），并且该缺失针对所有类型的标签，故直接将缺失样本从数据集中删除。

所用标签为：1.性别，分为female和male两类；2.年龄，分为child, teen, adult, senior共四类

#### 训练集标签处理

将训练集标签整理为DataFrame格式

In [6]:
# Parse the training label text into image ids and per-image descriptors.
# The patterns are raw strings so that the regex escapes (\d, \s, \(, \w) are
# not interpreted as (invalid) Python string escapes.
face_DR_num = [int(num) for num in re.findall(r"\d+(?=\s\(_sex)", face_DR)]    # ids of records that carry labels
face_DR_wrong = [int(num) for num in re.findall(r"\d+(?=\s\(_miss)", face_DR)]    # ids of records whose labels are missing

face_DR_sex = re.findall(r"(?<=sex\s\s)\w+", face_DR)    # sex labels
face_DR_age = re.findall(r"(?<=age\s\s)\w+", face_DR)    # age labels
face_DR_race = re.findall(r"(?<=race\s)\w+", face_DR)    # race labels
face_DR_face = re.findall(r"(?<=face\s)\w+", face_DR)    # facial-expression labels
# The `_prop` descriptor is not parsed here.

print("原始数据标签个数 =", len(face_DR_num)+len(face_DR_wrong))

# One row per labelled record, indexed by image id. pandas raises if the label
# lists and the id list do not all have the same length.
face_DR_data = {'sex':face_DR_sex, 'age':face_DR_age, 'race':face_DR_race, 'face':face_DR_face}
face_DR_dataframe = pd.DataFrame(data = face_DR_data, index = face_DR_num)

# Images 2412 and 2416 were found by testing to have a broken file format and
# cannot be read, so they are moved from the usable ids to the rejected ids.
unreadable_ids = [2412, 2416]
face_DR_dataframe = face_DR_dataframe.drop(index=unreadable_ids)
face_DR_wrong.extend(unreadable_ids)
face_DR_num = [num for num in face_DR_num if num not in unreadable_ids]
print(face_DR_dataframe)
print(face_DR_dataframe.describe())

print('wrong =',face_DR_wrong)

print("有效标签face_DR_num数量为：", len(face_DR_num))
print("噪声标签face_DR_wrong数量为：", len(face_DR_wrong))
print("噪声样本序号：", face_DR_wrong)

原始数据标签个数 = 2000
         sex     age   race     face
1223    male   child  white  smiling
1224    male   child  white  serious
1225    male   child  white  smiling
1226    male   child  white  smiling
1227    male   child  white  serious
...      ...     ...    ...      ...
3218  female  senior  white  serious
3219  female  senior  white  smiling
3220  female   adult  asian  serious
3221  female  senior  white  smiling
3222  female  senior  black  serious

[1995 rows x 4 columns]
         sex    age   race     face
count   1995   1995   1995     1995
unique     2      4      5        3
top     male  adult  white  smiling
freq    1148   1436   1838     1043
wrong = [1228, 1232, 1808, 2412, 2416]
有效标签face_DR_num数量为： 1995
噪声标签face_DR_wrong数量为： 5
噪声样本序号： [1228, 1232, 1808, 2412, 2416]


#### 训练集标签统计

In [7]:
# Frequency of every label value in the training set, one Series per label column.
DR_sex_count, DR_age_count, DR_race_count, DR_face_count = (
    face_DR_dataframe[column].value_counts()
    for column in ('sex', 'age', 'race', 'face')
)

# Each table is followed by a blank line in the output.
print(DR_sex_count, '\n')
print(DR_age_count, '\n')
print(DR_race_count, '\n')
print(DR_face_count, '\n')

male      1148
female     847
Name: sex, dtype: int64 

adult     1436
teen       261
child      244
senior      54
Name: age, dtype: int64 

white       1838
black         64
asian         55
hispanic      26
other         12
Name: race, dtype: int64 

smiling    1043
serious     915
funny        37
Name: face, dtype: int64 



从DataFrame中导出训练集性别标签、年龄标签和情感标签（ndarray类型），共1995个数据；通过np.save与np.load将Series格式的标签转换为矩阵格式

DR_sex.npy为字符串标签，DR_sex_map.npy为二值化后的性别标签(male = 1, female = 0)

DR_age.npy为字符串标签，DR_age_map.npy为数值化后的年龄标签(child = 0, teen = 1, adult = 2, senior = 3)

DR_face.npy为字符串标签，DR_face_map.npy为数值化后的情感标签(smiling = 0, serious = 1, funny = 2)

In [8]:
# String sex labels of the training set; rows 1-4 (by position) are previewed.
DR_sex = face_DR_dataframe['sex']
print(DR_sex.iloc[1:5])
np.save('data/DR_sex.npy', DR_sex)

# Integer encoding: male -> 1, female -> 0.
face_DR_sex_map = pd.Series(face_DR_dataframe['sex'], index=face_DR_num)
DR_sex_map = face_DR_sex_map.map({'male': 1, 'female': 0})
print(DR_sex_map.iloc[1:5])
np.save('data/DR_sex_map.npy', DR_sex_map)

1224    male
1225    male
1226    male
1227    male
Name: sex, dtype: object
1224    1
1225    1
1226    1
1227    1
Name: sex, dtype: int64


In [9]:
# String age labels of the training set; rows 1-4 (by position) are previewed.
DR_age = face_DR_dataframe['age']
print(DR_age.iloc[1:5])
np.save('data/DR_age.npy', DR_age)

# Integer encoding: child -> 0, teen -> 1, adult -> 2, senior -> 3.
face_DR_age_map = pd.Series(face_DR_dataframe['age'], index=face_DR_num)
DR_age_map = face_DR_age_map.map({'child': 0, 'teen': 1, 'adult': 2, 'senior': 3})
print(DR_age_map.iloc[1:5])
np.save('data/DR_age_map.npy', DR_age_map)

1224    child
1225    child
1226    child
1227    child
Name: age, dtype: object
1224    0
1225    0
1226    0
1227    0
Name: age, dtype: int64


In [10]:
# String expression labels of the training set; rows 1-4 (by position) are previewed.
DR_face = face_DR_dataframe['face']
print(DR_face.iloc[1:5])
np.save('data/DR_face.npy', DR_face)

# Integer encoding: smiling -> 0, serious -> 1, funny -> 2.
face_DR_face_map = pd.Series(face_DR_dataframe['face'], index=face_DR_num)
DR_face_map = face_DR_face_map.map({'smiling': 0, 'serious': 1, 'funny': 2})
print(DR_face_map.iloc[1:5])
np.save('data/DR_face_map.npy', DR_face_map)

1224    serious
1225    smiling
1226    smiling
1227    serious
Name: face, dtype: object
1224    1
1225    0
1226    0
1227    1
Name: face, dtype: int64


#### 测试集标签处理

将测试集DS转换为DataFrame格式

In [11]:
# Parse the test label text into image ids and per-image descriptors.
# The patterns are raw strings so that the regex escapes (\d, \s, \(, \w) are
# not interpreted as (invalid) Python string escapes.
face_DS_num = [int(num) for num in re.findall(r"\d+(?=\s\(_sex)", face_DS)]    # ids of records that carry labels
face_DS_wrong = [int(num) for num in re.findall(r"\d+(?=\s\(_miss)", face_DS)]    # ids of records whose labels are missing

face_DS_sex = re.findall(r"(?<=sex\s\s)\w+", face_DS)    # sex labels
face_DS_age = re.findall(r"(?<=age\s\s)\w+", face_DS)    # age labels
face_DS_race = re.findall(r"(?<=race\s)\w+", face_DS)    # race labels
face_DS_face = re.findall(r"(?<=face\s)\w+", face_DS)    # facial-expression labels

print("原始数据标签个数 =", len(face_DS_num)+len(face_DS_wrong),'\n')

# One row per labelled record, indexed by image id. pandas raises if the label
# lists and the id list do not all have the same length.
face_DS_data = {'sex':face_DS_sex, 'age':face_DS_age, 'race':face_DS_race, 'face':face_DS_face}
face_DS_dataframe = pd.DataFrame(data = face_DS_data, index = face_DS_num)
print('测试集标签如下：','\n',face_DS_dataframe,'\n')
print('测试集标签统计如下：','\n',face_DS_dataframe.describe(),'\n')
print('wrong =',face_DS_wrong)
# The messages name the DS (test-set) lists they actually report on; they
# previously said face_DR_*, copied over from the training-set cell.
print("有效标签face_DS_num数量为：", len(face_DS_num))
print("噪声标签face_DS_wrong数量为：", len(face_DS_wrong))
print("噪声样本序号：", face_DS_wrong)

原始数据标签个数 = 2000 

测试集标签如下： 
          sex     age   race     face
3223  female  senior  black  smiling
3224  female   adult  black  smiling
3225    male  senior  white  smiling
3226    male  senior  white  serious
3227    male  senior  white  serious
...      ...     ...    ...      ...
5218  female   adult  white  smiling
5219  female   adult  white  smiling
5220  female   adult  white  serious
5221  female   adult  white  smiling
5222  female   adult  white  smiling

[1996 rows x 4 columns] 

测试集标签统计如下： 
          sex    age   race     face
count   1996   1996   1996     1996
unique     2      4      5        3
top     male  adult  white  serious
freq    1277   1730   1699     1097 

wrong = [4056, 4135, 4136, 5004]
有效标签face_DR_num数量为： 1996
噪声标签face_DR_wrong数量为： 4
噪声样本序号： [4056, 4135, 4136, 5004]


#### 测试集标签统计

In [13]:
# Frequency of every label value in the test set, one Series per label column.
DS_sex_count, DS_age_count, DS_race_count, DS_face_count = (
    face_DS_dataframe[column].value_counts()
    for column in ('sex', 'age', 'race', 'face')
)

# Each table is followed by a blank line in the output.
print(DS_sex_count, '\n')
print(DS_age_count, '\n')
print(DS_race_count, '\n')
print(DS_face_count, '\n')

male      1277
female     719
Name: sex, dtype: int64 

adult     1730
senior     115
teen        83
child       68
Name: age, dtype: int64 

white       1699
black        249
asian         26
hispanic      13
other          9
Name: race, dtype: int64 

serious    1097
smiling     836
funny        63
Name: face, dtype: int64 



导出测试集性别标签、年龄标签和情感标签（ndarray类型），共1996个数据；通过np.save与np.load将Series格式的标签转换为矩阵格式

In [12]:
# String sex labels of the test set; rows 1-4 (by position) are previewed.
DS_sex = face_DS_dataframe['sex']
print(DS_sex.iloc[1:5])
np.save('data/DS_sex.npy', np.array(DS_sex))

# Integer encoding: male -> 1, female -> 0.
face_DS_sex_map = pd.Series(face_DS_dataframe['sex'], index=face_DS_num)
DS_sex_map = face_DS_sex_map.map({'male': 1, 'female': 0})
print(DS_sex_map.iloc[1:5])
np.save('data/DS_sex_map.npy', DS_sex_map)

3224    female
3225      male
3226      male
3227      male
Name: sex, dtype: object
3224    0
3225    1
3226    1
3227    1
Name: sex, dtype: int64


In [13]:
# String age labels of the test set; rows 1-4 (by position) are previewed.
DS_age = face_DS_dataframe['age']
print(DS_age.iloc[1:5])
np.save('data/DS_age.npy', DS_age)

# Integer encoding: child -> 0, teen -> 1, adult -> 2, senior -> 3.
face_DS_age_map = pd.Series(face_DS_dataframe['age'], index=face_DS_num)
DS_age_map = face_DS_age_map.map({'child': 0, 'teen': 1, 'adult': 2, 'senior': 3})
print(DS_age_map.iloc[1:5])
np.save('data/DS_age_map.npy', DS_age_map)

3224     adult
3225    senior
3226    senior
3227    senior
Name: age, dtype: object
3224    2
3225    3
3226    3
3227    3
Name: age, dtype: int64


In [14]:
# String expression labels of the test set; rows 1-4 (by position) are previewed.
DS_face = face_DS_dataframe['face']
print(DS_face.iloc[1:5])
np.save('data/DS_face.npy', DS_face)

# Integer encoding: smiling -> 0, serious -> 1, funny -> 2.
face_DS_face_map = pd.Series(face_DS_dataframe['face'], index=face_DS_num)
DS_face_map = face_DS_face_map.map({'smiling': 0, 'serious': 1, 'funny': 2})
print(DS_face_map.iloc[1:5])
np.save('data/DS_face_map.npy', DS_face_map)

3224    smiling
3225    smiling
3226    serious
3227    serious
Name: face, dtype: object
3224    0
3225    0
3226    1
3227    1
Name: face, dtype: int64


导入特征提取后的训练集trainDistance和测试集testDistance

导入训练集标签和测试集标签

In [15]:
# Feature arrays for the training and test splits, read from disk.
trainDistance = np.load("data/trainDistance.npy")
testDistance = np.load("data/testDistance.npy")

# Integer-encoded sex and age labels, read back from the saved .npy files.
DR_sex_map = np.load("data/DR_sex_map.npy")
DS_sex_map = np.load("data/DS_sex_map.npy")
DR_age_map = np.load("data/DR_age_map.npy")
DS_age_map = np.load("data/DS_age_map.npy")

print("data/DR_sex_map: ", DR_sex_map)
print("data/DS_sex_map: ", DS_sex_map)
print("data/DR_age_map: ", DR_age_map)
print("data/DS_age_map: ", DS_age_map)

data/DR_sex_map:  [1 1 1 ... 0 0 0]
data/DS_sex_map:  [0 0 1 ... 0 0 0]
data/DR_age_map:  [0 0 0 ... 2 3 3]
data/DS_age_map:  [3 2 3 ... 2 2 2]
