In [1]:
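# Handle mislabeled faces

# 1. Clean the data in each folder: for one person's folder, find the images that clearly do not match the other images in it.
# 2. Take one image A as the target and the other images B, C, D as references, then compute the mean Euclidean distance for that group: d_A = (d(AB) + d(AC) + d(AD)) / 3
# 3. Next take B as the target with A, C, D as references and compute the mean Euclidean distance d_B, and so on for every image.
# 4. If d_A is greater than the threshold, delete A.
# 5. Process the other folders in the same way.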

In [2]:
# 计算欧式距离

In [3]:
# 导入包
import cv2
import numpy as np
import matplotlib.pyplot as plt
import dlib
# %matplotlib inline
# Set matplotlib's default figure resolution to 200 DPI for this notebook
plt.rcParams['figure.dpi'] = 200

In [4]:
# Weight files for the two pretrained dlib models used below
SHAPE_PREDICTOR_WEIGHTS = './weights/shape_predictor_68_face_landmarks.dat'
FACE_RECOGNITION_WEIGHTS = './weights/dlib_face_recognition_resnet_model_v1.dat'

# Facial landmark (key point) predictor
shape_detector = dlib.shape_predictor(SHAPE_PREDICTOR_WEIGHTS)
# ResNet model that turns an image plus landmarks into a face descriptor
face_descriptor_extractor = dlib.face_recognition_model_v1(FACE_RECOGNITION_WEIGHTS)

In [5]:
# 提取单张图片的特征描述符
def getFaceFeat(fileName):
    """Compute the dlib face descriptor of a single, already-cropped face image.

    The whole image is treated as the face region, so no face detection is run.

    Args:
        fileName: Path of the image file to read.

    Returns:
        A float64 numpy array of shape (1, D) holding the descriptor, or
        None when the file is empty or cannot be decoded as an image.

    Raises:
        OSError: Propagated from np.fromfile when the file cannot be opened.
    """
    # Read the raw bytes with numpy and decode in memory.
    # NOTE(review): presumably done instead of cv2.imread so that non-ASCII
    # paths work on Windows - confirm.
    raw = np.fromfile(fileName, dtype=np.uint8)
    # An empty buffer makes cv2.imdecode raise; treat it like an unreadable image
    if raw.size == 0:
        return None

    img = cv2.imdecode(raw, -1)
    if img is None:
        return None

    # Convert to 3-channel RGB. With flag -1 the decoder keeps the file's own
    # channel layout, so grayscale (2-D) and 4-channel images need their own
    # conversion code; COLOR_BGR2RGB alone fails on grayscale input.
    if img.ndim == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    elif img.shape[2] == 4:
        img = cv2.cvtColor(img, cv2.COLOR_BGRA2RGB)
    else:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # The cropped image is the face itself: use the full frame as the face box
    h, w = img.shape[:2]
    face = dlib.rectangle(0, 0, w, h)

    # Landmarks first, then the descriptor computed from image + landmarks
    points = shape_detector(img, face)
    descriptor = face_descriptor_extractor.compute_face_descriptor(img, points)

    # Return as a single-row matrix so descriptors can be stacked along axis 0
    return np.array(descriptor, dtype=np.float64).reshape(1, -1)

In [7]:
# Smoke-test the extractor on a single image
feat_test = getFaceFeat('./chinese_faces_cleaned/阿宝/阿宝_1.jpg')

In [8]:
# Display the shape of the test descriptor (one row per image)
feat_test.shape

(1, 128)

In [9]:
import glob, tqdm
import shutil
import os

In [10]:
# Collect every entry directly under the dataset root; each entry is expected
# to be one person's folder
person_list = glob.glob('./chinese_faces_cleaned/*')

In [11]:
# Peek at one entry to see the path format glob returned
person_list[8]

'./chinese_faces_cleaned\\安雅萍'

In [12]:
# Number of entries found under the dataset root
len(person_list)

40

In [13]:
# Walk every person's folder and move out the images whose face descriptor is,
# on average, too far from the other images in the same folder.

# Mean Euclidean distance above which an image is treated as mislabeled
DIST_THRESHOLD = 0.6

for person in tqdm.tqdm(person_list):

    # Descriptors and their file names, kept in lockstep so index i in one
    # list refers to the same image in the other
    feats = []
    record_file = []
    for img_file in glob.glob(person + '/*.jpg'):
        feat = getFaceFeat(img_file)
        # Skip images that could not be decoded
        if feat is not None:
            feats.append(feat)
            record_file.append(img_file)

    # The score is an average over the *other* images, so at least two
    # usable images are required
    if len(feats) < 2:
        continue

    # Stack once instead of re-concatenating inside the loop
    feature_list = np.concatenate(feats, axis=0)
    n_images = len(feature_list)

    # Folder name of this person. basename/normpath handle both '\\' (what
    # glob returns on Windows) and '/' separators.
    person_class = os.path.basename(os.path.normpath(person))
    save_dir = './chinese_faces_mislabeled/' + person_class

    # Compare each descriptor against all descriptors of the same person
    for i in range(n_images):
        dist_list = np.linalg.norm(feature_list[i] - feature_list, axis=1)
        # dist_list contains the zero distance of image i to itself; it adds
        # nothing to the sum, so divide by the number of other images to get
        # the mean distance to the rest of the folder
        dist_average = dist_list.sum() / (n_images - 1)

        # Above the threshold: the image does not look like the others
        if dist_average > DIST_THRESHOLD:
            remove_file = record_file[i]

            # Keep a copy under chinese_faces_mislabeled/<person>/ first,
            # then delete the original from the cleaned dataset
            os.makedirs(save_dir, exist_ok=True)
            shutil.copy(remove_file, save_dir)
            os.remove(remove_file)

            print('删除'+remove_file)
    
    

 12%|█▎        | 5/40 [00:09<01:08,  1.97s/it]

删除./chinese_faces_cleaned\安悦溪\安悦溪_0.jpg


 80%|████████  | 32/40 [00:43<00:10,  1.35s/it]

删除./chinese_faces_cleaned\阿宝\阿宝_0.jpg


100%|██████████| 40/40 [00:54<00:00,  1.36s/it]
