In [3]:
import os
import glob
import sys
import csv
import numpy as np
import random
import cv2
import math
import argparse

In [16]:
def make_dir_if_not_exist(directory):
    """Create ``directory`` (including missing parents) unless it already exists.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    # exist_ok=True removes the race between a separate os.path.exists() check
    # and the os.makedirs() call (another process could create it in between).
    os.makedirs(directory, exist_ok=True)

In [40]:
def parse_UMD(image_dir, anno_dir, train_size, test_size):
    """Read the first ``train_size + test_size`` annotations from the UMD CSV.

    The file ``umdfaces_batch3_ultraface.csv`` inside ``anno_dir`` is opened,
    its first line is skipped as a header, and data rows are collected until
    the requested number has been read or the file ends.

    Args:
        image_dir: Not used by this function; kept so existing callers work.
        anno_dir: Directory containing ``umdfaces_batch3_ultraface.csv``.
        train_size: Number of rows wanted for the training split.
        test_size: Number of rows wanted for the test split.

    Returns:
        A list of ``[img_name, face_x, face_y, face_width, face_height]``
        lists taken from CSV columns 1, 4, 5, 6 and 7. All values are the
        raw strings read from the file. The list is shorter than
        ``train_size + test_size`` if the file has fewer usable rows.
    """
    anno_file = os.path.join(anno_dir, 'umdfaces_batch3_ultraface.csv')
    max_rows = train_size + test_size
    dataset = []

    # newline='' is the mode the csv module asks for when reading files.
    with open(anno_file, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip the header; tolerate a completely empty file

        for row in reader:
            # Stop before appending, so a request for zero rows returns [].
            if len(dataset) >= max_rows:
                break
            # csv.reader yields [] for blank lines; skip rows lacking column 7.
            if len(row) < 8:
                continue

            # Progress indicator every 500 collected rows.
            if len(dataset) % 500 == 0:
                print(len(dataset))

            dataset.append([row[1], row[4], row[5], row[6], row[7]])

    return dataset

In [41]:
# Initially the labels for each image were the coordinates of the top-left corner of the bounding box
# plus the width and height of that box (function generate_images_and_csv). Later it was decided to replace
# the width and height with the coordinates of the bottom-right corner of the bounding box (function new_csv).
# This makes the labels more homogeneous (here "homogeneous" means that
# every component of a label is a coordinate of some pixel).
 
def generate_images_and_csv(image_dir, save_images_dir, save_csv, dataset, start_idx, data_size,
                            target_size=256):
    """Copy a slice of the dataset's images and write their rescaled boxes to a CSV.

    For every entry in ``dataset[start_idx:start_idx + data_size]`` the image
    is read from ``image_dir/<folder>/<file>``, written to
    ``save_images_dir/<file>``, and a row ``[file, x, y, w, h]`` is recorded.
    ``x`` and ``w`` are scaled by ``target_size / image_width``; ``y`` and
    ``h`` by ``target_size / image_height``. Entries whose image is missing,
    cannot be decoded, or cannot be written are skipped.

    NOTE(review): the image itself is saved at its original size; only the
    box coordinates are rescaled. Presumably resizing happens downstream.

    Args:
        image_dir: Root directory holding one sub-folder of images per entry.
        save_images_dir: Existing directory the images are written into.
        save_csv: Path of the CSV file to (over)write.
        dataset: Sequence of ``[name, x, y, w, h]`` entries where ``name`` is
            ``'<folder>/<file>'`` and the other values are accepted by
            ``float()``.
        start_idx: Index of the first entry to process.
        data_size: Maximum number of entries to process.
        target_size: Side length of the coordinate space the boxes are
            rescaled to. Defaults to 256.
    """
    csv_data = [['id', 'x', 'y', 'w', 'h']]

    # Slicing instead of indexing tolerates a dataset shorter than requested.
    for per_data_info in dataset[start_idx:start_idx + data_size]:
        full_name = per_data_info[0]

        # Split '<folder>/<file>' on the first slash. The folder comes from
        # the full name, not from the file part.
        folder_name, sep, img_name = full_name.partition('/')
        if not sep:
            folder_name, img_name = '', full_name

        # Direct lookup replaces a glob scan with suffix matching, which
        # could match the wrong file and rescale the box more than once.
        src_path = os.path.join(image_dir, folder_name, img_name)
        if not os.path.isfile(src_path):
            continue

        img = cv2.imread(src_path)
        if img is None:  # cv2.imread returns None when the file cannot be decoded
            continue
        img_height, img_width = img.shape[:2]

        # Truncate to whole pixels first, then rescale.
        x = int(int(float(per_data_info[1])) * target_size / img_width)
        y = int(int(float(per_data_info[2])) * target_size / img_height)
        w = int(int(float(per_data_info[3])) * target_size / img_width)
        h = int(int(float(per_data_info[4])) * target_size / img_height)

        # Only record a label when the image was actually written.
        if not cv2.imwrite(os.path.join(save_images_dir, img_name), img):
            continue

        csv_data.append([img_name, x, y, w, h])

    # The with-statement closes the file; no explicit close() is needed.
    with open(save_csv, 'w', newline='') as csvFile:
        csv.writer(csvFile).writerows(csv_data)


def create_datasets_UMD(image_dir, anno_dir, save_dir,
                        train_size=20_000, test_size=2_000):
    """Build the train and test splits (images plus label CSVs) under ``save_dir``.

    The first ``train_size + test_size`` annotations are read with
    ``parse_UMD``. The first ``train_size`` entries go to ``save_dir/train``
    with labels in ``save_dir/trainLabels.csv``; the next ``test_size``
    entries go to ``save_dir/test`` with labels in ``save_dir/testLabels.csv``.

    NOTE(review): the original signature carried the comment
    ``#16_000 + 4_000`` - presumably an earlier split size; confirm.

    Args:
        image_dir: Root directory of the source images.
        anno_dir: Directory containing the annotation CSV.
        save_dir: Output root; created if missing.
        train_size: Number of entries for the training split.
        test_size: Number of entries for the test split.
    """
    dataset = parse_UMD(image_dir, anno_dir, train_size, test_size)

    make_dir_if_not_exist(save_dir)
    save_folder_train = os.path.join(save_dir, 'train')
    make_dir_if_not_exist(save_folder_train)
    save_folder_test = os.path.join(save_dir, 'test')
    make_dir_if_not_exist(save_folder_test)

    # Label files are placed under save_dir rather than at a hard-coded
    # absolute path, so the function honours its save_dir argument.
    save_train_csv = os.path.join(save_dir, 'trainLabels.csv')
    save_test_csv = os.path.join(save_dir, 'testLabels.csv')

    generate_images_and_csv(image_dir, save_folder_train, save_train_csv, dataset, 0, train_size)
    generate_images_and_csv(image_dir, save_folder_test, save_test_csv, dataset, train_size, test_size)

In [42]:
if __name__ == '__main__':
    # Annotation CSV location, source images, and output root for the splits.
    anno_dir = r'D:\jupyter_projects\Face Detection\data\umdfaces_batch3\data_UMD\Annotation'
    image_dir = r'D:\jupyter_projects\Face Detection\data\umdfaces_batch3\data_UMD\original_pics'
    save_dir = r'D:\jupyter_projects\Face Detection\data\umdfaces_batch3\data_UMD\train_test'

    # Split sizes are left at create_datasets_UMD's defaults.
    create_datasets_UMD(image_dir=image_dir, anno_dir=anno_dir, save_dir=save_dir)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500


In [9]:
def new_csv(csv_dir, new_csv_dir):
    """Convert an ``id,x,y,w,h`` label CSV into an ``id,x1,y1,x2,y2`` label CSV.

    The first line of the input is treated as a header and dropped. Each
    remaining row is rewritten with the width and height replaced by the
    bottom-right corner ``(x + w, y + h)``.

    Args:
        csv_dir: Path of the input CSV file (despite the name, a file path).
        new_csv_dir: Path of the output CSV file; overwritten if it exists.

    Raises:
        IndexError: If a non-empty row has fewer than five columns.
        ValueError: If a coordinate column is not an integer literal.
    """
    # with-blocks guarantee both files are closed even if a row fails to parse.
    with open(csv_dir, newline='') as src, open(new_csv_dir, 'w', newline='') as dst:
        reader = csv.reader(src, delimiter=',')
        writer = csv.writer(dst)

        next(reader, None)  # drop the input header; an empty input is allowed
        writer.writerow(['id', 'x1', 'y1', 'x2', 'y2'])

        for row in reader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            img_name = row[0]
            x = int(row[1])
            y = int(row[2])
            w = int(row[3])
            h = int(row[4])
            writer.writerow([img_name, x, y, x + w, y + h])
            
# Rewrite the test labels from (x, y, w, h) to (x1, y1, x2, y2).
new_csv(
    csv_dir=r'D:\jupyter_projects\Face Detection\data\umdfaces_batch3\data_UMD\train_test\testLabels.csv',
    new_csv_dir=r'D:\jupyter_projects\Face Detection\data\umdfaces_batch3\data_UMD\train_test\test.csv',
)

In [10]:
# Rewrite the train labels from (x, y, w, h) to (x1, y1, x2, y2).
new_csv(
    csv_dir=r'D:\jupyter_projects\Face Detection\data\umdfaces_batch3\data_UMD\train_test\trainLabels.csv',
    new_csv_dir=r'D:\jupyter_projects\Face Detection\data\umdfaces_batch3\data_UMD\train_test\train.csv',
)