In [1]:
import os
import pandas as pd
import zipfile
import requests
from tqdm import tqdm
import numpy as np

In [2]:
# Root data directory (relative to this notebook) and the CelebA sub-directory
# that every later cell reads from / writes to.
dirpath = '../data'
celebA_dir = os.path.join(dirpath, 'CelebA')
# exist_ok=True keeps this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(celebA_dir, exist_ok=True)

# Dataset
Only one dataset is currently supported: CelebA. It can be downloaded as a .zip archive from Kaggle: https://www.kaggle.com/jessicali9530/celeba-dataset.

The downloaded archive should be saved as 'data/CelebA/img_align_celeba.zip' (this notebook reads it from '../data/CelebA/img_align_celeba.zip').

## Download CelebA dataset's labels

## Support functions

In [3]:
def download_file_from_google_drive(id, destination):
    """Download a file from Google Drive and write it to ``destination``.

    A first request is made for the file; if the response carries a
    confirmation cookie (see ``get_confirm_token``), the request is repeated
    with that token before the body is streamed to disk.

    Args:
        id: Google Drive file id.
        destination: path of the local file to write.

    Raises:
        requests.HTTPError: if the final response has an error status, so an
            HTTP error page is never saved as if it were the requested file.
    """
    URL = "https://docs.google.com/uc?export=download"

    # The context manager releases the pooled connections once the download
    # is finished (or fails); the body must be consumed inside the block.
    with requests.Session() as session:
        response = session.get(URL, params={'id': id}, stream=True)
        token = get_confirm_token(response)

        if token:
            params = {'id': id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)

        response.raise_for_status()
        save_response_content(response, destination)

def get_confirm_token(response):
    """Return the value of the first cookie whose name starts with
    'download_warning', or None when the response has no such cookie."""
    matching_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(matching_values, None)

def save_response_content(response, destination, chunk_size=32 * 1024):
    """Stream the body of ``response`` into the file ``destination``.

    Args:
        response: a streamed response exposing ``headers`` and
            ``iter_content(chunk_size)``.
        destination: path of the file to write (opened in binary mode).
        chunk_size: number of bytes requested per chunk.
    """
    # A missing (or zero) content-length becomes None so tqdm shows a plain
    # byte counter instead of a bar with a bogus total of 0.
    total_size = int(response.headers.get('content-length', 0)) or None
    with open(destination, "wb") as f, tqdm(total=total_size, unit='B',
                                            unit_scale=True,
                                            desc=destination) as progress:
        for chunk in response.iter_content(chunk_size):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                # Advance by bytes written: the total is in bytes, so ticking
                # once per chunk would under-report progress.
                progress.update(len(chunk))

## Download dataset's labels

In [4]:
# Make sure the CelebA directory exists before the label file is saved into it.
celebA_dir = os.path.join(dirpath, 'CelebA')
# exist_ok=True keeps this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(celebA_dir, exist_ok=True)

In [5]:
# Attribute-label file name and the Google Drive id it is fetched from.
txt_name = "list_attr_celeba.txt"
txt_drive_id = "0B7EVK8r0v71pblRyaVFSWGxPY0U"
txt_save_path = os.path.join(celebA_dir, txt_name)

# Only download when the label file is not already on disk.
if not os.path.exists(txt_save_path):
    download_file_from_google_drive(txt_drive_id, txt_save_path)
else:
    print(f'[*] {txt_save_path} already exists')

[*] ../data/CelebA/list_attr_celeba.txt already exists


## Unzip the dataset archive

In [33]:
# Extract the image archive and rename the extracted folder to 'images'.
# The work is skipped when 'images' already exists, so the cell can be re-run:
# without the guard the archive would be re-extracted every time and
# os.rename() would fail because the target directory is already there.
images_dir = os.path.join(celebA_dir, 'images')
if os.path.isdir(images_dir):
    print(f'[*] {images_dir} already exists')
else:
    with zipfile.ZipFile(os.path.join(celebA_dir, 'img_align_celeba.zip')) as zf:
        zf.extractall(celebA_dir)

    os.rename(os.path.join(celebA_dir, 'img_align_celeba'), images_dir)

## Extract labels from .txt file

In [6]:
def read_data_file(file_path, image_dir=''):
    """Parse an attribute file into attribute names and per-image values.

    The file is read as: a first line holding an integer, a second line of
    whitespace-separated attribute names, then one line per image of the form
    ``<image name> <value> <value> ...``. Blank lines are skipped.

    Args:
        file_path: path of the attribute text file.
        image_dir: optional prefix joined in front of every image name.

    Returns:
        A tuple ``(attr_names, attr_list)`` where ``attr_names`` is the list
        of attribute names and ``attr_list`` maps each image path to its list
        of float values.

    Raises:
        ValueError: if the first line is not an integer, or a row's values
            cannot be converted to float even after the space-in-name fallback.
    """
    attr_list = {}

    # The context manager closes the file even if parsing raises.
    with open(file_path, 'r') as file:
        # First line: number of images. It is not needed afterwards, but
        # converting it still rejects a file with a malformed header.
        int(file.readline())

        attr_names = file.readline().split()  # attribute names

        for line in file:
            row = line.split()
            if not row:
                # Blank line (e.g. trailing newline at end of file).
                continue

            img_name = os.path.join(image_dir, row.pop(0))
            try:
                row = [float(val) for val in row]
            except ValueError:
                # The image name contained a space, so its second part landed
                # in the value columns: re-attach it and convert the rest.
                print(line)
                img_name = img_name + ' ' + row[0]
                row.pop(0)
                row = [float(val) for val in row]

            attr_list[img_name] = row

    return attr_names, attr_list

# Parse the label file: attribute names plus a dict of image name -> values.
categories, file_names_dict = read_data_file(txt_save_path)
# Keep the attribute names as a flat NumPy array for the lookups below.
categories = np.ravel(np.asarray(categories))
print(categories)

['5_o_Clock_Shadow' 'Arched_Eyebrows' 'Attractive' 'Bags_Under_Eyes'
 'Bald' 'Bangs' 'Big_Lips' 'Big_Nose' 'Black_Hair' 'Blond_Hair' 'Blurry'
 'Brown_Hair' 'Bushy_Eyebrows' 'Chubby' 'Double_Chin' 'Eyeglasses'
 'Goatee' 'Gray_Hair' 'Heavy_Makeup' 'High_Cheekbones' 'Male'
 'Mouth_Slightly_Open' 'Mustache' 'Narrow_Eyes' 'No_Beard' 'Oval_Face'
 'Pale_Skin' 'Pointy_Nose' 'Receding_Hairline' 'Rosy_Cheeks' 'Sideburns'
 'Smiling' 'Straight_Hair' 'Wavy_Hair' 'Wearing_Earrings' 'Wearing_Hat'
 'Wearing_Lipstick' 'Wearing_Necklace' 'Wearing_Necktie' 'Young']


In [7]:
# Quick sanity check: how many images were parsed, and the first few names.
print("Number of images: ", len(file_names_dict))
print("Few image names:", list(file_names_dict)[:5])

Number of images:  202599
Few image names: ['000001.jpg', '000002.jpg', '000003.jpg', '000004.jpg', '000005.jpg']


## Create Binary-Classification Data file for Young attribute

In [8]:
#   Build a dataframe from the parsed dictionary: one row per image,
#   one positional column per attribute value.
df = pd.DataFrame(file_names_dict).transpose()
#   Add the image name as a 'Path' column and map the -1.0 labels to 0.0.
df = df.assign(Path=df.index).replace(to_replace=-1.0, value=0.0)
print(df.shape)
df.head(2)

(202599, 41)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,Path
000001.jpg,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,000001.jpg
000002.jpg,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,000002.jpg


In [9]:
#   Target attributes for binary classification
attributes = ['Young']

#   Position of each target attribute within the parsed attribute names
index_main = [
    np.where(np.asarray(categories) == name)[0][0]
    for name in attributes
]

#   Give the matching positional columns their attribute names
df = df.rename(columns=dict(zip(index_main, attributes)))

print(df.head())

              0    1    2    3    4    5    6    7    8    9  ...   31   32  \
000001.jpg  0.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  1.0  1.0   
000002.jpg  0.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0  0.0  ...  1.0  0.0   
000003.jpg  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  ...  0.0  0.0   
000004.jpg  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  1.0   
000005.jpg  0.0  1.0  1.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  ...  0.0  0.0   

             33   34   35   36   37   38  Young        Path  
000001.jpg  0.0  1.0  0.0  1.0  0.0  0.0    1.0  000001.jpg  
000002.jpg  0.0  0.0  0.0  0.0  0.0  0.0    1.0  000002.jpg  
000003.jpg  1.0  0.0  0.0  0.0  0.0  0.0    1.0  000003.jpg  
000004.jpg  0.0  1.0  0.0  1.0  1.0  0.0    1.0  000004.jpg  
000005.jpg  0.0  0.0  0.0  1.0  0.0  0.0    1.0  000005.jpg  

[5 rows x 41 columns]


## Write the label file for target attribute binary classification

In [47]:
# Name of the output label file, derived from the dataset name.
dataset_name = 'Young'
filename = dataset_name + '_binary_classification.txt'

# Keep only the image path and the target attribute column(s), then save as CSV.
columns = ['Path', *attributes]
df = df.loc[:, columns]
df.to_csv(os.path.join(celebA_dir, filename), index=False)

# Load processed file

In [49]:
# Read the label file back from disk to check what was written.
processed_path = os.path.join(celebA_dir, filename)
df = pd.read_csv(processed_path)
print(df.shape)
df.head()

(202599, 2)


Unnamed: 0,Path,Young
0,000001.jpg,1.0
1,000002.jpg,1.0
2,000003.jpg,1.0
3,000004.jpg,1.0
4,000005.jpg,1.0
