In [1]:
import requests
import os
import tarfile
import shutil



In [2]:
# Generic function to download a file from target url and write to a designated path
def download_file(url, path, write_type = 'w', verify = False, timeout = 60):
    """Download ``url`` with an HTTP GET and save the response body at ``path``.

    Progress and failures are reported with ``print``; nothing is returned.

    Args:
        url: Address to fetch.
        path: Destination file. The body is first written to ``<path>.part``
            and only moved over ``path`` once it has been written completely,
            so a failed download never leaves a truncated file at ``path``.
        write_type: 'w' writes the decoded text of the response, 'wb' writes
            the raw bytes. Any other value prints an error and returns
            without issuing a request.
        verify: Forwarded to ``requests.get`` as ``verify``.
            NOTE(security): the default of False disables TLS certificate
            verification. It is kept for backward compatibility; pass True
            whenever the host's certificate can be validated.
        timeout: Forwarded to ``requests.get`` as ``timeout`` (seconds), so a
            server that stops responding raises instead of blocking forever.

    Returns:
        None in every case (invalid ``write_type``, non-200 status, success).

    Raises:
        requests.exceptions.RequestException: connection errors and timeouts
            propagate to the caller.
        OSError: the destination cannot be written.
    """
    if write_type not in ('w', 'wb'):
        print('Error. write_type must be w or wb')
        return

    print(f'Retrieving response from {url}')

    # stream=True defers fetching the body, so binary payloads can be written
    # chunk by chunk instead of being held in memory in full. The context
    # manager releases the connection on every exit path.
    with requests.get(url, verify = verify, stream = True, timeout = timeout) as r:
        if r.status_code != 200:
            print(f'Response unsuccessful, response code: {r.status_code}')
            return

        print(f'Response successful, writing to {path}')

        partial_path = f'{path}.part'
        try:
            with open(partial_path, write_type) as f:
                if write_type == 'w':
                    f.write(r.text)
                else:
                    for chunk in r.iter_content(chunk_size = 1024 * 1024):
                        f.write(chunk)
            # Same directory as the target, so the swap is a single rename.
            os.replace(partial_path, path)
        except BaseException:
            # Drop the incomplete file (also on Ctrl-C), then re-raise.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    print(f'Successfully written to {path}')

In [3]:
# Create File Structures
# Resulting layout: <top_path>/<entry of paths>/<entry of partitions>,
# e.g. ./data/raw/train
top_path = './data'
paths = ['raw', 'preprocessed', 'vectorized']
partitions = ['train', 'test']

for path in paths:
    for partition in partitions:
        full_path = os.path.join(top_path, path, partition)
        # exist_ok=True creates missing parents and is a no-op when the
        # directory is already there, so the cell can be re-run safely and
        # there is no gap between an existence check and the creation.
        os.makedirs(full_path, exist_ok = True)

In [4]:
# Download and unpack dataset
lfw_url = 'http://vis-www.cs.umass.edu/lfw/lfw.tgz'
lfw_target_path = os.path.join(top_path, paths[0], 'lfw.tgz')

download_file(lfw_url, lfw_target_path, 'wb')

# Unpack next to the archive. os.path.dirname is used rather than splitting
# on '/', because os.path.join may have inserted a different separator and
# the manual split would then return the wrong directory.
lfw_extract_dir = os.path.dirname(lfw_target_path)

# The context manager closes the archive even if extraction raises.
with tarfile.open(lfw_target_path) as file:
    if hasattr(tarfile, 'data_filter'):
        # NOTE(security): the archive is fetched over plain HTTP, so treat it
        # as untrusted. The 'data' filter rejects members that would escape
        # the destination directory (absolute paths, '..', unsafe links).
        file.extractall(lfw_extract_dir, filter = 'data')
    else:
        # Older Python without extraction filters: unfiltered extraction.
        file.extractall(lfw_extract_dir)

Retrieving response from http://vis-www.cs.umass.edu/lfw/lfw.tgz
Response successful, writing to ./data/raw/lfw.tgz
Successfully written to ./data/raw/lfw.tgz


In [5]:
# Fetch the train/test split instruction files into the raw data directory

raw_dir = os.path.join(top_path, paths[0])

train_instructions_url = 'https://vis-www.cs.umass.edu/lfw/peopleDevTrain.txt'
train_instructions_path = os.path.join(raw_dir, 'peopleDevTrain.txt')

test_instructions_url = 'https://vis-www.cs.umass.edu/lfw/peopleDevTest.txt'
test_instructions_path = os.path.join(raw_dir, 'peopleDevTest.txt')

# Both lists keep the order train, test; later cells rely on that order.
urls = [train_instructions_url, test_instructions_url]
instruction_paths = [train_instructions_path, test_instructions_path]

# Each URL is saved as text at the path in the same list position.
for url, destination in zip(urls, instruction_paths):
    download_file(url, destination, 'w')

Retrieving response from https://vis-www.cs.umass.edu/lfw/peopleDevTrain.txt
Response successful, writing to ./data/raw/peopleDevTrain.txt
Successfully written to ./data/raw/peopleDevTrain.txt
Retrieving response from https://vis-www.cs.umass.edu/lfw/peopleDevTest.txt
Response successful, writing to ./data/raw/peopleDevTest.txt
Successfully written to ./data/raw/peopleDevTest.txt




In [6]:
# Move files outside of unzipped directory into train/test
# Each instruction file is paired by position with the partition it fills.
for instruction, partition in zip(instruction_paths, partitions):
    # Hoisted out of the inner loops: the destination depends only on the
    # partition, not on the person or the image.
    target_dir = os.path.join(top_path, paths[0], partition)

    with open(instruction, 'r') as f:
        # The first line is skipped; the remaining lines are tab-separated
        # "<name>\t<amount>" records. Blank lines are ignored so that e.g. a
        # trailing empty line does not break the unpacking below.
        name_list = [line.strip().split('\t') for line in f.readlines()[1:] if line.strip()]

    for name, amount in name_list:
        image_dir = os.path.join(top_path, paths[0], 'lfw', name)

        # Images are numbered from 1 and zero-padded to four digits.
        for j in range(1, int(amount) + 1):
            image_name = f'{name}_{j:04}.jpg'
            image_path = os.path.join(image_dir, image_name)
            target_path = os.path.join(target_dir, image_name)

            # Already moved by an earlier run of this cell: nothing to do.
            # A file missing from both places still raises from shutil.move.
            if not os.path.exists(image_path) and os.path.exists(target_path):
                continue

            shutil.move(image_path, target_path)

In [7]:
# Clean up what the unpacking step left behind: the (now empty) extracted
# LFW directory tree and the downloaded .tgz archive.
extracted_root = './data/raw/lfw'
archive_file = './data/raw/lfw.tgz'

if os.path.exists(extracted_root):
    # os.rmdir only removes empty directories, so any sub-directory that
    # still holds files makes this raise instead of deleting data.
    for entry in os.listdir(extracted_root):
        os.rmdir(os.path.join(extracted_root, entry))
    os.rmdir(extracted_root)

if os.path.exists(archive_file):
    os.remove(archive_file)