In [52]:
import math
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as bs

In [16]:
# Fetch the LFW index page. A timeout keeps the cell from hanging forever
# if the server never answers.
r = requests.get('http://vis-www.cs.umass.edu/lfw/number_11.html', timeout=30)
# Fail loudly on an HTTP error: parsing an error page would silently yield
# an empty `soup_images` and make every later cell a no-op.
r.raise_for_status()
soup = bs(r.text, 'html.parser')
# Keep only the <img> tags whose alt text is exactly 'person image'.
soup_images = soup.find_all('img', attrs={'alt': 'person image'})

In [17]:
# The person's name is the second-to-last path segment of each thumbnail's
# `src` (i.e. the directory that contains the image file).
persons = [thumbnail['src'].split('/')[-2] for thumbnail in soup_images]

In [31]:
def get_images(url, timeout=30):
    """Return the absolute URLs of the original images listed on a person page.

    Parameters
    ----------
    url : str
        Address of the person page to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``). Defaults to 30.

    Returns
    -------
    list of str
        One absolute URL per ``<img>`` tag whose ``alt`` text starts with
        ``'Original image'``. Empty when the page could not be fetched
        (HTTP error status) or contains no such tags.
    """
    r = requests.get(url, timeout=timeout)
    if not r.ok:
        # An error page has no images to offer; report "nothing found" so the
        # caller can skip this person instead of parsing the error body.
        return []
    soup = bs(r.text, 'html.parser')
    # `alt` may be missing (None), hence the `x and ...` guard.
    images = soup.find_all('img', attrs={'alt': lambda x: x and x.startswith('Original image')})
    # Resolve each (possibly relative, e.g. '../images/...') src against the
    # page URL rather than slicing the string and gluing on a fixed prefix.
    # Tags without a src attribute are skipped.
    image_urls = [urljoin(url, i['src']) for i in images if i.get('src')]
    return image_urls

In [70]:
def download_image(url, path, timeout=30):
    """Stream the resource at ``url`` into the file ``path``.

    Nothing is written unless the server answers with HTTP 200; any other
    status is skipped silently.

    Parameters
    ----------
    url : str
        Address of the image to download.
    path : str
        Destination file; it is opened in binary write mode, so an existing
        file is overwritten. The parent directory must already exist.
    timeout : float, optional
        Seconds to wait for the server (passed to ``requests.get``).
        Defaults to 30.
    """
    # With stream=True the connection stays open until the body is consumed
    # or the response is closed. The context manager guarantees the close,
    # including on the non-200 path where the body is never read.
    with requests.get(url, stream=True, timeout=timeout) as r:
        if r.status_code == 200:
            with open(path, 'wb') as f:
                for chunk in r.iter_content(1024):
                    f.write(chunk)
        
def save_images(person_name, images, folder='downloads'):
    """Download every image URL into ``<folder>/<person_name>/``.

    Parameters
    ----------
    person_name : str
        Name of the sub-folder that receives the images.
    images : list of str
        Image URLs; the text after the last ``/`` of each URL is used as the
        file name.
    folder : str, optional
        Root output folder. Defaults to ``'downloads'``.
    """
    if not images:
        # Nothing to save: do not leave an empty per-person folder behind.
        return
    # os.path.join uses the platform's separator. A hard-coded backslash only
    # forms a real directory path on Windows; elsewhere it becomes part of
    # the file name.
    image_folder = os.path.join(folder, person_name)
    # exist_ok avoids the check-then-create race, and the folder is the same
    # for every image, so it is created once up front.
    os.makedirs(image_folder, exist_ok=True)
    for image_url in images:
        image_name = image_url.split('/')[-1]
        download_image(image_url, os.path.join(image_folder, image_name))

In [71]:
def split_images(images, train_fraction=.8):
    """Split a sequence into a leading training part and a trailing testing part.

    Parameters
    ----------
    images : sequence
        Items to split; order is preserved and no shuffling is done.
    train_fraction : float, optional
        Share of the items assigned to the training part, between 0 and 1
        inclusive. Defaults to 0.8.

    Returns
    -------
    tuple
        ``(training_images, testing_images)``. The training part holds the
        first ``floor(train_fraction * len(images))`` items and the testing
        part holds the rest, so very short inputs can yield an empty
        training part.

    Raises
    ------
    ValueError
        If ``train_fraction`` lies outside ``[0, 1]``.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError(f"train_fraction must be between 0 and 1, got {train_fraction}")
    split_val = math.floor(train_fraction * len(images))
    return images[:split_val], images[split_val:]

In [None]:
# URL template of a person page; the placeholder takes the person's name.
base_link = "http://vis-www.cs.umass.edu/lfw/person/{}.html"

for person in persons:
    print(f"Working on {person}")
    person_link = base_link.format(person)
    images = get_images(person_link)
    print(f"Images found: {len(images)}")
    if images:
        # Only people with at least one image are split and saved.
        train, test = split_images(images)
        # Full set goes to the default folder, then each subset to its own.
        # NOTE: every URL is therefore downloaded twice (once here, once as
        # part of the training or testing subset).
        save_images(person, images)
        save_images(person, train, folder='training')
        save_images(person, test, folder='testing')

print("Finished downloading images")

Working on Ann_Veneman
Images found: 11
Working on Catherine_Zeta-Jones
Images found: 11
Working on Condoleezza_Rice
Images found: 11
Working on James_Kelly
Images found: 11
Working on Jiri_Novak
Images found: 11
Working on John_Allen_Muhammad
Images found: 11
Working on John_Paul_II
Images found: 11
Working on Kim_Ryong-sung
Images found: 11
Working on Mark_Philippoussis
Images found: 11
Working on Mike_Weir
Images found: 11
Working on Nicanor_Duarte_Frutos
Images found: 11
Working on Paul_Burrell
Images found: 11
Working on Richard_Gephardt
Images found: 11
Working on Sergey_Lavrov
Images found: 11
Working on Sergio_Vieira_De_Mello
Images found: 11
Working on Tang_Jiaxuan
Images found: 11
Working on Adrien_Brody
Images found: 12
