In [2]:
import os
import math

import requests
from bs4 import BeautifulSoup as bs

In [3]:
# Fetch the LFW index page and collect every <img> tag whose alt text is
# exactly 'person image'. A timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() fails loudly instead of letting an
# HTTP error page be parsed into an empty result.
r = requests.get('http://vis-www.cs.umass.edu/lfw/number_11.html', timeout=30)
r.raise_for_status()
soup = bs(r.text, 'html.parser')
soup_images = soup.find_all('img', attrs={'alt': 'person image'})

In [4]:
# The person's name is the second-to-last path component of each image's
# `src` attribute (the directory that holds the image file).
persons = [img_tag['src'].split('/')[-2] for img_tag in soup_images]

In [5]:
def get_images(url, timeout=30):
    """Return absolute URLs of the original images listed on a person page.

    The page is fetched and every ``<img>`` whose ``alt`` attribute starts
    with ``'Original image'`` is collected.

    Args:
        url: Address of the person page to scrape.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error, so a stalled connection cannot hang the notebook.

    Returns:
        A list of absolute image URLs, in page order. The list is empty when
        the page has no matching images or the server answers with an error
        status.
    """
    # Function-scope import so this cell does not depend on the import cell.
    from urllib.parse import urljoin

    r = requests.get(url, timeout=timeout)
    if not r.ok:
        # An error page carries no image listing; report "nothing found".
        return []
    soup = bs(r.text, 'html.parser')
    images = soup.find_all('img', attrs={'alt': lambda x: x and x.startswith('Original image')})
    # Resolve each (possibly relative, e.g. '../images/...') src against the
    # page URL instead of slicing characters off and prepending a fixed root.
    image_urls = [urljoin(url, i['src']) for i in images]
    return image_urls

In [6]:
def download_image(url, path, timeout=30):
    """Stream the resource at ``url`` into the file at ``path``.

    Args:
        url: Address of the file to download.
        path: Destination file path; opened in binary write mode, so an
            existing file is overwritten.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error.

    Returns:
        True when the server answered 200 and the body was written, False
        otherwise (no file is created in that case).
    """
    # The context manager closes the streamed response on every path; with
    # stream=True the connection is otherwise held until the body is consumed.
    with requests.get(url, stream=True, timeout=timeout) as r:
        if r.status_code != 200:
            return False
        with open(path, 'wb') as f:
            # Write in 1 KiB chunks so the whole image never sits in memory.
            for chunk in r.iter_content(1024):
                f.write(chunk)
    return True
        
def save_images(person_name, images, folder='downloads'):
    """Download every image URL into ``<folder>/<person_name>/``.

    Args:
        person_name: Name of the sub-directory that receives the files.
        images: Sequence of image URLs; the text after the last ``/`` of each
            URL is used as the file name.
        folder: Root directory under which the person's directory is created.

    Nothing is created when ``images`` is empty.
    """
    if not images:
        return
    # os.path.join picks the platform's separator; a hard-coded backslash only
    # yields nested directories on Windows.
    image_folder = os.path.join(folder, person_name)
    # The destination is the same for every image, so create it once up front.
    os.makedirs(image_folder, exist_ok=True)
    for image_url in images:
        image_name = image_url.split('/')[-1]
        image_path = os.path.join(image_folder, image_name)
        download_image(image_url, image_path)

In [7]:
def split_images(images, train_fraction=0.8):
    """Split a sequence into a leading training part and a trailing test part.

    The split is positional: the first ``floor(train_fraction * len(images))``
    items become the training set and the remainder the test set. Nothing is
    shuffled, so the result depends on the order of ``images``.

    Args:
        images: Sliceable sequence (e.g. a list of image URLs).
        train_fraction: Share of items assigned to training, between 0 and 1
            inclusive. Defaults to 0.8.

    Returns:
        A ``(training_images, testing_images)`` tuple of slices of ``images``.

    Raises:
        ValueError: If ``train_fraction`` is outside the range [0, 1].
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError(f"train_fraction must be between 0 and 1, got {train_fraction!r}")
    split_val = math.floor(train_fraction * len(images))
    return images[:split_val], images[split_val:]

In [12]:
# URL template for a person's page; the placeholder takes a name from `persons`.
base_link = "http://vis-www.cs.umass.edu/lfw/person/{}.html"
for person in persons:
    person_link = base_link.format(person)
    images = get_images(person_link)
    image_count = len(images)
    # Only people with more than 30 and fewer than 100 images are kept.
    if image_count <= 30 or image_count >= 100:
        continue
    print(f"Working on {person}")
    print(f"Images found: {image_count}")
    train, test = split_images(images)
    # NOTE(review): every image is downloaded twice - once into the default
    # folder and once more into its training/testing folder.
    save_images(person, images)
    save_images(person, train, folder='training')
    save_images(person, test, folder='testing')
print("Finished downloading images")

Working on David_Beckham
Images found: 31
Working on John_Negroponte
Images found: 31
Working on Kofi_Annan
Images found: 32
Working on Roh_Moo-hyun
Images found: 32
Working on Vicente_Fox
Images found: 32
Working on Megawati_Sukarnoputri
Images found: 33
Working on Silvio_Berlusconi
Images found: 33
Working on Tom_Ridge
Images found: 33
Working on Alvaro_Uribe
Images found: 35
Working on Andre_Agassi
Images found: 36
Working on Nestor_Kirchner
Images found: 37
Working on Alejandro_Toledo
Images found: 39
Working on Hans_Blix
Images found: 39
Working on Laura_Bush
Images found: 41
Working on Lleyton_Hewitt
Images found: 41
Working on Arnold_Schwarzenegger
Images found: 42
Working on Jennifer_Capriati
Images found: 42
Working on Gloria_Macapagal_Arroyo
Images found: 44
Working on Luiz_Inacio_Lula_da_Silva
Images found: 48
Working on Vladimir_Putin
Images found: 49
Working on Jacques_Chirac
Images found: 52
Working on Serena_Williams
Images found: 52
Working on John_Ashcroft
Images found