IMDB Gender images:

3.	Suppose we want to build a dataset for a computer vision task that involves images labeled by gender.
4.	Your tasks are the following:

    a.	Collect 10k male/female images from:
        https://www.imdb.com

        ■	Make sure to render the whole page using Selenium, and then use BeautifulSoup to scrape the images

        ■	Create a folder for male/female

        ■	Each image will be named after the person in the picture


In [1]:
from bs4 import BeautifulSoup
import requests
import random
import time
from datetime import datetime
import os
from PIL import Image
from io import BytesIO
from lxml import etree
import warnings
warnings.filterwarnings("ignore")

class IMDbScraper:
    """Download portrait images from IMDb name-search pages into per-gender folders.

    For every gender in ``self.gender`` the scraper requests a series of
    IMDb name-search result pages, reads each listed person's name and
    thumbnail URL via XPath, downloads the thumbnail and saves it as
    ``<folder>/<person name>.<jpg|png>``.

    Args:
        pages_per_gender: Number of result pages requested per gender.
        people_per_page: Number of people requested per page (sent as the
            ``count`` query parameter and used to compute ``start``).
        request_timeout: Timeout in seconds passed to every HTTP request.
    """

    # Path separators and other characters that are commonly rejected in file
    # names; they are replaced before a person's name is used as a file name.
    _UNSAFE_FILENAME_CHARS = '\\/:*?"<>|'

    def __init__(self, pages_per_gender=40, people_per_page=250, request_timeout=30):
        self.male_path = "./Pics/Male"
        self.female_path = "./Pics/Female"
        # Container element that holds one child <div> per listed person.
        self.base_xpath_address = '//*[@id="main"]/div/div[3]'
        self.gender = ["male", "female"]
        # Reference point for the elapsed time reported by run().
        self.start_Time = datetime.now()
        self.pages_per_gender = pages_per_gender
        self.people_per_page = people_per_page
        self.request_timeout = request_timeout

    @staticmethod
    def _safe_filename(name):
        """Return *name* stripped and with file-name-unsafe characters replaced by ``_``.

        Falls back to ``"unnamed"`` when nothing is left after stripping.
        """
        unsafe = IMDbScraper._UNSAFE_FILENAME_CHARS
        cleaned = "".join(
            "_" if ch in unsafe or ord(ch) < 32 else ch
            for ch in name.strip()
        )
        return cleaned or "unnamed"

    def create_folders(self):
        """Create ``./Pics`` and the male/female sub-folders, reporting each outcome."""
        targets = (
            ("Pics", "./Pics"),
            ("Male", self.male_path),
            ("Female", self.female_path),
        )
        for label, folder in targets:
            if os.path.exists(folder):
                print(f"{label} Folder Already Exist")
            else:
                # makedirs also creates missing parents, so a customised
                # male_path/female_path outside ./Pics still works.
                os.makedirs(folder, exist_ok=True)
                print(f"{label} Folder Created")

    def _fetch_page_tree(self, page_url):
        """Fetch *page_url* and return it as an lxml tree, or ``None`` on failure."""
        try:
            page = requests.get(page_url, timeout=self.request_timeout)
        except requests.RequestException as exc:
            print(f"Could not fetch {page_url}: {exc}")
            return None

        if not page.ok:
            print(f"Skipping {page_url}: HTTP {page.status_code}")
            return None

        soup = BeautifulSoup(page.text, "html.parser")
        try:
            return etree.HTML(str(soup))
        except etree.LxmlError as exc:
            print(f"Could not parse {page_url}: {exc}")
            return None

    def _save_entry(self, tree, index, folder):
        """Save the image of the *index*-th (1-based) person in *tree* into *folder*.

        Returns:
            ``True`` when an image file was written, ``False`` when the entry
            was missing, incomplete, or its image could not be downloaded,
            decoded or saved.
        """
        entry_xpath = f'{self.base_xpath_address}/div[{index}]'
        name_nodes = tree.xpath(f'{entry_xpath}/div[2]/h3/a')
        image_nodes = tree.xpath(f'{entry_xpath}/div[1]/a/img')
        # A page may list fewer people than requested, or an entry may have no
        # picture; skip it instead of failing on an empty XPath result.
        if not name_nodes or not image_nodes:
            return False

        raw_name = name_nodes[0].text
        image_url = image_nodes[0].get('src')
        if not raw_name or not image_url:
            return False

        extension = "jpg" if image_url[-3:] == 'jpg' else "png"
        file_path = f"{folder}/{self._safe_filename(raw_name)}.{extension}"

        try:
            image_response = requests.get(image_url, timeout=self.request_timeout)
            image_response.raise_for_status()
            image = Image.open(BytesIO(image_response.content))
            image.save(file_path)
        except (requests.RequestException, OSError) as exc:
            # One broken image must not abort the whole scrape.
            print(f"Could not save image for {raw_name.strip()}: {exc}")
            return False
        return True

    def scrape_data(self):
        """
        Scrape IMDb data for male and female genders, download images, and save them in respective folders.
        """
        warnings.filterwarnings("ignore")

        folder_by_gender = {"male": self.male_path, "female": self.female_path}

        for g in self.gender:
            path = folder_by_gender.get(g)
            if path is None:
                print(f"Skipping unknown gender: {g}")
                continue

            # Each page can have a maximum of 250 people, so the maximum is
            # requested per page. With the defaults (40 pages of 250 people)
            # this asks for 10,000 entries for male and 10,000 for female.
            for n in range(1, self.pages_per_gender + 1):
                start = (n - 1) * self.people_per_page + 1
                page_url = (
                    f'https://www.imdb.com/search/name/?gender={g}'
                    f'&count={self.people_per_page}&start={start}&ref_=rlm'
                )
                tree = self._fetch_page_tree(page_url)
                # Random pause between result pages.
                time.sleep(random.randint(3, 7))
                if tree is None:
                    continue

                for i in range(1, self.people_per_page + 1):
                    self._save_entry(tree, i, path)

    def run(self):
        """
        Run the IMDbScraper class by creating folders and scraping data.

        The elapsed time printed at the end is measured from the moment the
        object was constructed (``self.start_Time``).
        """
        self.create_folders()
        self.scrape_data()
        print("Time Length to Scraping Data was : ", datetime.now() - self.start_Time)

# Instantiate the scraper and run it: run() calls create_folders() and then
# scrape_data(), and finally prints the elapsed time.
scraper = IMDbScraper()
scraper.run()

Pics Folder Already Exist
Male Folder Already Exist
Female Folder Already Exist
Time Length to Scraping Data was :  0:00:47.205536
