In [1]:
import re
import time
import pandas as pd
from tqdm import tqdm
from random import random

import os
import requests
from bs4 import BeautifulSoup
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

In [2]:
# Scrape size: listing pages are requested in steps of `films_per_page`
# for offsets below `films_per_category` (i.e. 350 / 50 = 7 pages per genre).
films_per_page = 50
films_per_category = 350

# Genre slugs appended to `genre_root`. "superhero" is the exception: it is
# fetched through the keyword search at `superhero_root` instead.
categories = [
    "action", "adventure", "animation", "biography",
    "comedy", "crime", "documentary", "drama",
    "family", "fantasy", "film_noir", "history",
    "horror", "music", "musical", "mystery",
    "romance", "sci_fi", "short", "sport",
    "superhero", "thriller", "war", "western",
]

# URL prefixes: a title id, a genre slug, or a page number is appended to these.
title_root = 'https://www.imdb.com/title/'
genre_root = 'https://www.imdb.com/search/title/?genres='
superhero_root = 'https://www.imdb.com/search/keyword/?keywords=superhero&title_type=movie&page='

In [3]:
# Output locations, relative to the notebook's working directory.
# Posters are saved under <path_to_images><genre>/<normalised title>.jpg.
path_to_images = "./data/images/"
# Destination of the CSV written from the final DataFrame.
path_to_df = "./data/raw_data.csv"

In [5]:
def get_html(url):
    """Download `url` through the shared `session` and return the parsed page.

    A random pause of up to three seconds is taken before every request.
    The response body is parsed with BeautifulSoup's 'html.parser' regardless
    of the HTTP status code.
    """
    pause = random() * 3
    time.sleep(pause)

    request_headers = {
        "User-Agent": "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11",
        'Accept-Language': 'en',
    }
    # NOTE(review): no timeout is passed to the request — consider adding one.
    page = session.get(url, headers=request_headers)
    return BeautifulSoup(page.content, 'html.parser')

def get_links_to_films(html):
    """Extract IMDb title ids from a parsed listing page.

    Looks at every ``<a href=...>`` inside ``<div class="lister-item-image">``
    and keeps the ones whose href starts with ``/title/tt``.

    Args:
        html: Parsed page exposing ``find_all`` (e.g. a BeautifulSoup object).

    Returns:
        list[str]: Title ids such as ``"tt0111161"``, in page order.
        Duplicates are not removed.
    """
    # Raw string, compiled once: the previous non-raw patterns ("\/", "\d")
    # relied on invalid escape sequences and were recompiled for every href.
    # "/title/" itself contains no "tt", so capturing right after the prefix
    # yields the same id the former two-step filter + search produced.
    title_href = re.compile(r"^/title/(tt\d*)")

    title_ids = []
    for div in html.find_all("div", {"class": "lister-item-image"}):
        for a_tag in div.find_all("a", href=True):
            matched = title_href.match(a_tag["href"])
            if matched:
                title_ids.append(matched.group(1))
    return title_ids

In [6]:
def parse_genre_ids(genre):
    """Gather title ids from the listing pages of one genre.

    Offsets run from 1 in steps of `films_per_page` while they stay below
    `films_per_category`. Regular genres use the `&start=<offset>` search URL;
    "superhero" is a keyword rather than a genre, so it is paged by page
    number through `superhero_root`.

    Returns the ids of all visited pages concatenated in order.
    """
    collected = []
    for start in tqdm(range(1, films_per_category, films_per_page)):
        if genre == "superhero":
            # Convert the 1-based item offset into a 1-based page number.
            page_no = start // films_per_page + 1
            url = superhero_root + str(page_no)
        else:
            url = genre_root + genre.lower() + "&start=" + str(start)
        collected += get_links_to_films(get_html(url))

    return collected

def parse_film_page(url, max_attempts=5):
    """Scrape title, poster URL, synopsis and genre chips from one title page.

    Args:
        url: Address of the title page.
        max_attempts: How many times the page is fetched while waiting for a
            non-empty ``<h1>``. Previously the refetch loop was unbounded and
            crashed with AttributeError/IndexError when ``<h1>`` was missing
            or had no children.

    Returns:
        tuple[str, str, str, str]: ``(title, poster_url, synopsis, genre)``.
        ``poster_url`` and ``synopsis`` are ``""`` when the corresponding
        element is absent; ``genre`` is the chip texts joined by single spaces.

    Raises:
        RuntimeError: If no non-empty ``<h1>`` was found after `max_attempts`
            fetches.
    """
    html = None
    heading = None
    for _ in range(max_attempts):
        html = get_html(url)
        heading = html.find("h1")
        # A missing or empty heading is treated as "page not served properly"
        # and triggers another fetch (get_html sleeps before each request).
        if heading is not None and heading.contents and str(heading.contents[0]):
            break
    else:
        raise RuntimeError(
            f"No non-empty <h1> found at {url} after {max_attempts} attempts"
        )

    # Prefer the class-specific heading the scraper was written against.
    # NOTE(review): the class name looks auto-generated and may change, so
    # fall back to the first <h1> validated above instead of crashing.
    title_tag = html.find("h1", {"class": "sc-b73cd867-0"})
    if title_tag is None or not title_tag.contents:
        title_tag = heading
    title = str(title_tag.contents[0])

    # Look the poster up once; tolerate an <img> without a src attribute.
    poster_tag = html.find("img", {"class": "ipc-image"})
    poster_url = str(poster_tag.get("src", "")) if poster_tag is not None else ""

    synopsis = ""
    synopsis_tag = html.find("span", {"role": "presentation", "class": "sc-16ede01-2"})
    if synopsis_tag is not None and synopsis_tag.contents:
        first_text = synopsis_tag.contents[0].string
        # `.string` can be None; str(None) would store the literal "None".
        if first_text is not None:
            synopsis = str(first_text)

    # Skip chips without children and coerce to str so join cannot fail.
    chips = html.find_all("span", {"class": "ipc-chip__text"})
    genre = " ".join(str(chip.contents[0]) for chip in chips if chip.contents)
    return title, poster_url, synopsis, genre

def parse_genre(genre):
    """Scrape every film collected for `genre` and download its poster.

    Args:
        genre: Category name; also used as the sub-directory of
            `path_to_images` in which posters are stored.

    Returns:
        tuple of five lists with one entry per film, index-aligned:
        ``(title_list, poster_url_list, synopsis_list, genre_list,
        poster_location)``. ``poster_location`` holds the saved file path, or
        ``' '`` when the film has no poster URL or the download did not
        return HTTP 200.
    """
    title_list = []
    poster_url_list = []
    synopsis_list = []
    genre_list = []
    poster_location = []

    unique_ids = parse_genre_ids(genre)
    print(f"Collected number films id: {len(unique_ids)}")
    for film_id in tqdm(unique_ids):
        url = title_root + film_id
        title, poster_url, description, labels = parse_film_page(url)

        title_list.append(title)
        poster_url_list.append(poster_url)
        synopsis_list.append(description)
        genre_list.append(labels)

        # Exactly one poster_location entry must be appended per film.
        # Previously a non-200 poster response appended nothing, leaving this
        # list shorter than the others and breaking the DataFrame built from
        # the returned lists.
        saved_path = ' '
        if poster_url:
            poster_response = session.get(poster_url)
            if poster_response.status_code == 200:
                # File name: casefolded title with non-word characters replaced.
                # NOTE(review): films sharing a title within a genre overwrite
                # each other's poster file.
                path_to_save = path_to_images + genre + "/" + re.sub("\\W", "_", title.casefold()) + ".jpg"
                os.makedirs(os.path.dirname(path_to_save), exist_ok=True)
                with open(path_to_save, 'wb') as f:
                    f.write(poster_response.content)
                saved_path = path_to_save
            else:
                print(f"{poster_response}. Urs: {url}")
        poster_location.append(saved_path)

    return title_list, poster_url_list, synopsis_list, genre_list, poster_location

In [None]:
# Shared HTTP session used by get_html() and the poster downloads.
session = requests.Session()

# Retry policy: up to 5 retries for connection errors, with backoff factor 1.
retry = Retry(connect=5, backoff_factor=1)
adapter = HTTPAdapter(max_retries=retry)

# Install the same adapter for both URL schemes.
for scheme in ('http://', 'https://'):
    session.mount(scheme, adapter)

In [8]:
# Accumulators across all genres; every list gets one entry per scraped film.
title_list = []
genre_list = []
synopsis_list = []
poster_url_list = []
poster_location_list = []
# Genre chips scraped from each film page.
# NOTE(review): this list is filled but not written to the CSV below —
# confirm whether it should become a column.
with_sub_genre_list = []

for genre in tqdm(categories):
    print(f"Process genre: {genre}")
    (
        genre_title_list,
        genre_poster_url_list,
        genre_synopsis_list,
        genre_cats_list,
        genre_poster_location,
    ) = parse_genre(genre)
    title_list.extend(genre_title_list)
    poster_url_list.extend(genre_poster_url_list)
    synopsis_list.extend(genre_synopsis_list)
    # The "genre" column is the category being scraped, repeated per film.
    genre_list.extend([genre] * len(genre_cats_list))
    poster_location_list.extend(genre_poster_location)
    with_sub_genre_list.extend(genre_cats_list)


df = pd.DataFrame({
                "title": title_list,
                "poster_url": poster_url_list,
                "synopsis": synopsis_list,
                "genre": genre_list,
                "poster_location": poster_location_list
                })

# Make sure the target directory exists: it is otherwise only created as a
# side effect of saving a poster, and a failure here would discard the scrape.
os.makedirs(os.path.dirname(path_to_df) or ".", exist_ok=True)
df.to_csv(path_to_df, index=False)


  0%|          | 0/16 [00:00<?, ?it/s]

Process genre: family


100%|██████████| 7/7 [00:19<00:00,  2.83s/it]


Collected number films id: 350


100%|██████████| 350/350 [19:18<00:00,  3.31s/it]
  6%|▋         | 1/16 [19:38<4:54:37, 1178.52s/it]

Process genre: fantasy


100%|██████████| 7/7 [00:17<00:00,  2.54s/it]


Collected number films id: 350


100%|██████████| 350/350 [19:25<00:00,  3.33s/it]
 12%|█▎        | 2/16 [39:22<4:35:40, 1181.48s/it]

Process genre: film_noir


100%|██████████| 7/7 [00:21<00:00,  3.03s/it]


Collected number films id: 350


100%|██████████| 350/350 [20:11<00:00,  3.46s/it]
 19%|█▉        | 3/16 [59:55<4:21:05, 1205.07s/it]

Process genre: history


100%|██████████| 7/7 [00:22<00:00,  3.28s/it]


Collected number films id: 350


100%|██████████| 350/350 [19:44<00:00,  3.39s/it]
 25%|██▌       | 4/16 [1:20:02<4:01:13, 1206.13s/it]

Process genre: horror


100%|██████████| 7/7 [00:22<00:00,  3.22s/it]


Collected number films id: 350


100%|██████████| 350/350 [19:44<00:00,  3.38s/it]
 31%|███▏      | 5/16 [1:40:09<3:41:09, 1206.35s/it]

Process genre: music


100%|██████████| 7/7 [00:18<00:00,  2.70s/it]


Collected number films id: 350


100%|██████████| 350/350 [20:32<00:00,  3.52s/it]
 38%|███▊      | 6/16 [2:01:00<3:23:36, 1221.61s/it]

Process genre: musical


100%|██████████| 7/7 [00:20<00:00,  2.98s/it]


Collected number films id: 350


100%|██████████| 350/350 [19:59<00:00,  3.43s/it]
 44%|████▍     | 7/16 [2:21:20<3:03:09, 1221.06s/it]

Process genre: mystery


100%|██████████| 7/7 [00:19<00:00,  2.74s/it]


Collected number films id: 350


100%|██████████| 350/350 [20:29<00:00,  3.51s/it]
 50%|█████     | 8/16 [2:42:09<2:43:58, 1229.84s/it]

Process genre: romance


100%|██████████| 7/7 [00:24<00:00,  3.52s/it]


Collected number films id: 350


100%|██████████| 350/350 [19:55<00:00,  3.42s/it]
 56%|█████▋    | 9/16 [3:02:29<2:23:08, 1226.89s/it]

Process genre: sci_fi


100%|██████████| 7/7 [00:15<00:00,  2.20s/it]


Collected number films id: 350


100%|██████████| 350/350 [19:32<00:00,  3.35s/it]
 62%|██████▎   | 10/16 [3:22:18<2:01:30, 1215.01s/it]

Process genre: short


100%|██████████| 7/7 [00:20<00:00,  2.87s/it]


Collected number films id: 350


100%|██████████| 350/350 [19:27<00:00,  3.34s/it]
 69%|██████▉   | 11/16 [3:42:06<1:40:33, 1206.66s/it]

Process genre: sport


100%|██████████| 7/7 [00:19<00:00,  2.81s/it]


Collected number films id: 350


100%|██████████| 350/350 [20:09<00:00,  3.45s/it]
 75%|███████▌  | 12/16 [4:02:34<1:20:53, 1213.40s/it]

Process genre: superhero


100%|██████████| 7/7 [00:27<00:00,  3.91s/it]


Collected number films id: 350


100%|██████████| 350/350 [19:19<00:00,  3.31s/it]
 81%|████████▏ | 13/16 [4:22:21<1:00:16, 1205.35s/it]

Process genre: thriller


100%|██████████| 7/7 [00:19<00:00,  2.81s/it]


Collected number films id: 350


100%|██████████| 350/350 [19:08<00:00,  3.28s/it]
 88%|████████▊ | 14/16 [4:41:50<39:48, 1194.25s/it]  

Process genre: war


100%|██████████| 7/7 [00:19<00:00,  2.80s/it]


Collected number films id: 350


100%|██████████| 350/350 [19:03<00:00,  3.27s/it]
 94%|█████████▍| 15/16 [5:01:13<19:44, 1184.78s/it]

Process genre: western


100%|██████████| 7/7 [00:22<00:00,  3.18s/it]


Collected number films id: 350


100%|██████████| 350/350 [19:42<00:00,  3.38s/it]
100%|██████████| 16/16 [5:21:17<00:00, 1204.85s/it]
