In [1]:
import json
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

In [2]:
# Ask for the genre interactively; the answer is interpolated into the
# search URL built in the next cell.
genre = input('Choose genre:')

Choose genre:comedy


In [3]:
BASE_URL = 'https://www.imdb.com'
search_pattern = '{}/search/title?genres={}'
# The genre comes straight from input(): trim stray whitespace and
# percent-encode it so characters such as ' ' or '&' cannot corrupt the
# query string. Plain values like 'comedy' are left unchanged.
search_url = search_pattern.format(BASE_URL, quote_plus(genre.strip()))
search_url

'https://www.imdb.com/search/title?genres=comedy'

In [4]:
# Download the search-results page. The timeout keeps the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() surfaces
# 4xx/5xx answers here instead of letting an error page be parsed later.
response = requests.get(search_url, timeout=10)
response.raise_for_status()
response

<Response [200]>

In [5]:
# Parse the search-results HTML with the lxml parser.
soup = BeautifulSoup(response.text, 'lxml')
# Collect every <a> element nested inside an <h3 class="lister-item-header">.
links = soup.select('h3.lister-item-header a')

In [6]:
# href values of the collected anchors, then the same values prefixed with
# BASE_URL to form the URLs that are fetched later.
uris = [anchor['href'] for anchor in links]
movie_urls = [BASE_URL + path for path in uris]

In [7]:
def extract_movie(url, timeout=10):
    """Fetch a page and return the JSON-LD metadata embedded in it.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The object decoded from the first
        ``<script type="application/ld+json">`` element on the page.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the page has no JSON-LD script element, or its
            content is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of trying to parse them.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    meta_raw = soup.select_one('script[type="application/ld+json"]')
    if meta_raw is None:
        # select_one() returns None when nothing matches; report that
        # clearly rather than crashing on `None.text`.
        raise ValueError(f'No JSON-LD metadata found at {url}')
    return json.loads(meta_raw.text)

In [8]:
# Try the extractor on the first collected URL (performs a live HTTP request).
extract_movie(movie_urls[0])

{'@context': 'http://schema.org',
 '@type': 'Movie',
 'url': '/title/tt5848272/',
 'name': 'Ralph Breaks the Internet',
 'image': 'https://m.media-amazon.com/images/M/MV5BMTYyNzEyNDAzOV5BMl5BanBnXkFtZTgwNTk3NDczNjM@._V1_.jpg',
 'genre': ['Animation', 'Adventure', 'Comedy', 'Family', 'Fantasy'],
 'contentRating': 'PG',
 'actor': [{'@type': 'Person',
   'url': '/name/nm0000604/',
   'name': 'John C. Reilly'},
  {'@type': 'Person', 'url': '/name/nm0798971/', 'name': 'Sarah Silverman'},
  {'@type': 'Person', 'url': '/name/nm2933757/', 'name': 'Gal Gadot'},
  {'@type': 'Person', 'url': '/name/nm0378245/', 'name': 'Taraji P. Henson'}],
 'director': [{'@type': 'Person',
   'url': '/name/nm1601882/',
   'name': 'Phil Johnston'},
  {'@type': 'Person', 'url': '/name/nm0601781/', 'name': 'Rich Moore'}],
 'creator': [{'@type': 'Person',
   'url': '/name/nm1601882/',
   'name': 'Phil Johnston'},
  {'@type': 'Person', 'url': '/name/nm0962596/', 'name': 'Pamela Ribon'},
  {'@type': 'Person', 'url': '

In [9]:

# Row field name -> key to read from the extracted metadata dict.
mappings = dict(
    title='name',
    description='description',
    director='director',
    actors='actor',
    rating='aggregateRating',
    cover_url='image',
    pg='contentRating',
    published='datePublished',
)

In [10]:
def map_to_movie_row(data):
    """Build a row dict from ``data`` using the module-level ``mappings``.

    Every key of ``mappings`` appears in the result, in the same order;
    the value is ``data[external_key]`` or ``None`` when ``data`` has no
    such key.
    """
    row = {}
    for own_key, external_key in mappings.items():
        row[own_key] = data.get(external_key)
    return row

In [11]:
# Keep one extracted record as sample input for the mapping and cleaning
# steps demonstrated in the following cells.
example_data = extract_movie(movie_urls[0])

In [12]:
# Preview the sample record reduced to the fields named in `mappings`.
map_to_movie_row(example_data)

{'title': 'Ralph Breaks the Internet',
 'description': 'Ralph Breaks the Internet is a movie starring John C. Reilly, Sarah Silverman, and Gal Gadot. Six years after the events of "Wreck-It Ralph," Ralph and Vanellope, now friends, discover a wi-fi router in their arcade, leading them...',
 'director': [{'@type': 'Person',
   'url': '/name/nm1601882/',
   'name': 'Phil Johnston'},
  {'@type': 'Person', 'url': '/name/nm0601781/', 'name': 'Rich Moore'}],
 'actors': [{'@type': 'Person',
   'url': '/name/nm0000604/',
   'name': 'John C. Reilly'},
  {'@type': 'Person', 'url': '/name/nm0798971/', 'name': 'Sarah Silverman'},
  {'@type': 'Person', 'url': '/name/nm2933757/', 'name': 'Gal Gadot'},
  {'@type': 'Person', 'url': '/name/nm0378245/', 'name': 'Taraji P. Henson'}],
 'rating': {'@type': 'AggregateRating',
  'ratingCount': 18611,
  'bestRating': '10.0',
  'worstRating': '1.0',
  'ratingValue': '7.5'},
 'cover_url': 'https://m.media-amazon.com/images/M/MV5BMTYyNzEyNDAzOV5BMl5BanBnXkFtZTgw

In [13]:
def clean_actors(raw_actors):
    """Join the ``name`` of each person entry into one comma-separated string.

    Args:
        raw_actors: A list of dicts that each hold a ``'name'`` key, a
            single such dict, or a falsy value (``None`` or an empty
            list) when the field is missing from the source data.

    Returns:
        The names joined with ``", "``, or an empty string when there is
        nothing to join.
    """
    if not raw_actors:
        # map_to_movie_row() yields None for absent fields; iterating it
        # would raise TypeError.
        return ""
    if isinstance(raw_actors, dict):
        # clean_directors already accepts a bare dict for a single person;
        # accept the same shape here instead of iterating over its keys.
        raw_actors = [raw_actors]
    return ", ".join(actor['name'] for actor in raw_actors)

def clean_directors(one_or_many):
    """Turn a director field into a display string.

    A list is delegated to ``clean_actors``. Any other truthy value is
    expected to be a dict and contributes its ``'name'``. A falsy non-list
    value yields the placeholder string.
    """
    if not isinstance(one_or_many, list):
        return one_or_many['name'] if one_or_many else "gall ANonim"
    return clean_actors(one_or_many)

# Row field name -> callable that normalises that field's raw value.
# A falsy rating falls back to 0 and a falsy content rating to 18;
# otherwise the rating's 'ratingValue' is taken and any "PG-" substring is
# removed from the content rating.
cleaners = dict(
    actors=clean_actors,
    rating=lambda raw: 0 if not raw else raw['ratingValue'],
    director=clean_directors,
    pg=lambda raw: 18 if not raw else raw.replace("PG-", ""),
)

In [14]:
def apply_cleaners(row):
    """Run each entry of the module-level ``cleaners`` over its field of ``row``.

    ``row`` is updated in place and the same object is returned. Fields
    without a cleaner are left untouched.
    """
    for field in cleaners:
        clean = cleaners[field]
        row[field] = clean(row[field])
    return row

In [15]:
# Preview the sample record after mapping and cleaning.
apply_cleaners(map_to_movie_row(example_data))

{'title': 'Ralph Breaks the Internet',
 'description': 'Ralph Breaks the Internet is a movie starring John C. Reilly, Sarah Silverman, and Gal Gadot. Six years after the events of "Wreck-It Ralph," Ralph and Vanellope, now friends, discover a wi-fi router in their arcade, leading them...',
 'director': 'Phil Johnston, Rich Moore',
 'actors': 'John C. Reilly, Sarah Silverman, Gal Gadot, Taraji P. Henson',
 'rating': '7.5',
 'cover_url': 'https://m.media-amazon.com/images/M/MV5BMTYyNzEyNDAzOV5BMl5BanBnXkFtZTgwNTk3NDczNjM@._V1_.jpg',
 'pg': 'PG',
 'published': '2018-11-21'}

In [16]:
from functools import reduce
def process_movie_url(url):
    """Run one movie URL through the whole extract -> map -> clean pipeline.

    Args:
        url: URL handed to the first pipeline step (``extract_movie``).

    Returns:
        Whatever the last pipeline step (``apply_cleaners``) returns.
    """
    pipe = [
        extract_movie,
        map_to_movie_row,
        apply_cleaners
    ]

    # Feed the output of each step into the next one, starting from `url`.
    return reduce(lambda result, fn: fn(result), pipe, url)

In [17]:
# Lazy, single-use generator: a movie URL is only processed when the next
# item is requested, and items that were already consumed cannot be replayed.
items = (process_movie_url(url) for url in movie_urls)

In [18]:
# Advance the generator by one item: processes one more URL and shows the result.
next(items)

{'title': 'Ralph Breaks the Internet',
 'description': 'Ralph Breaks the Internet is a movie starring John C. Reilly, Sarah Silverman, and Gal Gadot. Six years after the events of "Wreck-It Ralph," Ralph and Vanellope, now friends, discover a wi-fi router in their arcade, leading them...',
 'director': 'Phil Johnston, Rich Moore',
 'actors': 'John C. Reilly, Sarah Silverman, Gal Gadot, Taraji P. Henson',
 'rating': '7.5',
 'cover_url': 'https://m.media-amazon.com/images/M/MV5BMTYyNzEyNDAzOV5BMl5BanBnXkFtZTgwNTk3NDczNjM@._V1_.jpg',
 'pg': 'PG',
 'published': '2018-11-21'}

In [19]:
# Advance the generator by one more item and show the result.
next(items)

{'title': 'Spider-Man: Into the Spider-Verse',
 'description': 'Spider-Man: Into the Spider-Verse is a movie starring Shameik Moore, Jake Johnson, and Hailee Steinfeld. Miles Morales becomes the Spider-Man of his reality and crosses paths with his counterparts from other dimensions to stop a...',
 'director': 'Bob Persichetti, Peter Ramsey, Rodney Rothman',
 'actors': 'Shameik Moore, Jake Johnson, Hailee Steinfeld, Mahershala Ali',
 'rating': '8.6',
 'cover_url': 'https://m.media-amazon.com/images/M/MV5BMjMzMzQ0NzI5Nl5BMl5BanBnXkFtZTgwNjc2NTY0NjM@._V1_.jpg',
 'pg': 'PG',
 'published': '2018-12-12'}

In [20]:
# Process every remaining URL and show the results; this exhausts `items`.
list(items)

[{'title': 'The Ballad of Buster Scruggs',
  'description': 'The Ballad of Buster Scruggs is a movie starring Tim Blake Nelson, Willie Watson, and Clancy Brown. An anthology film comprising six stories, each dealing with a different aspect of life in the Old West.',
  'director': 'Ethan Coen, Joel Coen',
  'actors': 'Tim Blake Nelson, Willie Watson, Clancy Brown, Danny McCarthy',
  'rating': '7.4',
  'cover_url': 'https://m.media-amazon.com/images/M/MV5BYjRkYTI3M2EtZWQ4Ny00OTA2LWFmMTMtY2E4MTEyZmNjOTMxXkEyXkFqcGdeQXVyNDg4NjY5OTQ@._V1_.jpg',
  'pg': 'R',
  'published': '2018-11-09'},
 {'title': 'The Marvelous Mrs. Maisel',
  'description': 'The Marvelous Mrs. Maisel is a TV series starring Rachel Brosnahan, Tony Shalhoub, and Michael Zegen. A housewife in the 1950s decides to become a stand-up comic.',
  'director': 'gall ANonim',
  'actors': 'Rachel Brosnahan, Tony Shalhoub, Michael Zegen, Marin Hinkle',
  'rating': '8.7',
  'cover_url': 'https://m.media-amazon.com/images/M/MV5BNTBlNWE5

In [21]:
# `items` is a single-use generator that earlier cells have already drained
# (via next() and list()), so dumping it here would write an empty list.
# Build a fresh list from the source URLs instead; note that this fetches
# every movie page again.
movies = [process_movie_url(url) for url in movie_urls]
with open('movies.json', 'w', encoding='utf-8') as f:
    json.dump(movies, f)