### BeautifulSoup

BeautifulSoup is a Python library for scraping data from web pages by parsing their HTML.

In [2]:
from warnings import warn

import lxml
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scraping IMDB using Beautiful Soup

We will scrape the Internet Movie Database (IMDb) at [imdb.com](https://imdb.com) for the top 250 movies ever made. 

### Example

This is an example of the expected result of scraping the webpage: https://www.imdb.com/title/tt0111161/

<img src="images/imdb.png">

In [3]:
# Request the IMDb Top 250 chart page.
# The request sends an explicit Accept-Language and a browser-like User-Agent.
headers = {
    'Accept-Language': 'en; q=1.0',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
}
# A timeout keeps the cell from hanging forever if the server never answers
response = requests.get(
    'https://www.imdb.com/chart/top/?ref_=nv_mv_1000',
    headers=headers,
    timeout=30,
)

# Emit a warning (not an exception) for non-200 status codes,
# reporting the URL that was actually requested
if response.status_code != 200:
    warn('Request: {}; Status code: {}'.format(response.url, response.status_code))

# Parse the content of the response with BeautifulSoup using the lxml parser
page = BeautifulSoup(response.text, features='lxml')

### Get the header

In [4]:
# Accessing a tag name as an attribute of the soup returns the first <h1> tag in the document
page.h1

<h1 class="ipc-title__text chart-layout-specific-title-text ipc-title__text--reduced">IMDb Top 250 movies</h1>

### Get the first svg-image

In [5]:
# Attribute access by tag name returns the first <svg> element in the document
page.svg

<svg class="ipc-logo" height="32" id="home_img" version="1.1" viewbox="0 0 64 32" width="64" xmlns="http://www.w3.org/2000/svg"><g fill="#F5C518"><rect height="100%" rx="4" width="100%" x="0" y="0"></rect></g><g fill="#000000" fill-rule="nonzero" transform="translate(8.000000, 7.000000)"><polygon points="0 18 5 18 5 0 0 0"></polygon><path d="M15.6725178,0 L14.5534833,8.40846934 L13.8582008,3.83502426 C13.65661,2.37009263 13.4632474,1.09175121 13.278113,0 L7,0 L7,18 L11.2416347,18 L11.2580911,6.11380679 L13.0436094,18 L16.0633571,18 L17.7583653,5.8517865 L17.7707076,18 L22,18 L22,0 L15.6725178,0 Z"></path><path d="M24,18 L24,0 L31.8045586,0 C33.5693522,0 35,1.41994415 35,3.17660424 L35,14.8233958 C35,16.5777858 33.5716617,18 31.8045586,18 L24,18 Z M29.8322479,3.2395236 C29.6339219,3.13233348 29.2545158,3.08072342 28.7026524,3.08072342 L28.7026524,14.8914865 C29.4312846,14.8914865 29.8796736,14.7604764 30.0478195,14.4865461 C30.2159654,14.2165858 30.3021941,13.486105 30.3021941,12.287163

### Find all option-tags

In [6]:
# find_all collects every <option> tag in the document, not just the first one
page.find_all("option")

[<option selected="" value="RANKING">Ranking</option>,
 <option value="USER_RATING">IMDb rating</option>,
 <option value="RELEASE_DATE">Release date</option>,
 <option value="USER_RATING_COUNT">Number of ratings</option>,
 <option value="TITLE_REGIONAL">Alphabetical</option>,
 <option value="POPULARITY">Popularity</option>,
 <option value="RUNTIME">Runtime</option>]

### Find one element by class

In [7]:
# find returns only the first <div> whose class list contains "cli-title"
# (class_ is used because `class` is a reserved word in Python)
page.find("div", class_="cli-title")

<div class="ipc-title ipc-title--base ipc-title--title ipc-title--title--reduced ipc-title-link-no-icon ipc-title--on-textPrimary sc-87337ed2-2 dRlLYG cli-title with-margin"><a class="ipc-title-link-wrapper" href="/title/tt0111161/?ref_=chttp_t_1" tabindex="0"><h3 class="ipc-title__text ipc-title__text--reduced">1. The Shawshank Redemption</h3></a></div>

In [8]:
# Drill into the matched <div>: .a is the first <a> tag inside it, .text is that link's text content
page.find("div", class_="cli-title").a.text

'1. The Shawshank Redemption'

### Find all elements by class

In [9]:
# Print the link text of every title container found on the page
for title_div in page.find_all("div", class_="cli-title"):
    link = title_div.a
    print(link.text)

1. The Shawshank Redemption
2. The Godfather
3. The Dark Knight
4. The Godfather Part II
5. 12 Angry Men
6. The Lord of the Rings: The Return of the King
7. Schindler's List
8. The Lord of the Rings: The Fellowship of the Ring
9. Pulp Fiction
10. The Good, the Bad and the Ugly
11. The Lord of the Rings: The Two Towers
12. Forrest Gump
13. Fight Club
14. Inception
15. Star Wars: Episode V - The Empire Strikes Back
16. The Matrix
17. Goodfellas
18. Interstellar
19. One Flew Over the Cuckoo's Nest
20. Se7en
21. It's a Wonderful Life
22. The Silence of the Lambs
23. Seven Samurai
24. Saving Private Ryan
25. The Green Mile


# Now, we will parse the full list of TOP-250 movies

In [10]:
# Parse the movies list: collect one record per chart entry,
# then build the DataFrame from the records in a single step
top_movies = []
for li in page.find_all("li", class_="ipc-metadata-list-summary-item"):
    # Title text as shown on the page (it includes the rank prefix, e.g. "1. ")
    title = li.find("h3", class_="ipc-title__text").text

    # Relative link to the movie page; everything from '?' on (the query string) is dropped
    href = li.find("a", class_="ipc-title-link-wrapper")["href"].split('?')[0]

    # The rating is scraped as text; convert it to a float so the column
    # can be sorted and aggregated numerically
    rating = float(li.find("span", class_="ipc-rating-star--rating").text)

    top_movies.append(
        {
            "title": title,
            "rating": rating,
            "href": href,
        }
    )

movies = pd.DataFrame(top_movies)
movies

Unnamed: 0,title,rating,href
0,1. The Shawshank Redemption,9.3,/title/tt0111161/
1,2. The Godfather,9.2,/title/tt0068646/
2,3. The Dark Knight,9.1,/title/tt0468569/
3,4. The Godfather Part II,9.0,/title/tt0071562/
4,5. 12 Angry Men,9.0,/title/tt0050083/
5,6. The Lord of the Rings: The Return of the King,9.0,/title/tt0167260/
6,7. Schindler's List,9.0,/title/tt0108052/
7,8. The Lord of the Rings: The Fellowship of th...,8.9,/title/tt0120737/
8,9. Pulp Fiction,8.8,/title/tt0110912/
9,"10. The Good, the Bad and the Ugly",8.8,/title/tt0060196/


In [11]:
# Save the scraped table as a JSON file; force_ascii=False keeps
# non-ASCII characters unescaped in the output
movies.to_json(path_or_buf="example_crawl_top_movies.json", force_ascii=False)