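I want to use Beautiful Soup to extract information from the website rottentomatoes.com. First we make the soup:
* usually that means opening the HTML file to get a file handle
* and then passing that file handle to the `BeautifulSoup` constructor

In this notebook we skip the file: the page is downloaded with `requests` and its content is passed to the constructor directly.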

In [33]:
# Besides BeautifulSoup (which helps to pull data out of HTML and XML files),
# we need the requests library to access the web page
import requests
from bs4 import BeautifulSoup
import os

In [34]:
# Use requests to fetch the listing page behind this URL
url = 'https://www.rottentomatoes.com/top/bestofrt/?year=2018'
# Always pass a timeout: without one, requests waits indefinitely on a stalled server
response = requests.get(url, timeout=30)
# What does the response variable look like?
response


<Response [200]>

The output above shows the HTTP status code: 200 means the request has succeeded.<br> 
We can inspect the page content with `.content`

In [35]:
# Preview only the first 1000 bytes; the full page source would flood the cell output
response.content[:1000]

b'<!DOCTYPE html>\n<html lang="en" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/" >\n\t<head prefix="og: http://ogp.me/ns# flixstertomatoes: http://ogp.me/ns/apps/flixstertomatoes#">\n    <script src="//cdn.optimizely.com/js/594670329.js"></script>\n    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width,initial-scale=1">\n\n    <meta name="google-site-verification" content="VPPXtECgUUeuATBacnqnCm4ydGO99reF-xgNklSbNbc" />\n\n    <meta name="msvalidate.01" content="034F16304017CA7DCF45D43850915323" />\n\n    <link href="https://staticv2-4.rottentomatoes.com/static/images/iphone/apple-touch-icon.png" rel="apple-touch-icon" />\n    <link href="https://staticv2-4.rottentomatoes.com/static/images/icons/favicon.ico" rel="shortcut icon" type="image/x-icon" />\n    <link href="https://staticv2-4.rottentomatoes.com/static/styles/css/rt_main.css" rel="stylesheet" />\n\n    <scri

In [36]:
# Build the soup straight from the bytes of the HTTP response.
# The HTML is parsed in memory with the lxml parser; nothing is saved to disk.
soup = BeautifulSoup(markup=response.content, features='lxml')

In [42]:
# Find the links to the movies: collect every <a> tag of the page first,
# then keep the 100 anchors at positions 195..294.
# NOTE(review): the slice is positional, so it depends on the current page layout.
all_links = soup.find_all('a')
movie_links = all_links[195:295]
movie_links

[<a class="unstyled articleLink" href="/m/black_panther_2018">
             Black Panther (2018)</a>,
 <a class="unstyled articleLink" href="/m/mission_impossible_fallout">
             Mission: Impossible - Fallout (2018)</a>,
 <a class="unstyled articleLink" href="/m/blackkklansman">
             BlacKkKlansman (2018)</a>,
 <a class="unstyled articleLink" href="/m/spider_man_into_the_spider_verse">
             Spider-Man: Into the Spider-Verse (2018)</a>,
 <a class="unstyled articleLink" href="/m/roma_2018">
             Roma (2018)</a>,
 <a class="unstyled articleLink" href="/m/a_star_is_born_2018">
             A Star Is Born (2018)</a>,
 <a class="unstyled articleLink" href="/m/a_quiet_place_2018">
             A Quiet Place (2018)</a>,
 <a class="unstyled articleLink" href="/m/can_you_ever_forgive_me">
             Can You Ever Forgive Me? (2018)</a>,
 <a class="unstyled articleLink" href="/m/eighth_grade">
             Eighth Grade (2018)</a>,
 <a class="unstyled articleLink" h

In [6]:
# Try to extract one URL out of 'movie_links':
# .get('href') reads the href attribute of the anchor tag at index 17
movie_links[17].get('href')

'/m/first_reformed'

In [43]:
# Extract the relative URL (href attribute) of every movie link and collect them in a list.
# Iterating over movie_links directly, instead of counting up to a hard-coded 100,
# avoids an IndexError when the slice holds fewer than 100 anchors.
df_list = [link.get('href') for link in movie_links]

In [44]:
# Show the collected relative movie URLs
df_list

['/m/black_panther_2018',
 '/m/mission_impossible_fallout',
 '/m/blackkklansman',
 '/m/spider_man_into_the_spider_verse',
 '/m/roma_2018',
 '/m/a_star_is_born_2018',
 '/m/a_quiet_place_2018',
 '/m/can_you_ever_forgive_me',
 '/m/eighth_grade',
 '/m/paddington_2',
 '/m/call_me_by_your_name',
 '/m/incredibles_2',
 '/m/leave_no_trace',
 '/m/the_favourite_2018',
 '/m/widows_2018',
 '/m/wont_you_be_my_neighbor',
 '/m/shoplifters',
 '/m/the_death_of_stalin',
 '/m/the_hate_u_give',
 '/m/ant_man_and_the_wasp',
 '/m/crazy_rich_asians',
 '/m/sorry_to_bother_you_2018',
 '/m/phantom_thread',
 '/m/first_man',
 '/m/i_tonya',
 '/m/the_post',
 '/m/isle_of_dogs_2018',
 '/m/free_solo',
 '/m/avengers_infinity_war',
 '/m/the_rider',
 '/m/hereditary',
 '/m/mcqueen',
 '/m/three_identical_strangers',
 '/m/summer_1993',
 '/m/the_guilty_2018',
 '/m/the_old_man_and_the_gun',
 '/m/minding_the_gap',
 '/m/first_reformed',
 '/m/bumblebee',
 '/m/wildlife_2018',
 '/m/a_fantastic_woman',
 '/m/searching_2018',
 '/m/tea_

In [53]:
# Trial run: open the page behind the first link in df_list and read the film title
movie_list = []
base_url = 'https://www.rottentomatoes.com'
title_suffix = ' - Rotten Tomatoes'
# df_list[:1] limits the loop to the first film only
for film in df_list[:1]:
    url = base_url + film
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    # Cut the trailing site name off the text of the <title> tag
    page_title = soup.find('title').contents[0]
    title = page_title[:-len(title_suffix)]
    print(title)

Black Panther (2018)


In [54]:
# Trial run: open the page behind the first link in df_list and read the genre
movie_list = []
base_url = 'https://www.rottentomatoes.com'
title_suffix = ' - Rotten Tomatoes'
# df_list[:1] limits the loop to the first film only
for film in df_list[:1]:
    url = base_url + film
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    title = soup.find('title').contents[0][:-len(title_suffix)]
    # The genre is the text of the first link inside the 14th <ul> of the page
    movie_info = soup.find_all('ul')[13]
    genre = movie_info.find('a').get_text()
    print(genre)

Action & Adventure


In [55]:
# Trial run: open the page behind the first link in df_list and read the director
movie_list = []
base_url = 'https://www.rottentomatoes.com'
title_suffix = ' - Rotten Tomatoes'
# df_list[:1] limits the loop to the first film only
for film in df_list[:1]:
    url = base_url + film
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    title = soup.find('title').contents[0][:-len(title_suffix)]
    # Genre and director both live in the 14th <ul> of the page
    movie_info = soup.find_all('ul')[13]
    genre = movie_info.find('a').get_text()
    # The director is the link text inside the third <li> of that list
    director = movie_info.find_all('li')[2].find('a').get_text()
    print(director)

Ryan Coogler


In [57]:
# Trial run: open the page behind the first link in df_list and read the runtime
movie_list = []
base_url = 'https://www.rottentomatoes.com'
title_suffix = ' - Rotten Tomatoes'
# df_list[:1] limits the loop to the first film only
for film in df_list[:1]:
    url = base_url + film
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    title = soup.find('title').contents[0][:-len(title_suffix)]
    # Genre, director and runtime all live in the 14th <ul> of the page
    movie_info = soup.find_all('ul')[13]
    genre = movie_info.find('a').get_text()
    info_items = movie_info.find_all('li')
    director = info_items[2].find('a').get_text()
    # The eighth <li> holds a <time> tag; strip whitespace and the ' minutes' unit
    runtime_text = info_items[7].find('time').get_text().strip()
    runtime = runtime_text[:-len(' minutes')]
    print(runtime)

135


Our last task is to convert our list of dictionaries `movie_list` to a pandas DataFrame. Don't forget to import the pandas library. We also specify the column order.

In [150]:
import pandas as pd

# First attempt: scrape title, genre, director and runtime for every film in df_list
# and collect one dictionary per film.
movie_list = []
base_url = 'https://www.rottentomatoes.com'
title_suffix = ' - Rotten Tomatoes'
for film in df_list:
    url = base_url + film
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    title = soup.find('title').contents[0][:-len(title_suffix)]
    # All remaining fields are read from fixed positions inside the 14th <ul>
    movie_info = soup.find_all('ul')[13]
    genre = movie_info.find('a').get_text()
    info_items = movie_info.find_all('li')
    # find(...) returns None when the tag is missing, so .get_text() then raises AttributeError
    director = info_items[2].find('a').get_text()
    runtime = info_items[7].find('time').get_text().strip()[:-len(' minutes')]
    movie_list.append(dict(title=title, genre=genre, director=director, runtime=runtime))
# Passing `columns` fixes the column order of the resulting DataFrame
df = pd.DataFrame(movie_list, columns=['title', 'genre', 'director', 'runtime'])

AttributeError: 'NoneType' object has no attribute 'get_text'

How can we handle this error?<br>
We can wrap the failing director lookup in a [`try`/`except`](https://stackoverflow.com/questions/28387221/attributeerror-nonetype-object-has-no-attribute-get-text) block to solve the problem.

In [48]:
import pandas as pd

# Scrape title, genre, director and runtime for every film in df_list,
# collect one dictionary per film, and turn the list into a DataFrame.
movie_list = []
for film in df_list:
    url = 'https://www.rottentomatoes.com' + film
    # A timeout keeps a single stalled request from hanging the whole crawl
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'lxml')
    # Cut the trailing site name off the text of the <title> tag
    title = soup.find('title').contents[0][:-len(' - Rotten Tomatoes')]
    # Look up the 14th <ul> once and reuse it for genre and director
    movie_info = soup.find_all('ul')[13]
    genre = movie_info.find('a').get_text()
    try:
        director = movie_info.find_all('li')[2].find('a').get_text()
    except AttributeError:
        # find('a') returned None: presumably this page has no director link in that slot
        director = 'no director'
    # Read the runtime from the last <time> tag on the page instead of a fixed <li> position
    runtime = soup.find_all('time')[-1].get_text().strip()[:-len(' minutes')]
    movie_list.append({'title': title,
                       'genre': genre,
                       'director': director,
                       'runtime': runtime})
# Passing `columns` fixes the column order of the resulting DataFrame
df = pd.DataFrame(movie_list, columns=['title', 'genre', 'director', 'runtime'])

In [56]:
# Display the DataFrame of scraped movies
df

Unnamed: 0,title,genre,director,runtime
0,Black Panther (2018),Action & Adventure,Ryan Coogler,135
1,Mission: Impossible - Fallout (2018),Action & Adventure,Christopher McQuarrie,147
2,BlacKkKlansman (2018),Comedy,Spike Lee,135
3,Spider-Man: Into the Spider-Verse (2018),Action & Adventure,Bob Persichetti,100
4,Roma (2018),Drama,Alfonso Cuarón,135
5,A Star Is Born (2018),Drama,Bradley Cooper,135
6,A Quiet Place (2018),Drama,John Krasinski,90
7,Can You Ever Forgive Me? (2018),Comedy,Marielle Heller,107
8,Eighth Grade (2018),Comedy,Bo Burnham,94
9,Paddington 2 (2018),Animation,Paul King (VII),105


In [52]:
# Write the DataFrame to a CSV file in the working directory.
# index=False leaves the row index out of the file.
df.to_csv(path_or_buf='data_rottentomatoes.csv', index=False)