# Web Scraper - IMDB Top 250 movies

In [1]:
# Libraries
import requests as rq
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
print('Libraries ready 2 go')

Libraries ready 2 go


In [2]:
# URL of the IMDb Top 250 chart
url = 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'
# Send the GET request; the timeout stops the cell from hanging forever if the server never answers
page = rq.get(url, timeout=10)
# Display the HTTP status code to confirm the request succeeded (200 means OK)
page.status_code

200

In [33]:
# Plain text from the web
# page.text # ugly

In [4]:
# Parse the downloaded HTML (page.text) into a BeautifulSoup tree using the 'lxml' parser
soup = BeautifulSoup(page.text, 'lxml')

In [32]:
# Prettified HTML of the parsed page
# print(soup.prettify()) # Pretty

In [34]:
# The movie rows are the <tr> elements inside <tbody class="lister-list">,
# so look that table body up directly and collect all of its rows
general = soup.find('tbody', attrs={'class': 'lister-list'}).find_all('tr')

In [35]:
# Every row holds the information we need; start by collecting the movie links.
# Keep the first row as a sample: the next cell uses it to explore the available fields.
section = general[0]
# The first <a> tag of each row carries the relative path of the movie page
link_sections = [row.a.get('href') for row in general]
# Tracking query string shared by every movie link; only the trailing position number changes
common = '?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=1a264172-ae11-42e4-8ef7-7fed1973bb8f&pf_rd_r=MBY5AS7XQMWKBXS6Y8N7&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_'
# Build the absolute URLs, numbered from 1. The scheme (https://) must be part of the URL.
# enumerate replaces the manual counter, so no throwaway globals are left behind.
links = [
    'https://www.imdb.com' + str(href) + common + str(position)
    for position, href in enumerate(link_sections, start=1)
]

In [8]:
# Extract the index, rating, number of qualifiers, release year, movie name,
# and protagonists/director from the sample row
index = section.find('span', attrs = {'name':'rk'}).get('data-value')
rating = section.find('span', attrs = {'name':'ir'}).get('data-value')
# 'nv' is labelled "qualifiers" in this notebook; presumably the number of votes (to be confirmed)
qualifiers = section.find('span', attrs = {'name':'nv'}).get('data-value')
# The year text keeps its surrounding parentheses, e.g. '(1994)'
year = section.find('span', attrs = {'class':'secondaryInfo'}).get_text()
# The title cell's link text is the movie name; its 'title' attribute lists director and protagonists
movie_name = section.find('td', attrs = {'class':'titleColumn'}).a.get_text()
prot_direc = section.find('td', attrs = {'class':'titleColumn'}).a.get('title')

print('This is the name of the movie: {}'.format(movie_name))
print('This is the name of the protagonists and directors: {}'.format(prot_direc))
print('This is the index: {}'.format(index))
print('This is the rating: {}'.format(rating))
print('This is the number of qualifiers: {}'.format(qualifiers))
print('This is the release year: {}'.format(year))


This is the name of the movie: The Shawshank Redemption
This is the name of the protagonists and directors: Frank Darabont (dir.), Tim Robbins, Morgan Freeman
This is the index: 1
This is the rating: 9.233473393603022
This is the number of qualifiers: 2575422
This is the release year: (1994)


In [17]:
# Apply the same lookups to every chart row, producing one list per future column
movie_names = [
    row.find('td', attrs={'class': 'titleColumn'}).a.get_text()
    for row in general
]
ratings = [
    row.find('span', attrs={'name': 'ir'}).get('data-value')
    for row in general
]
qualifiers = [
    row.find('span', attrs={'name': 'nv'}).get('data-value')
    for row in general
]
year = [
    row.find('span', attrs={'class': 'secondaryInfo'}).get_text()
    for row in general
]
protagonist_dir = [
    row.find('td', attrs={'class': 'titleColumn'}).a.get('title')
    for row in general
]

# Working with errors

In [18]:
# A link may stop working, but while it works the movie page may offer extra information.
# Pick one link (the second entry of the list) to explore an individual movie page.
one_link = links[1]
one_link

'https://www.imdb.com/title/tt0068646/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=1a264172-ae11-42e4-8ef7-7fed1973bb8f&pf_rd_r=MBY5AS7XQMWKBXS6Y8N7&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_2'

In [19]:
# The movie page has a plot summary that can be added to the future dataset.
# Download the page, parse it, and read the text of the <p data-testid="plot"> element.
# NOTE: the recorded output shows the same sentence three times back-to-back, so this
# element appears to hold several copies of the text.
link = rq.get(one_link)
s_link = BeautifulSoup(link.text, 'lxml')
s_link.find('p', attrs = {'data-testid':'plot'}).get_text()

'The aging patriarch of an organized crime dynasty in postwar New York City transfers control of his clandestine empire to his reluctant youngest son.The aging patriarch of an organized crime dynasty in postwar New York City transfers control of his clandestine empire to his reluctant youngest son.The aging patriarch of an organized crime dynasty in postwar New York City transfers control of his clandestine empire to his reluctant youngest son.'

In [29]:
# Requests to new links can fail, so guard the download with a try/except block
try:
    # The timeout stops the cell from hanging forever if the server never answers
    link = rq.get(one_link, timeout=10)
    if link.status_code == 200:
        s_link = BeautifulSoup(link.text, 'lxml')
        # Extract the summary. It is stored as plot_summary (not summary) so that
        # re-running this cell never overwrites the summary() function defined below.
        plot_summary = s_link.find('p', attrs = {'data-testid':'plot'}).get_text()
        print(plot_summary)
    else:
        # Report an unsuccessful response instead of silently doing nothing
        print('Error')
        print('Status_Code = {}'.format(link.status_code))
        print('\n')

except Exception as e:
    print('Error')
    print(e)
    print('\n')

The aging patriarch of an organized crime dynasty in postwar New York City transfers control of his clandestine empire to his reluctant youngest son.The aging patriarch of an organized crime dynasty in postwar New York City transfers control of his clandestine empire to his reluctant youngest son.The aging patriarch of an organized crime dynasty in postwar New York City transfers control of his clandestine empire to his reluctant youngest son.


In [22]:
# like this
def _collapse_repeated_text(text):
    """Return ``text`` reduced to a single copy when it is the same string repeated back-to-back."""
    # A string built from whole repetitions of a shorter unit reappears inside
    # text + text at an offset smaller than its own length; that offset is the unit length.
    period = (text + text).find(text, 1)
    if 0 < period < len(text):
        return text[:period]
    return text


def summary(general, links, timeout=10):
    """Download every movie page in ``links`` and collect its plot summary.

    Parameters
    ----------
    general :
        Unused; kept so existing calls ``summary(general, links)`` keep working.
    links : iterable of str
        URLs of the individual movie pages.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up (default 10).

    Returns
    -------
    list of str
        Exactly one entry per link, in the same order. The entry is the plot text,
        or the string 'None' when the request failed, the response status was not
        200, the expected elements were missing, or the plot text was empty.
    """
    summary_list = []
    for counter, url in enumerate(links, start=1):
        response = None      # stays None when the request itself raises
        plot_text = 'None'   # placeholder used unless a non-empty plot is found
        try:
            response = rq.get(url, timeout=timeout)
            if response.status_code == 200:
                page_soup = BeautifulSoup(response.text, 'lxml')
                name = page_soup.find('h1', attrs={'data-testid': 'hero-title-block__title'}).get_text()
                print('Scrapping movie number {}: ** {} **'.format(counter, name))
                # Extracting summary. The recorded outputs show the plot element returning
                # the same text several times in a row, so keep a single copy.
                extracted = _collapse_repeated_text(
                    page_soup.find('p', attrs={'data-testid': 'plot'}).get_text()
                )
                if extracted:
                    plot_text = extracted
            else:
                # Non-200 responses still get a placeholder so the result stays aligned with links
                print('Error scraping {}'.format(url))
                print('Status_Code = {}'.format(response.status_code))
        except Exception as e:
            print('Error scraping {}'.format(url))
            # response is None when rq.get raised, so there is no status code to report
            status = response.status_code if response is not None else 'no response'
            print('Status_Code = {}'.format(status))
            print(e)
        summary_list.append(plot_text)
    return summary_list
# Run the scraper over every movie link (one HTTP request per link) and keep the returned summaries
summary_list = summary(general,links)

Scrapping movie number 1: ** The Shawshank Redemption **
Scrapping movie number 2: ** El Padrino **
Scrapping movie number 3: ** The Dark Knight **
Scrapping movie number 4: ** El padrino: Parte II **
Scrapping movie number 5: ** 12 Angry Men **
Scrapping movie number 6: ** Schindler's List **
Scrapping movie number 7: ** The Lord of the Rings: The Return of the King **
Scrapping movie number 8: ** Pulp Fiction **
Scrapping movie number 9: ** The Lord of the Rings: The Fellowship of the Ring **
Scrapping movie number 10: ** El bueno, El malo y El feo **
Scrapping movie number 11: ** Forrest Gump **
Scrapping movie number 12: ** Fight Club **
Scrapping movie number 13: ** El origen **
Scrapping movie number 14: ** The Lord of the Rings: The Two Towers **
Scrapping movie number 15: ** El imperio contraataca **
Scrapping movie number 16: ** The Matrix **
Scrapping movie number 17: ** Goodfellas **
Scrapping movie number 18: ** One Flew Over the Cuckoo's Nest **
Scrapping movie number 19: 

Scrapping movie number 136: ** El laberinto del fauno **
Scrapping movie number 137: ** Casino **
Scrapping movie number 138: ** Ran **
Scrapping movie number 139: ** Monty Python and the Holy Grail **
Scrapping movie number 140: ** Una mente brillante **
Scrapping movie number 141: ** There Will Be Blood **
Scrapping movie number 142: ** El Sexto Sentido **
Scrapping movie number 143: ** Yôjinbô **
Scrapping movie number 144: ** The Truman Show **
Scrapping movie number 145: ** The Treasure of the Sierra Madre **
Scrapping movie number 146: ** Rashômon **
Scrapping movie number 147: ** The Great Escape **
Scrapping movie number 148: ** Shutter Island **
Scrapping movie number 149: ** Kill Bill: Vol. 1 **
Scrapping movie number 150: ** Jurassic Park **
Scrapping movie number 151: ** No Country for Old Men **
Scrapping movie number 152: ** The Elephant Man **
Scrapping movie number 153: ** Raging Bull **
Scrapping movie number 154: ** Finding Nemo **
Scrapping movie number 155: ** China

In [30]:
# Gather every scraped list in a single dictionary, one key per future column
top_250_movies = {
    'movie_name': movie_names,
    'rating': ratings,
    'qualifiers': qualifiers,
    'release_year': year,
    'protagonist_dir': protagonist_dir,
    'IMDB_link': links,
    'Summary': summary_list,
}

In [31]:
# Create the data frame: each dictionary key becomes a column, so every list must have the same length
df = pd.DataFrame(top_250_movies)

In [25]:
# Preview the first rows of the data frame to check the scraped columns
df.head()

Unnamed: 0,movie_name,rating,qualifiers,release_year,protagonist_dir,IMDB_link,Summary
0,The Shawshank Redemption,9.233473393603022,2575422,(1994),"Frank Darabont (dir.), Tim Robbins, Morgan Fre...",https://www.imdb.com/title/tt0111161/?pf_rd_m=...,Two imprisoned men bond over a number of years...
1,El Padrino,9.155447858835135,1773370,(1972),"Francis Ford Coppola (dir.), Marlon Brando, Al...",https://www.imdb.com/title/tt0068646/?pf_rd_m=...,The aging patriarch of an organized crime dyna...
2,The Dark Knight,8.984063237648467,2542872,(2008),"Christopher Nolan (dir.), Christian Bale, Heat...",https://www.imdb.com/title/tt0468569/?pf_rd_m=...,When the menace known as the Joker wreaks havo...
3,El padrino: Parte II,8.98377124965592,1226213,(1974),"Francis Ford Coppola (dir.), Al Pacino, Robert...",https://www.imdb.com/title/tt0071562/?pf_rd_m=...,The early life and career of Vito Corleone in ...
4,12 Angry Men,8.946286749090616,760615,(1957),"Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb",https://www.imdb.com/title/tt0050083/?pf_rd_m=...,The jury in a New York City murder trial is fr...


In [None]:
# Finally, export the data set to CSV. index=False leaves out the automatic row index,
# which would otherwise be written as a nameless first column ("Unnamed: 0" when read back).
df.to_csv('top_imdb_250_movies.csv', index=False)