# Web Scraping the Top 250 Movies on IMDB

description of project from proposal.

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import numpy as np
import pandas as pd


### Part I: Web Scraping Tool
Creating a function that web scrapes from multiple links, parses the HTML, and stores in a pandas DataFrame.

In [2]:
def web_scrape_page(url):
    resp = requests.get(url)
    
    #All info from webpage scraped
    soup = BeautifulSoup(resp.text,'html.parser')
    
    #Scraping header: movie title and year
    scraped_titles = soup.find_all('h3',{"class":"lister-item-header"})
    
    #Scraping for movie title
    movies = []
    for x in scraped_titles:
        for y in x.find_all('a'):
            movies.append(y.text)
    
    #Scraping for movie year
    years = []
    for x in scraped_titles:
        for y in x.find_all('span'):
            years.append(y.text)
    years = years[1::2]
    
    #Scraping from muted text, which is where the movie details are stored: rating, time, genres
    scraped_details = soup.find_all('p',{"class":"text-muted"})
    details = []
    for x in scraped_details:
        for y in x.find_all('span'):
            details.append(y.text)
    
    #Scraping rating
    ratings = details[0::5]
    
    #Scraping movie duration
    t = details[2::5]
    time = []
    for i in t:
        var = i.strip(" min")
        time.append(var)
    
    #scraping genre
    g = details[4::5]
    genres = []
    for i in range(len(g)):
        genres.append(g[i].strip())
    
    #Scraping from ratings bar
    scraped_ratings_bar = soup.find_all('div',{"class":"ratings-bar"})
    s = []
    for x in scraped_ratings_bar:
        for y in x.find_all('div',{"class":"inline-block ratings-imdb-rating"}):
            s.append(y.text)
    stars = []
    for i in s:
        stars.append(i.strip("\n"))
        
    #Scraping directors 
    directors = []
    for element in soup.find_all(text=re.compile("Director")):
        a = element.nextSibling
        directors.append(a.text)
    
    #Scraping actors
    actors = []
    for element in soup.find_all(text=re.compile("Stars")):
        a = element.nextSibling
        actors.append(a.text)
        
    #Scraping from num votes, which is where votes and gross $ are stored
    scraped_nums = soup.find_all('p',{"class":"sort-num_votes-visible"})
    nums = []
    for x in scraped_nums:
        for y in x.find_all('span'):
            nums.append(y.text)
    nums = np.array(nums)
    m = []
    for x in scraped_ratings_bar:
        for y in x.find_all('div',{"class":"inline-block ratings-metascore"}):
            m.append(y.text)
    metascore = []

    for i in range(len(m)):
        metascore.append(int(((((((m[i].strip("\n")).strip(" ")).strip("\n")).strip("Metascore")).strip(" ")).strip("\n")).strip(" ")))
    searchval = "Votes:"
    ind = np.where(nums == searchval)[0]+1
    votes = nums[ind]
    searchval = "Gross:"
    ind = np.where(nums == searchval)[0]+1
    gr = nums[ind]
    gross = []

    for i in range(len(gr)):
        a = gr[i].strip("$")
        gross.append(a.strip("M"))
        
    #Creating DataFrame
    df = pd.DataFrame([movies,years,ratings,time,genres,stars,metascore,votes,gross,directors,actors]).transpose()
    df.columns = ['Movie', 'Year', 'Rating', 'Duration (min)', 'Genres','Stars','Metascore','Votes','Gross ($M)', 'Director','Lead']
    
    return df

The movies are spread across 5 different webpages. We use our function to scrape each page and concatenate the resulting DataFrames, reindexing the final DataFrame so each movie has it's unique identifier. 

In [20]:
pg1 = web_scrape_page("https://www.imdb.com/search/title/?groups=top_250&sort=user_rating")
pg2 = web_scrape_page("https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start=51&ref_=adv_nxt")
pg3 = web_scrape_page("https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start=101&ref_=adv_nxt")
pg4 = web_scrape_page("https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start=151&ref_=adv_nxt")
pg5 = web_scrape_page("https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start=201&ref_=adv_nxt")

frames = [pg1, pg2, pg3, pg4, pg5]
movies = pd.concat(frames)
movies = movies.reset_index(drop=True)

In [22]:
movies.head()

Unnamed: 0,Movie,Year,Rating,Duration (min),Genres,Stars,Metascore,Votes,Gross ($M),Director,Lead
0,The Shawshank Redemption,(1994),R,142,Drama,9.3,80,2158515,28.34,Frank Darabont,Tim Robbins
1,The Godfather,(1972),R,175,"Crime, Drama",9.2,100,1481332,134.97,Francis Ford Coppola,Marlon Brando
2,The Dark Knight,(2008),PG-13,152,"Action, Crime, Drama",9.0,84,2133178,534.86,Christopher Nolan,Christian Bale
3,The Godfather: Part II,(1974),R,202,"Crime, Drama",9.0,90,1031396,57.3,Francis Ford Coppola,Al Pacino
4,The Lord of the Rings: The Return of the King,(2003),PG-13,201,"Adventure, Drama, Fantasy",8.9,94,1532805,377.85,Peter Jackson,Elijah Wood


In [23]:
movies.tail()

Unnamed: 0,Movie,Year,Rating,Duration (min),Genres,Stars,Metascore,Votes,Gross ($M),Director,Lead
245,"Monsters, Inc.",(2001),G,92,"Animation, Adventure, Comedy",8.0,,759862,,Pete Docter,Billy Crystal
246,Aladdin,(1992),G,90,"Animation, Adventure, Comedy",8.0,,337549,,Ron Clements,Scott Weinger
247,Castle in the Sky,(1986),PG,125,"Animation, Adventure, Drama",8.0,,131507,,Hayao Miyazaki,Anna Paquin
248,The Terminator,(1984),R,107,"Action, Sci-Fi",8.0,,750856,,James Cameron,Arnold Schwarzenegger
249,Butch Cassidy and the Sundance Kid,(1969),PG,110,"Biography, Crime, Drama",8.0,,191192,,George Roy Hill,Paul Newman


### Part II: Data Cleaning

We would like Year to be an integer so we could easily bin for different decades in our visualization. We strip away the paranthesis from the Year series and replace it in the movies DataFrame. 

In [24]:
#year cleaning
a = []
b = []


In [25]:
for i in range(len(movies['Year'])):
    a.append(movies['Year'].iloc[i].strip("("))
    b.append(a[i].strip(")"))

In [26]:
movies['Year'] = b

In [50]:
movies.head()

Unnamed: 0,Movie,Year,Rating,Duration (min),Genres,Stars,Metascore,Votes,Gross ($M),Director,Lead
0,The Shawshank Redemption,1994,R,142,Drama,9.3,80,2158515,28.34,Frank Darabont,Tim Robbins
1,The Godfather,1972,R,175,"Crime, Drama",9.2,100,1481332,134.97,Francis Ford Coppola,Marlon Brando
2,The Dark Knight,2008,PG-13,152,"Action, Crime, Drama",9.0,84,2133178,534.86,Christopher Nolan,Christian Bale
3,The Godfather: Part II,1974,R,202,"Crime, Drama",9.0,90,1031396,57.3,Francis Ford Coppola,Al Pacino
4,The Lord of the Rings: The Return of the King,2003,PG-13,201,"Adventure, Drama, Fantasy",8.9,94,1532805,377.85,Peter Jackson,Elijah Wood


Some of the data is not clean. There are 5 movies in which the movie name was included in the Year scraping. These 5 instances have to be dealt with individually before we can convert the whole Year column in the movies DataFrame to integers. 

In [28]:
#Example
movies[movies['Movie']== 'Coco']

Unnamed: 0,Movie,Year,Rating,Duration (min),Genres,Stars,Metascore,Votes,Gross ($M),Director,Lead
60,Coco,I) (2017,PG,105,"Animation, Adventure, Family",8.4,67,305099,162.81,Lee Unkrich,Anthony Gonzalez


In [51]:
#All instances of dirty Year Data
movies['Year'][movies['Year'].str.startswith('I')]

124    I) (2015
171    I) (2015
178    I) (2013
179    I) (2015
Name: Year, dtype: object

In [52]:
movies.loc[60,'Year'] = '2017'
movies.loc[124,'Year'] = '2015'
movies.loc[171,'Year'] = '2015'
movies.loc[178,'Year'] = '2013'
movies.loc[179,'Year'] = '2015'

In [54]:
years = [];

for i in range(len(movies['Year'])):
    years.append(int(movies['Year'].iloc[i]))

movies['Year'] = years                 

Now, all the Year data is clean and converted to integers. Below is an example.

In [55]:
movies.loc[124]

Movie                               Inside Out
Year                                      2015
Rating                                      PG
Duration (min)                              95
Genres            Animation, Adventure, Comedy
Stars                                      8.2
Metascore                                   88
Votes                                  555,771
Gross ($M)                               13.66
Director                           Pete Docter
Lead                               Amy Poehler
Name: 124, dtype: object

Dealing with movies with multiple genres.