# Web Scraping the Top 250 Movies on IMDB

description of project from proposal.

In [2]:
from bs4 import BeautifulSoup
import requests
import re
import numpy as np
import pandas as pd
from selenium import webdriver
#pip install selenium


### Part I: Web Scraping Tool
Creating a function that web scrapes from current page (using selenium), parses the HTML, and stores in a pandas DataFrame.

In [253]:
def web_scrape_page():
    
    url = driver.current_url
    resp = requests.get(url)
    
    #All info from current driver.page_source scraped
    soup = BeautifulSoup(resp.text,'html.parser')
    
    #Scraping header: movie title and year
    scraped_titles = soup.find_all('h3',{"class":"lister-item-header"})
    
    #Scraping for movie title
    movies = []
    for x in scraped_titles:
        for y in x.find_all('a'):
            movies.append(y.text)
    
    #Scraping for movie year
    years = []
    for x in scraped_titles:
        for y in x.find_all('span'):
            years.append(y.text)
    years = years[1::2]
    
    #Scraping from muted text, which is where the movie details are stored: rating, time, genres
    scraped_details = soup.find_all('p',{"class":"text-muted"})
    details = []
    for x in scraped_details:
        for y in x.find_all('span'):
            details.append(y.text)
    
    #Scraping rating, length of film and genres
    rating_types = np.unique(details[0::5])
    correct_rating_types = []

    for i in range(0,len(rating_types)):
        if 'min' not in rating_types[i]:
            correct_rating_types.append(rating_types[i])
    
    ratings = []
    time = []
    genres = []

    for i in range(0,len(details),5):
        if details[i] in correct_rating_types:
            ratings.append(details[i])
            time.append(details[i+2].strip(' min'))
            genres.append(details[i+4].strip())
        else:
            if details[i-2] == '|':
                ratings.append('Not Rated')
            else:
                ratings.append(details[i-2])
            time.append(details[i].strip(' min'))
            genres.append(details[i+2].strip())

    
    #Scraping from ratings bar
    scraped_ratings_bar = soup.find_all('div',{"class":"ratings-bar"})
    s = []
    for x in scraped_ratings_bar:
        for y in x.find_all('div',{"class":"inline-block ratings-imdb-rating"}):
            s.append(y.text)
    stars = []
    for i in s:
        stars.append(i.strip("\n"))
        
    #Scraping directors 
    directors = []
    for element in soup.find_all(text=re.compile("Director")):
        a = element.nextSibling
        directors.append(a.text)
    
    #Scraping actors
    actors = []
    for element in soup.find_all(text=re.compile("Stars")):
        a = element.nextSibling
        if a is None:
            actors.append('None')
        else:
            actors.append(a.string)
        
    #Scraping from num votes, which is where votes and gross $ are stored
    scraped_nums = soup.find_all('p',{"class":"sort-num_votes-visible"})
    nums = []
    for x in scraped_nums:
        for y in x.find_all('span'):
            nums.append(y.text)
    nums = np.array(nums)
    m = []
    for x in scraped_ratings_bar:
        for y in x.find_all('div',{"class":"inline-block ratings-metascore"}):
            m.append(y.text)
    metascore = []

    for i in range(len(m)):
        metascore.append(int(((((((m[i].strip("\n")).strip(" ")).strip("\n")).strip("Metascore")).strip(" ")).strip("\n")).strip(" ")))
    searchval = "Votes:"
    ind = np.where(nums == searchval)[0]+1
    votes = nums[ind]
    searchval = "Gross:"
    ind = np.where(nums == searchval)[0]+1
    gr = nums[ind]
    gross = []

    for i in range(len(gr)):
        a = gr[i].strip("$")
        gross.append(a.strip("M"))
        
    #Creating DataFrame
    df = pd.DataFrame([movies,years,ratings,time,genres,stars,metascore,votes,gross,directors,actors]).transpose()
    df.columns = ['Movie', 'Year', 'Rating', 'Duration (min)', 'Genres','Stars','Metascore','Votes','Gross ($M)', 'Director','Lead']
    
    return df

The movies are spread across 5 different webpages. We use our function to scrape each page and concatenate the resulting DataFrames, reindexing the final DataFrame so each movie has it's unique identifier. 

In [226]:
pd.set_option('display.max_rows', None)

In [227]:
from selenium.common.exceptions import NoSuchElementException
import time

In [228]:
#Initializing pg DataFrame that will eventually contain all scraped information
pg = pd.DataFrame()

#Using selenium's webdriver to create basic Chrome Options when opening browser
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument("--test-type")

#Using selenium's webdriver to open Google Chrome with stated options 
#Goes directly to url which is the first page of IMDB Top 250 list  
driver = webdriver.Chrome(options=options)
driver.get("https://www.imdb.com/search/title/?groups=top_250&sort=user_rating")

#Scrape page until there isn't a 'Next »' button
#Merge previous pages (pg) to current page (pg_current) until no pages are remaining
pages_remaining = True

while pages_remaining == True:

    pg_current = web_scrape_page()
    pg = pd.concat([pg,pg_current])

    try:
        next_link = driver.find_element_by_xpath('//a[contains(.,"Next »")]')
        next_link.click()
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(30)

    except NoSuchElementException:
        pages_remaining = False

#Close browser
driver.close()

In [294]:
movies = pg.reset_index(drop=True)
movies.head()

Unnamed: 0,Movie,Year,Rating,Duration (min),Genres,Stars,Metascore,Votes,Gross ($M),Director,Lead
0,The Shawshank Redemption,(1994),R,142,Drama,9.3,80,2166683,28.34,Frank Darabont,Tim Robbins
1,The Godfather,(1972),R,175,"Crime, Drama",9.2,100,1487716,134.97,Francis Ford Coppola,Marlon Brando
2,The Dark Knight,(2008),PG-13,152,"Action, Crime, Drama",9.0,84,2141633,534.86,Christopher Nolan,Christian Bale
3,The Godfather: Part II,(1974),R,202,"Crime, Drama",9.0,90,1036159,57.3,Francis Ford Coppola,Al Pacino
4,The Lord of the Rings: The Return of the King,(2003),PG-13,201,"Adventure, Drama, Fantasy",8.9,94,1538312,377.85,Peter Jackson,Elijah Wood


### Part II: Data Cleaning

We would like Year to be an integer so we could easily bin for different decades in our visualization. We strip away the paranthesis from the Year series and replace it in the movies DataFrame. 

In [295]:
movies['Year'] = movies['Year'].str.replace('(','')
movies['Year'] = movies['Year'].str.replace(')','')

In [296]:
movies.head()

Unnamed: 0,Movie,Year,Rating,Duration (min),Genres,Stars,Metascore,Votes,Gross ($M),Director,Lead
0,The Shawshank Redemption,1994,R,142,Drama,9.3,80,2166683,28.34,Frank Darabont,Tim Robbins
1,The Godfather,1972,R,175,"Crime, Drama",9.2,100,1487716,134.97,Francis Ford Coppola,Marlon Brando
2,The Dark Knight,2008,PG-13,152,"Action, Crime, Drama",9.0,84,2141633,534.86,Christopher Nolan,Christian Bale
3,The Godfather: Part II,1974,R,202,"Crime, Drama",9.0,90,1036159,57.3,Francis Ford Coppola,Al Pacino
4,The Lord of the Rings: The Return of the King,2003,PG-13,201,"Adventure, Drama, Fantasy",8.9,94,1538312,377.85,Peter Jackson,Elijah Wood


Some of the data is not clean. There are 5 movies in which the movie name was included in the Year scraping. These 5 instances have to be dealt with individually before we can convert the whole Year column in the movies DataFrame to integers. 

In [297]:
#Example
movies[movies['Movie']== 'Coco']

Unnamed: 0,Movie,Year,Rating,Duration (min),Genres,Stars,Metascore,Votes,Gross ($M),Director,Lead
61,Coco,I 2017,PG,105,"Animation, Adventure, Family",8.4,67,309244,448.14,Lee Unkrich,Anthony Gonzalez


In [298]:
#All instances of dirty Year Data
movies['Year'][movies['Year'].str.startswith('I')]

61     I 2017
126    I 2015
174    I 2015
181    I 2013
182    I 2015
Name: Year, dtype: object

In [299]:
movies['Year'] = movies['Year'].str.replace('I','')

In [300]:
movies['Year'] = pd.to_numeric(movies['Year'])              

Now, all the Year data is clean and converted to integers. Below is an example.

In [301]:
movies.loc[124]

Movie                           Green Book
Year                                  2018
Rating                               PG-13
Duration (min)                         130
Genres            Biography, Comedy, Drama
Stars                                  8.2
Metascore                               94
Votes                              265,884
Gross ($M)                            6.39
Director                    Peter Farrelly
Lead                       Viggo Mortensen
Name: 124, dtype: object

In [302]:
movies['Rating'] = movies['Rating'].str.replace('|','Not Rated')

In [303]:
movies["Duration (min)"] = movies["Duration (min)"].astype(int)
movies["Stars"] = movies["Stars"].astype(float)
movies['Metascore'] = movies['Metascore'].fillna(0)
movies['Votes'] = movies['Votes'].str.replace(',','')
movies["Votes"] = movies["Votes"].astype(int)
movies['Gross ($M)'] = movies['Gross ($M)'].fillna(0)
movies['Gross ($M)'] = movies['Gross ($M)'].astype(float)

In [304]:
movies

Unnamed: 0,Movie,Year,Rating,Duration (min),Genres,Stars,Metascore,Votes,Gross ($M),Director,Lead
0,The Shawshank Redemption,1994,R,142,Drama,9.3,80,2166683,28.34,Frank Darabont,Tim Robbins
1,The Godfather,1972,R,175,"Crime, Drama",9.2,100,1487716,134.97,Francis Ford Coppola,Marlon Brando
2,The Dark Knight,2008,PG-13,152,"Action, Crime, Drama",9.0,84,2141633,534.86,Christopher Nolan,Christian Bale
3,The Godfather: Part II,1974,R,202,"Crime, Drama",9.0,90,1036159,57.3,Francis Ford Coppola,Al Pacino
4,The Lord of the Rings: The Return of the King,2003,PG-13,201,"Adventure, Drama, Fantasy",8.9,94,1538312,377.85,Peter Jackson,Elijah Wood
5,Pulp Fiction,1994,R,154,"Crime, Drama",8.9,94,1701165,107.93,Quentin Tarantino,John Travolta
6,Schindler's List,1993,R,195,"Biography, Drama, History",8.9,94,1124311,96.9,Steven Spielberg,Liam Neeson
7,12 Angry Men,1957,Not Rated,96,Drama,8.9,96,620375,4.36,Sidney Lumet,Henry Fonda
8,Inception,2010,PG-13,148,"Action, Adventure, Sci-Fi",8.8,74,1899250,292.58,Christopher Nolan,Leonardo DiCaprio
9,Fight Club,1999,R,139,Drama,8.8,66,1731068,37.03,David Fincher,Brad Pitt


# Merging WebScraped Data with Actor Information Data #

Provided by https://github.com/DeathReaper0965/Movie_ratings_prediction.

In [305]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='white', color_codes=True, font_scale=1.25)

import os
directory = os.getcwd()

In [306]:
actors = pd.read_csv(directory + '/actors_mod0.csv', sep=',')

In [307]:
new_actors = actors.drop(['Unnamed: 11',
       'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14'], 1)

new_actors = new_actors.drop(['dow', 'bname', 'gname','dod','picture pointer', 'notes', 'award'],1)

In [308]:
new_actors.columns
new_actors = new_actors.rename(columns={'stage-name': "Lead", 'gender': 'Gender', 'dob':"DOB", 'role type':'Role Type'})
new_actors.head()

Unnamed: 0,Lead,Gender,DOB,Role Type
0,Willie Aames,M,1960,RU:
1,Bud Abbott,M,1895,R:straight
2,Diahnne Abbott,F,[1][23],R:sexy
3,George Abbott,M,1887,R:playwright
4,John Abbott,M,1905,R:staring eyes


In [309]:
df_merged = pd.merge(movies, new_actors, how='inner', on= 'Lead')

In [310]:
df_merged.head()

Unnamed: 0,Movie,Year,Rating,Duration (min),Genres,Stars,Metascore,Votes,Gross ($M),Director,Lead,Gender,DOB,Role Type
0,The Godfather,1972,R,175,"Crime, Drama",9.2,100,1487716,134.97,Francis Ford Coppola,Marlon Brando,M,1924,R:modern male
1,On the Waterfront,1954,Not Rated,108,"Crime, Drama, Thriller",8.1,97,132148,4.36,Elia Kazan,Marlon Brando,M,1924,R:modern male
2,The Godfather: Part II,1974,R,202,"Crime, Drama",9.0,90,1036159,57.3,Francis Ford Coppola,Al Pacino,M,1939,R:malevolent lead
3,Scarface,1983,R,170,"Crime, Drama",8.3,58,684938,309.13,Brian De Palma,Al Pacino,M,1939,R:malevolent lead
4,Heat,1995,R,170,"Crime, Drama, Thriller",8.2,96,537651,83.01,Michael Mann,Al Pacino,M,1939,R:malevolent lead



Transforming DOB column to contain all valid years, and creating a column to represent actors age at the time the movie was released.

In [312]:
#possible to make this process automated?
a = df_merged['DOB'].str.startswith('1')
b = pd.DataFrame(a)

In [314]:
df_merged['Lead'][b['DOB']== False]

9        Liam Neeson
27    Morgan Freeman
54        Jim Carrey
55        Jim Carrey
61    Matthew Modine
76        Bruno Ganz
96     Charlie Sheen
Name: Lead, dtype: object

In [315]:
df_merged.loc[9,'DOB'] = '1952'
df_merged.loc[27,'DOB'] = '1937'
df_merged.loc[54,'DOB'] = '1962'
df_merged.loc[55,'DOB'] = '1962'
df_merged.loc[61,'DOB'] = '1959'
df_merged.loc[76,'DOB'] = '1941'
df_merged.loc[96,'DOB'] = '1965'

In [316]:
df_merged['DOB'] = df_merged['DOB'].astype(float)
df_merged['Age'] = df_merged['Year'] - df_merged['DOB']
df_merged.head()

Unnamed: 0,Movie,Year,Rating,Duration (min),Genres,Stars,Metascore,Votes,Gross ($M),Director,Lead,Gender,DOB,Role Type,Age
0,The Godfather,1972,R,175,"Crime, Drama",9.2,100,1487716,134.97,Francis Ford Coppola,Marlon Brando,M,1924.0,R:modern male,48.0
1,On the Waterfront,1954,Not Rated,108,"Crime, Drama, Thriller",8.1,97,132148,4.36,Elia Kazan,Marlon Brando,M,1924.0,R:modern male,30.0
2,The Godfather: Part II,1974,R,202,"Crime, Drama",9.0,90,1036159,57.3,Francis Ford Coppola,Al Pacino,M,1939.0,R:malevolent lead,35.0
3,Scarface,1983,R,170,"Crime, Drama",8.3,58,684938,309.13,Brian De Palma,Al Pacino,M,1939.0,R:malevolent lead,44.0
4,Heat,1995,R,170,"Crime, Drama, Thriller",8.2,96,537651,83.01,Michael Mann,Al Pacino,M,1939.0,R:malevolent lead,56.0


In [280]:
#NEED TO REDO MERGE
#show work for merging? 
actors = pd.read_excel(directory + '/final_movies.xlsx')

In [281]:
movies = actors.copy()

In [286]:
movies.iloc[60]

Unnamed: 0                              60
Movie                       Reservoir Dogs
Year                                  1992
Rating                              99 min
Duration (min)    \nCrime, Drama, Thriller
Genres                                 NaN
Stars                                  8.3
Metascore                              NaN
Votes                              856,127
Gross ($M)                             NaN
Director                 Quentin Tarantino
Lead                         Harvey Keitel
Gender                                   M
DOB                                   1947
Role Type                              RU:
Age                                     45
Genre 1                                NaN
Genre 2                                NaN
Genre 3                                NaN
Name: 60, dtype: object

In [282]:
movies.columns

Index(['Unnamed: 0', 'Movie', 'Year', 'Rating', 'Duration (min)', 'Genres',
       'Stars', 'Metascore', 'Votes', 'Gross ($M)', 'Director', 'Lead',
       'Gender', 'DOB', 'Role Type', 'Age', 'Genre 1', 'Genre 2', 'Genre 3'],
      dtype='object')