In [56]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
def status_check(url, timeout=30):
    """Fetch ``url`` with an HTTP GET, print a short status report, and return the response.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the notebook
        indefinitely. Defaults to 30.

    Returns
    -------
    requests.Response
        The response object. It is returned whatever its status code; the
        function only reports failures, it does not raise on them.

    Raises
    ------
    requests.exceptions.RequestException
        Whatever ``requests.get`` raises (connection error, timeout, ...)
        propagates to the caller.
    """
    # Imported locally so this cell also works when it is run before the
    # notebook's import cell.
    import requests

    r = requests.get(url, timeout=timeout)
    code = r.status_code
    if code < 300:
        print("request was successful")
    elif 400 <= code < 500:
        print('request failed because it is non-existant or forbidden')
    else:
        # Everything else ends up here: 3xx and 5xx responses.
        print('request returned unexpected status code {}'.format(code))
    return r

In [3]:
def get_html(url):
    """Download ``url`` and return its body parsed as a BeautifulSoup document.

    The request goes through ``status_check``, so a status message is
    printed as a side effect. The response text is parsed with the
    built-in ``'html.parser'`` backend.
    """
    response = status_check(url)
    return BeautifulSoup(response.text, 'html.parser')

In [5]:
# IMDb search-listing URL, one query parameter per line.
# ``{}`` is the placeholder for the ``start`` value (filled in with ``str.format``).
url_pattern = (
    'https://www.imdb.com/search/title/'
    '?title_type=feature'
    '&release_date=2000-01-01,2020-12-31'
    '&user_rating=1.0,10.0'
    '&sort=num_votes,desc'
    '&count=250'
    '&start={}'
)

## looping all pages

There are 96,560 matching titles in total, listed 250 per page.
However, past the 10,000th result the listing goes back to the first page, so we can only retrieve the first 10,000 movies.

In [32]:
# Value substituted into the URL's ``start`` parameter for each request:
# 1, 251, 501, ... in steps of 250, stopping before 10000.
pages = np.arange(start=1, stop=10000, step=250)

In [None]:
def _text_or_none(tag):
    """Return ``tag.text``, or None when ``tag`` is None (element not found)."""
    return None if tag is None else tag.text


def _first_match_text(container, selector):
    """Return the text of the first element matching CSS ``selector``, or None."""
    matches = container.select(selector)
    return matches[0].text if matches else None


def _parse_movie(container):
    """Extract the fields of one listing entry into a dict.

    ``container`` is one ``div.lister-item.mode-advanced`` element. Fields
    that are absent from the entry are returned as None. ``rating`` is
    converted to float and ``metascore`` to int; every other field is kept
    as the raw text (or raw ``href``) found in the page.
    """
    details = container.p  # first <p> of the entry: certificate / runtime / genre spans
    rating_tag = container.strong
    metascore_tag = container.find('span', class_='metascore')

    # NOTE(review): recent Soup Sieve releases deprecate ``:contains`` in
    # favour of ``:-soup-contains``; the selectors are left unchanged to stay
    # compatible with the installed version -- confirm before switching.
    return {
        'imdbID': container.a['href'],
        'title': container.h3.a.text,
        'year': container.h3.find('span', class_='lister-item-year').text,
        'certificate': _text_or_none(details.find('span', class_='certificate')),
        'genre': _text_or_none(details.find('span', class_='genre')),
        'runtime': _text_or_none(details.find('span', class_='runtime')),
        'rating': float(rating_tag.text) if rating_tag is not None else None,
        'metascore': int(metascore_tag.text) if metascore_tag is not None else None,
        'director': _first_match_text(container, 'p:contains("Director")>a:first-child'),
        'votes': _first_match_text(container, 'span:contains("Votes")+span'),
        'gross': _first_match_text(container, 'span:contains("Gross")+span'),
    }


# One list per output column; index i of every list describes the same movie.
titles = []
imdbIDs = []
years = []
certificates = []
genres = []
runtimes = []
ratings = []
metascores = []
directors = []
votes = []
grossings = []

for page in pages:
    print(page)  # progress indicator: the ``start`` value being fetched
    url = url_pattern.format(page)
    page_html = get_html(url)
    movie_containers = page_html.find_all('div', class_='lister-item mode-advanced')

    if not movie_containers:
        # A failed request or a change in the page markup would otherwise
        # go unnoticed and silently produce missing rows.
        print('warning: no movie entries found on the page starting at {}'.format(page))

    for container in movie_containers:
        # Parse the whole entry first, then append to every list, so the
        # lists cannot end up with different lengths if parsing fails midway.
        movie = _parse_movie(container)

        imdbIDs.append(movie['imdbID'])
        titles.append(movie['title'])
        years.append(movie['year'])
        certificates.append(movie['certificate'])
        genres.append(movie['genre'])
        runtimes.append(movie['runtime'])
        ratings.append(movie['rating'])
        metascores.append(movie['metascore'])
        directors.append(movie['director'])
        votes.append(movie['votes'])
        grossings.append(movie['gross'])

    # Pause between pages -- presumably to avoid hammering the server.
    time.sleep(2)

In [34]:
# Assemble the scraped lists into one table; the keyword order below is the column order.
df = pd.DataFrame(dict(
    movie=titles,
    year=years,
    imdbID=imdbIDs,
    certificate=certificates,
    genre=genres,
    runtime=runtimes,
    rating=ratings,
    metascore=metascores,
    director=directors,
    votes=votes,
    gross=grossings,
))

In [35]:
# Preview the first five rows of the scraped table.
df.head(n=5)

Unnamed: 0,movie,year,imdbID,certificate,genre,runtime,rating,metascore,director,votes,gross
0,The Dark Knight: Le chevalier noir,(2008),/title/tt0468569/,Tous publics,"\nAction, Crime, Drama",152 min,9.0,84.0,Christopher Nolan,2161024,$534.86M
1,Inception,(2010),/title/tt1375666/,Tous publics,"\nAction, Adventure, Sci-Fi",148 min,8.8,74.0,Christopher Nolan,1909234,$292.58M
2,Le seigneur des anneaux: La communauté de l'an...,(2001),/title/tt0120737/,Tous publics,"\nAdventure, Drama, Fantasy",178 min,8.8,92.0,Peter Jackson,1561138,$315.54M
3,Le seigneur des anneaux: Le retour du roi,(2003),/title/tt0167260/,Tous publics,"\nAdventure, Drama, Fantasy",201 min,8.9,94.0,Peter Jackson,1549252,$377.85M
4,The Dark Knight Rises,(2012),/title/tt1345836/,Tous publics,"\nAction, Thriller",164 min,8.4,78.0,Christopher Nolan,1431808,$448.14M


In [36]:
# Save the raw (uncleaned) scrape to disk, without writing the row index as a column.
df.to_csv(path_or_buf='movies_imdb_messy.csv', index=False)