In [1]:
# Scrape thriller movies from IMDB.
# And do some Exploratory Data Analysis on it

from __future__ import print_function

import pandas as pd
from bs4 import BeautifulSoup
from requests import get

In [2]:
# Initialize an empty pandas DataFrame
df = pd.DataFrame()

# By default the list is sorted on most votes
url = 'http://www.imdb.com/list/ls009668314/'

In [3]:
# Download the list page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() fails loudly on a 4xx/5xx reply
# instead of letting an error page be parsed into an empty result further down.
response = get(url, timeout=30)
response.raise_for_status()

# Parse the returned HTML with Python's built-in HTML parser.
html_soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
# Grab every <div class="list detail"> container; the movie entries are looked up inside them.
movie_list_div = html_soup.find_all("div", attrs={"class": "list detail"})

In [5]:
# Collect one plain dict per movie and build the DataFrame once after the loop.
# Calling DataFrame.append for every row copies the whole frame each time
# (quadratic work overall), and the method was removed in pandas 2.0.
# Rebuilding `df` from scratch also makes this cell safe to re-run: the rows
# are no longer duplicated on a second execution.
movie_records = []

for each_movie in movie_list_div:
    for movie_details in each_movie.find_all("div", {"class": "list_item"}):

        # get movie rank: keep the integer in front of the first "."
        rank_text = movie_details.find("div", {"class": "number"}).getText()
        rank = int(rank_text.split(".")[0])

        # get movie poster url
        poster_div = movie_details.find("div", {"class": "hover-over-image"})
        image_url = poster_div.find('img')['src']

        # get movie title and released year
        movie_info = movie_details.find("div", {"class": "info"})
        movie_title = movie_info.find('a').getText()
        # the year is stored exactly as scraped (e.g. "(2006)"); later cells
        # compare against that exact text
        movie_year = movie_info.find('span', {"class": "year_type"}).getText()

        # get movie rating (kept as the scraped text, not converted to a number)
        movie_rating = movie_info.find('span', {"class": "value"}).getText()

        # get movie votes from the "title" attribute of the rating div:
        # drop the word "votes" and the thousands separators, then convert
        # whatever sits between "(" and the last ")" to an integer
        votes_text = movie_info.find('div', {"class": "rating"}).attrs['title']
        votes_text = votes_text.replace("votes", "").replace(",", "")
        movie_votes = int(votes_text[votes_text.index("(") + 1:votes_text.rindex(")")])

        # get movie description
        movie_description = movie_info.find('div', {"class": "item_description"}).getText()

        # get movie director: first "secondary" div, text after "Director:"
        movie_people_info = movie_info.find_all('div', {"class": "secondary"})
        movie_director = movie_people_info[0].getText().split("Director:")[1]

        # get movie cast: second "secondary" div, text after "Stars:"
        movie_cast = movie_people_info[1].getText().split("Stars:")[1]

        # get protagonist from movie cast: the first comma-separated name
        movie_hero = movie_cast.split(",")[0]

        movie_records.append({'rank': rank,
                              'image_url': image_url,
                              'movie_title': movie_title,
                              'movie_year': movie_year,
                              'movie_votes': movie_votes,
                              'movie_rating': movie_rating,
                              'movie_description': movie_description,
                              'movie_director': movie_director,
                              'movie_cast': movie_cast,
                              'movie_hero': movie_hero
                              })

# One row per scraped movie, with a fresh 0..n-1 index.
df = pd.DataFrame(movie_records)


In [6]:
# Make sure the rank column is integer-typed, then preview the first two rows.
# The preview has to be the last expression of the cell; placed before the
# assignment its result was silently discarded and nothing was displayed.
df['rank'] = df['rank'].astype('int')
df.head(2)

In [7]:
# List the column labels of the scraped DataFrame
df.columns

Index([u'image_url', u'movie_cast', u'movie_description', u'movie_director',
       u'movie_hero', u'movie_rating', u'movie_title', u'movie_votes',
       u'movie_year', u'rank'],
      dtype='object')

In [8]:
# Let's see which year has the most movies in the list.
# Count the movies per release year and sort the year labels in descending order.
movies_by_year = df['movie_year'].value_counts().sort_index(ascending=False)
# print() is called as a function so the cell also runs on Python 3.
print(movies_by_year)

(2016)    2
(2015)    4
(2014)    3
(2013)    4
(2012)    1
(2011)    2
(2010)    6
(2009)    2
(2008)    2
(2007)    7
(2006)    9
(2005)    5
(2004)    4
(2003)    4
(2002)    5
(2001)    2
(2000)    2
(1999)    4
(1998)    2
(1997)    5
(1996)    2
(1995)    5
(1994)    2
(1993)    2
(1991)    3
(1988)    1
(1987)    1
(1986)    1
(1982)    1
(1975)    1
(1974)    1
(1971)    1
(1960)    1
(1959)    1
(1958)    1
(1954)    1
Name: movie_year, dtype: int64


In [9]:
# Show the year with the most movies together with its count.
# print() is called as a function so the cell also runs on Python 3.
print(movies_by_year.idxmax(), movies_by_year.max())
# So year 2006 has 9 movies, which is the maximum in the list.

(2006) 9


In [10]:
# Check where the 2006 movies sit in the ranking: only one of them is in the
# top 10, and in fact it is the only one inside the top 20.
df.loc[df['movie_year'] == '(2006)', ['movie_title', 'rank']]

Unnamed: 0,movie_title,rank
2,The Departed,3
24,The Prestige,25
30,Lucky Number Slevin,31
35,Apocalypto,36
47,Casino Royale,48
62,Children of Men,63
65,The Lives of Others,66
74,The Unknown Woman,75
81,Blood Diamond,82


In [11]:
# Now let's see which directors have the most movies in this list
directors_list = df['movie_director'].value_counts()

# There are 100 movies in the list and many directors appear only once,
# so keep only the directors with at least 2 movies.
directors_list[directors_list >= 2]

# Christopher Nolan and David Fincher share first place with 6 movies each.
# Quentin Tarantino and Steven Spielberg follow with 5 movies each,
# and Alfred Hitchcock is next with 4.

 Christopher Nolan    6
 David Fincher        6
 Quentin Tarantino    5
 Steven Spielberg     5
 Alfred Hitchcock     4
 David Lynch          3
 Martin Scorsese      3
 Paul Greengrass      2
 Alfonso Cuarón       2
 Roman Polanski       2
 Paul Haggis          2
 Denis Villeneuve     2
 Brian De Palma       2
 Michael Mann         2
Name: movie_director, dtype: int64

In [12]:
# So let's see the movies of each of the five most frequent directors.
# print() is called as a function so the cell also runs on Python 3.
for director in directors_list[directors_list >= 2].index[:5]:
    print("===================================================")
    print(director)
    sub_df = df[df['movie_director'] == director]
    # Walk the two needed columns in lockstep instead of materialising a
    # Series per row with iterrows().
    for movie_title, movie_year in zip(sub_df['movie_title'], sub_df['movie_year']):
        print(movie_title, movie_year)
    print("===================================================")

 Christopher Nolan
The Dark Knight (2008)
Inception (2010)
The Dark Knight Rises (2012)
Insomnia (2002)
The Prestige (2006)
Memento (2000)
 David Fincher
Se7en (1995)
The Game (1997)
The Girl with the Dragon Tattoo (2011)
Gone Girl (2014)
Panic Room (2002)
Zodiac (2007)
 Quentin Tarantino
Kill Bill: Vol. 1 (2003)
Kill Bill: Vol. 2 (2004)
The Hateful Eight (2015)
Jackie Brown (1997)
Death Proof (2007)
 Steven Spielberg
Minority Report (2002)
Jaws (1975)
Munich (2005)
Jurassic Park (1993)
Bridge of Spies (2015)
 Alfred Hitchcock
North by Northwest (1959)
Rear Window (1954)
Psycho (1960)
Vertigo (1958)


In [13]:
# OK, now let's use the same technique for the main protagonist
protoganist_list = df['movie_hero'].value_counts()
# Keep only the lead actors that appear in at least 2 movies
protoganist_list[protoganist_list >= 2]

 Leonardo DiCaprio    5
 Tom Cruise           4
 Christian Bale       3
 Kevin Costner        3
 Bruce Willis         3
 Tom Hanks            3
 Jake Gyllenhaal      3
 Robert De Niro       2
 Jodie Foster         2
 James Stewart        2
 Harrison Ford        2
 Al Pacino            2
 Daniel Craig         2
 Uma Thurman          2
Name: movie_hero, dtype: int64

In [14]:
# So let's see the movies these lead actors appeared in (first 14 entries).
# print() is called as a function so the cell also runs on Python 3.
for actor in protoganist_list[protoganist_list >= 2].index[:14]:
    print("===================================================")
    print(actor)
    sub_df = df[df['movie_hero'] == actor]
    # Walk the two needed columns in lockstep instead of materialising a
    # Series per row with iterrows().
    for movie_title, movie_year in zip(sub_df['movie_title'], sub_df['movie_year']):
        print(movie_title, movie_year)
    print("===================================================")

 Leonardo DiCaprio
Inception (2010)
The Departed (2006)
Shutter Island (2010)
The Revenant (2015)
Blood Diamond (2006)
 Tom Cruise
Minority Report (2002)
Collateral (2004)
Eyes Wide Shut (1999)
Mission: Impossible (1996)
 Christian Bale
The Dark Knight (2008)
The Dark Knight Rises (2012)
The Prestige (2006)
 Kevin Costner
The Untouchables (1987)
JFK (1991)
Mr. Brooks (2007)
 Bruce Willis
The Sixth Sense (1999)
Twelve Monkeys (1995)
Die Hard (1988)
 Tom Hanks
Captain Phillips (2013)
Road to Perdition (2002)
Bridge of Spies (2015)
 Jake Gyllenhaal
Source Code (2011)
Nightcrawler (2014)
Zodiac (2007)
 Robert De Niro
Ronin (1998)
Cape Fear (1991)
 Jodie Foster
The Silence of the Lambs (1991)
Panic Room (2002)
 James Stewart
Rear Window (1954)
Vertigo (1958)
 Harrison Ford
The Fugitive (1993)
Blade Runner (1982)
 Al Pacino
Heat (1995)
Insomnia (2002)
 Daniel Craig
The Girl with the Dragon Tattoo (2011)
Casino Royale (2006)
 Uma Thurman
Kill Bill: Vol. 1 (2003)
Kill Bill: Vol. 2 (2004)


In [15]:
# Now look for the winning combination: how many times has a director worked
# with the same main protagonist?
# Take the heroes that lead more than two movies in the list, keep only their
# rows from the main data, and cross-tabulate directors against heroes.
frequent_heroes = protoganist_list[protoganist_list > 2].index[:14]
hero_mask = df['movie_hero'].isin(frequent_heroes)
win_comb = df[hero_mask]
win_comb_df = pd.crosstab(win_comb['movie_director'], win_comb['movie_hero'])

In [16]:
import seaborn as sns

# Light-to-green colormap used to shade the cells of the crosstab
cm = sns.light_palette("green", as_cmap=True)

# Apply the colormap as a background gradient on the director/hero crosstab
s = win_comb_df.style.background_gradient(cmap=cm)

In [17]:
# Display the styled crosstab
s

movie_hero,Bruce Willis,Christian Bale,Jake Gyllenhaal,Kevin Costner,Leonardo DiCaprio,Tom Cruise,Tom Hanks
movie_director,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alejandro G. Iñárritu,0,0,0,0,1,0,0
Brian De Palma,0,0,0,1,0,1,0
Bruce A. Evans,0,0,0,1,0,0,0
Christopher Nolan,0,3,0,0,1,0,0
Dan Gilroy,0,0,1,0,0,0,0
David Fincher,0,0,1,0,0,0,0
Duncan Jones,0,0,1,0,0,0,0
Edward Zwick,0,0,0,0,1,0,0
John McTiernan,1,0,0,0,0,0,0
M. Night Shyamalan,1,0,0,0,0,0,0


In [18]:
# It seems very clear that Christian Bale and Christopher Nolan are the best combo

In [19]:
# Finally, look at the ten movies with the highest number of votes
movies_by_votes = df.sort_values(by='movie_votes', ascending=False)
votes_df = movies_by_votes.head(10)
votes_df['movie_title']

0              The Dark Knight
1                    Inception
12       The Dark Knight Rises
11                       Se7en
14    The Silence of the Lambs
2                 The Departed
24                The Prestige
45                     Memento
16              Shutter Island
36              V for Vendetta
Name: movie_title, dtype: object

In [20]:
# Write to a CSV file if you want to keep track of the movies you still need to watch
# df.to_csv("imdb_thriller_movies_list.csv", index=False, encoding="utf-8")