In [23]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

In [24]:
movies = pd.read_csv('movie_metadata.csv')

## Top 200 Movies based on IMDB and No. of Reviews

In [10]:
IMDb_Top_200 = movies.sort_values(by = 'imdb_score', ascending = False)
IMDb_Top_200 = IMDb_Top_200.loc[IMDb_Top_200.num_voted_users > 25000]
IMDb_Top_200 = IMDb_Top_200.iloc[:200, ]
IMDb_Top_200['Rank'] = range(1,201)

pd.set_option('max_rows', 200)
IMDb_Top_200[['movie_title','director_name','gross', 'imdb_score']]

Unnamed: 0,movie_title,director_name,gross,imdb_score
1937,The Shawshank Redemption,Frank Darabont,28341469.0,9.3
3466,The Godfather,Francis Ford Coppola,134821952.0,9.2
66,The Dark Knight,Christopher Nolan,533316061.0,9.0
2837,The Godfather: Part II,Francis Ford Coppola,57300000.0,9.0
3481,Fargo,,,9.0
339,The Lord of the Rings: The Return of the King,Peter Jackson,377019252.0,8.9
4822,12 Angry Men,Sidney Lumet,,8.9
4498,"The Good, the Bad and the Ugly",Sergio Leone,6100000.0,8.9
3355,Pulp Fiction,Quentin Tarantino,107930000.0,8.9
1874,Schindler's List,Steven Spielberg,96067179.0,8.9


## Top Foreign Movies

In [13]:
def language(x):
    if x == 'English':
        return 'English'
    else:
        return 'Foreign'

# lets apply the function on the language column
movies['language'] = movies['language'].apply(language)

Top_Foreign_Lang_Film = IMDb_Top_250.loc[IMDb_Top_250['language'] != 'English']
Top_Foreign_Lang_Film[['movie_title','director_name','gross', 'imdb_score']]

Unnamed: 0,movie_title,director_name,gross,imdb_score
4498,"The Good, the Bad and the Ugly",Sergio Leone,6100000.0,8.9
4747,Seven Samurai,Akira Kurosawa,269061.0,8.7
4029,City of God,Fernando Meirelles,7563397.0,8.7
2373,Spirited Away,Hayao Miyazaki,10049886.0,8.6
4921,Children of Heaven,Majid Majidi,925402.0,8.5
3870,Airlift,Raja Menon,,8.5
4259,The Lives of Others,Florian Henckel von Donnersmarck,11284657.0,8.5
4105,Oldboy,Chan-wook Park,2181290.0,8.4
2970,Das Boot,Wolfgang Petersen,11433134.0,8.4
4659,A Separation,Asghar Farhadi,7098492.0,8.4


## Top 10 Directors

In [14]:
director = movies.pivot_table(values = 'imdb_score', index = 'director_name', aggfunc = 'mean')
director = director.sort_values(by = 'imdb_score', ascending = False)
director = director.iloc[:10, ]
director

Unnamed: 0_level_0,imdb_score
director_name,Unnamed: 1_level_1
John Blanchard,9.5
Sadyk Sher-Niyaz,8.7
Cary Bell,8.7
Mitchell Altieri,8.7
Mike Mayhall,8.6
Charles Chaplin,8.6
Ron Fricke,8.5
Majid Majidi,8.5
Raja Menon,8.5
Damien Chazelle,8.5


## Year wise Top 10 Movies

In [27]:
movies['title_year']

0       2009.0
1       2007.0
2       2015.0
3       2012.0
5       2012.0
         ...  
5037    2011.0
5038    2013.0
5040    2013.0
5041    2012.0
5042    2004.0
Name: title_year, Length: 4935, dtype: float64

In [26]:

movies = movies[~np.isnan(movies['title_year'])]
movies['title_year'].isnull().sum()


0

In [28]:
movies['title_year'] = movies['title_year'].astype('int')

In [35]:
# so lets remove all the duplicates from the data
movies.drop_duplicates(subset = None, keep = 'first', inplace = True)

In [37]:
# Lets convert the gross and budget from $ to Million $ to make our analysis easier

movies['gross'] = movies['gross']/1000000
movies['budget'] = movies['budget']/1000000

In [38]:
from ipywidgets import interact

@interact
def yearly_top_10_movies(year = movies['title_year'].values ):
    return movies[movies['title_year'] == year][['movie_title',
              'gross','imdb_score']].sort_values(by = 'gross', 
                     ascending = False).head(10).style.background_gradient(cmap = 'Greens')

interactive(children=(Dropdown(description='year', options=(2009, 2007, 2015, 2012, 2012, 2007, 2010, 2015, 20…