import libraries

In [1]:
import tmdbsimple as tmdb
import requests
import pandas as pd
import time
from ast import literal_eval
import imdb

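The data-fetching code in the next cells only needs to be run once, so it is wrapped in triple-quoted strings and does nothing when the cells are executed. Do still run the first of these cells, because it also defines `key` and `payload`, which later cells use.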

In [99]:
# Read the TMDB API key from a local file so it is never hard-coded in the notebook.
# The context manager closes the file handle, and strip() removes any trailing
# newline/whitespace that would otherwise end up inside every request URL.
with open('key.txt', 'r') as key_file:
    key = key_file.read().strip()

# Empty JSON body that is sent along with every TMDB request in this notebook.
payload = "{}"

# This cell only needs to be run once, so the code below is wrapped in a
# triple-quoted string and is NOT executed when the cell runs.
# When enabled, it pages through TMDB's top_rated endpoint, keeps only rows whose
# original_language is 'en', and stops once at least 1000 such movies have been
# collected; it then resets the index and drops the columns listed in dropCols.
# NOTE(review): DataFrame.append was removed in pandas 2.0 — replace it with
# pd.concat before re-enabling this code on a current pandas version.
"""
movie_df = pd.DataFrame() #create empty dataframe to enable 'while loop' below

page=1
while movie_df.shape[0] < 1000:
    url = "https://api.themoviedb.org/3/movie/top_rated?api_key={0}&language=en-US&page={1}".format(key, str(page))
    response = requests.request("GET", url, data=payload).json()
    if page == 1: #initialize dataframe on first loop
        movie_df = pd.DataFrame(response['results'])
    else:
        movie_df = movie_df.append(pd.DataFrame(response['results']))
    
    movie_df = movie_df[movie_df['original_language']=='en'] #remove non english movies
    time.sleep(0.25) #rate limit is 4 pages per second
    page+=1
    
movie_df.reset_index(inplace=True,drop=True) #reset index since we dropped non english rows

#drop irrelevant columns for this analysis
dropCols = ['adult','backdrop_path', 'original_language','original_title', 'poster_path','video']

movie_df.drop(dropCols,axis=1,inplace=True)

"""

**IMDB Mapping**

In [84]:
def tmdb_to_imdb(tmdb_id):
    """Look up the IMDb ID that TMDB records for one movie.

    Sleeps 0.25 s before each call (rate limit is 4 requests per second), then
    queries TMDB's external_ids endpoint for ``tmdb_id``.

    Returns the 'imdb_id' field of the JSON response, or None when the response
    does not contain that field.
    """
    time.sleep(0.25)
    endpoint = "https://api.themoviedb.org/3/movie/{0}/external_ids?api_key={1}&language=en-US".format(tmdb_id, key)
    external_ids = requests.request("GET", endpoint, data=payload).json()
    return external_ids['imdb_id'] if 'imdb_id' in external_ids else None


In [119]:
# This cell only has to be run once, so the code is wrapped in a triple-quoted
# string and is not executed. When enabled, it looks up the IMDb ID for every
# TMDB ID via tmdb_to_imdb (one rate-limited request per row) and caches the
# resulting table as a CSV without the index column.
# NOTE(review): the output path uses a backslash, which only acts as a directory
# separator on Windows.
"""
movie_df['imdb_id'] = movie_df['id'].apply(lambda x: tmdb_to_imdb(x))
movie_df.to_csv('data\movie_df.csv',encoding="utf-8",index=False)
"""

In [2]:
# Load the cached TMDB results. genre_ids is stored in the CSV as the text of a
# Python list, so literal_eval turns each cell back into a real list of ints.
# A forward slash is used in the path: 'data\movie_df.csv' relies on the invalid
# '\m' string escape and only resolves to data/movie_df.csv on Windows, whereas
# 'data/movie_df.csv' works on every platform.
movie_df = pd.read_csv('data/movie_df.csv', encoding="utf-8", converters={'genre_ids': literal_eval})

In [3]:
# Preview the first rows to confirm the CSV loaded with the expected columns.
movie_df.head()

Unnamed: 0,genre_ids,id,overview,popularity,release_date,title,vote_average,vote_count,imdb_id
0,"[18, 80]",278,Framed in the 1940s for the double murder of h...,28.527767,1994-09-23,The Shawshank Redemption,8.5,9773,tt0111161
1,"[18, 80]",238,"Spanning the years 1945 to 1955, a chronicle o...",36.965452,1972-03-14,The Godfather,8.5,7394,tt0068646
2,"[18, 36, 10752]",424,The true story of how businessman Oskar Schind...,19.945455,1993-11-29,Schindler's List,8.4,5518,tt0108052
3,"[18, 80]",240,In the continuing saga of the Corleone crime f...,30.191804,1974-12-20,The Godfather: Part II,8.4,4249,tt0071562
4,"[18, 9648]",452522,Standalone version of the series pilot with an...,5.969249,1989-12-31,Twin Peaks,8.4,123,tt0278784


Now we can use the imdb_id to get the imdb attributes for each movie

In [6]:
# Create one IMDb client up front; it is reused for every get_movie call below.
ia = imdb.IMDb()

In [81]:
# Fetch one known movie as a smoke test. The ID passed here is the numeric part
# of the imdb_id column value 'tt0068646' (The Godfather), without the 'tt' prefix.
godfather = ia.get_movie('0068646')
godfather

<Movie id:0068646[http] title:_The Godfather (1972)_>

In [82]:
# List the attributes available on the fetched movie to see what can be extracted.
godfather.keys()

['title',
 'year',
 'kind',
 'cast',
 'composers',
 'editorial department',
 'production managers ',
 'art department',
 'visual effects',
 'casting department',
 'costume departmen',
 'location management',
 'music department',
 'transportation department',
 'thanks',
 'genres',
 'runtimes',
 'countries',
 'country codes',
 'language codes',
 'color info',
 'aspect ratio',
 'sound mix',
 'certificates',
 'original air date',
 'rating',
 'votes',
 'cover url',
 'director',
 'writer',
 'producer',
 'cinematographer',
 'editor',
 'casting director',
 'production design',
 'art direction',
 'set decoration',
 'costume designer',
 'make up',
 'assistant director',
 'sound crew',
 'special effects companies',
 'stunt performer',
 'camera and electrical department',
 'miscellaneous crew',
 'plot outline',
 'languages',
 'akas',
 'top 250 rank',
 'plot',
 'synopsis',
 'canonical title',
 'long imdb title',
 'long imdb canonical title',
 'smart canonical title',
 'smart long imdb canonical tit

We are interested in getting the genre and the plot description. **get_movie** takes a long time to run, so we will want to store all of the results in a series, and then pull the genre and the plot description from this series.

It is faster to pull the information this way because **get_movie** is called only once per movie. Applying **get_movie** separately for each new column would call it twice per movie - once for the genre and once for the plot description.

We store all_imdb_data as a series outside of the dataframe because the objects returned by **get_movie** break our dataframe for some reason, even after the column is dropped.

In [83]:
# Fetch the full IMDb record for every movie once (slow: one lookup per row) and
# keep the results in a standalone Series rather than as a dataframe column.
# x[2:] drops the leading 'tt' from IDs such as 'tt0068646'.
# NOTE(review): a missing imdb_id (tmdb_to_imdb can return None) is not sliceable
# and would raise here — confirm every row has an ID.
all_imdb_data = movie_df['imdb_id'].apply(lambda x: ia.get_movie(x[2:]))

In [90]:
# Pull the genres and the plot out of the already-fetched IMDb records.
# Movie.get returns None when a key is absent. That covers an empty record (no
# keys at all) as well as a record that has data but lacks 'genres' or 'plot';
# plain indexing (x['plot']) would raise KeyError in the latter case.
movie_df['imdb_genres'] = all_imdb_data.apply(lambda x: x.get('genres'))
movie_df['imdb_plot'] = all_imdb_data.apply(lambda x: x.get('plot'))

In [92]:
# Preview the first rows to check the new imdb_genres and imdb_plot columns.
movie_df.head()

Unnamed: 0,genre_ids,id,overview,popularity,release_date,title,vote_average,vote_count,imdb_id,imdb_genres,imdb_plot
0,"[18, 80]",278,Framed in the 1940s for the double murder of h...,28.527767,1994-09-23,The Shawshank Redemption,8.5,9773,tt0111161,"[Crime, Drama]",[Chronicles the experiences of a formerly succ...
1,"[18, 80]",238,"Spanning the years 1945 to 1955, a chronicle o...",36.965452,1972-03-14,The Godfather,8.5,7394,tt0068646,"[Crime, Drama]",[When the aging head of a famous crime family ...
2,"[18, 36, 10752]",424,The true story of how businessman Oskar Schind...,19.945455,1993-11-29,Schindler's List,8.4,5518,tt0108052,"[Biography, Drama, History]",[Oskar Schindler is a vainglorious and greedy ...
3,"[18, 80]",240,In the continuing saga of the Corleone crime f...,30.191804,1974-12-20,The Godfather: Part II,8.4,4249,tt0071562,"[Crime, Drama]",[The continuing saga of the Corleone crime fam...
4,"[18, 9648]",452522,Standalone version of the series pilot with an...,5.969249,1989-12-31,Twin Peaks,8.4,123,tt0278784,"[Crime, Drama, Mystery, Thriller]","[When beautiful, young Laura Palmer is found b..."


**Genres**

In [111]:
# Download TMDB's movie-genre list and build lookup tables in both directions.
url = "https://api.themoviedb.org/3/genre/movie/list?api_key={0}&language=en-US".format(key)
response = requests.request("GET", url, data=payload).json()

# TMDB genre id -> genre name
id_to_genre = {entry['id']: entry['name'] for entry in response['genres']}

# TMDB genre name -> genre id
genre_to_id = {entry['name']: entry['id'] for entry in response['genres']}

id_to_genre

{12: 'Adventure',
 14: 'Fantasy',
 16: 'Animation',
 18: 'Drama',
 27: 'Horror',
 28: 'Action',
 35: 'Comedy',
 36: 'History',
 37: 'Western',
 53: 'Thriller',
 80: 'Crime',
 99: 'Documentary',
 878: 'Science Fiction',
 9648: 'Mystery',
 10402: 'Music',
 10749: 'Romance',
 10751: 'Family',
 10752: 'War',
 10770: 'TV Movie'}

In [136]:
def convert_genre_list(x, genre_map=None):
    """Translate a list of IMDb genre names into TMDB genre IDs.

    Parameters
    ----------
    x : list of str or None
        IMDb genre names for one movie, or None when no IMDb genres were found.
    genre_map : dict, optional
        Mapping of TMDB genre name -> TMDB genre ID. Defaults to the
        notebook-level ``genre_to_id`` lookup.

    Returns
    -------
    list of int or None
        TMDB genre IDs in first-seen order without duplicates, or None when
        ``x`` is None so that missing data stays marked as missing.

    Raises
    ------
    KeyError
        If a genre is neither skipped, aliased, nor present in ``genre_map``.
    """
    # Missing IMDb data is stored as None upstream; pass it through instead of
    # failing on iteration.
    if x is None:
        return None
    if genre_map is None:
        genre_map = genre_to_id

    # These genres don't exist in the TMDB dataset, so they are dropped.
    unmapped = {'Biography', 'Short', 'Film-Noir', 'Sport', 'Reality-TV'}
    # IMDb names that differ from the TMDB name used for the same genre.
    aliases = {'Sci-Fi': 'Science Fiction', 'Musical': 'Music'}

    new_list = []
    for genre in x:
        if genre in unmapped:
            continue
        genre_id = genre_map[aliases.get(genre, genre)]
        # 'Music' and 'Musical' map to the same TMDB ID; keep it only once.
        if genre_id not in new_list:
            new_list.append(genre_id)
    return new_list

In [137]:
# Replace each movie's IMDb genre names with the matching TMDB genre IDs.
# Run once per load: the column is overwritten, so a second run would pass
# IDs rather than names to convert_genre_list.
movie_df['imdb_genres'] = movie_df['imdb_genres'].apply(convert_genre_list)

Change order of columns, rename columns for consistency

In [149]:
# Put each TMDB column next to its IMDb counterpart, then give the TMDB columns
# a 'tmdb_' prefix so both sources follow the same naming scheme.
movie_df = (
    movie_df[['id', 'imdb_id', 'genre_ids', 'imdb_genres', 'overview', 'imdb_plot',
              'popularity', 'release_date', 'title', 'vote_average', 'vote_count']]
    .rename(columns={'id': 'tmdb_id',
                     'genre_ids': 'tmdb_genres',
                     'overview': 'tmdb_plot'})
)


In [150]:
# Preview the first rows to check the new column order and names.
movie_df.head()

Unnamed: 0,tmdb_id,imdb_id,tmdb_genres,imdb_genres,tmdb_plot,imdb_plot,popularity,release_date,title,vote_average,vote_count
0,278,tt0111161,"[18, 80]","[80, 18]",Framed in the 1940s for the double murder of h...,[Chronicles the experiences of a formerly succ...,28.527767,1994-09-23,The Shawshank Redemption,8.5,9773
1,238,tt0068646,"[18, 80]","[80, 18]","Spanning the years 1945 to 1955, a chronicle o...",[When the aging head of a famous crime family ...,36.965452,1972-03-14,The Godfather,8.5,7394
2,424,tt0108052,"[18, 36, 10752]","[18, 36]",The true story of how businessman Oskar Schind...,[Oskar Schindler is a vainglorious and greedy ...,19.945455,1993-11-29,Schindler's List,8.4,5518
3,240,tt0071562,"[18, 80]","[80, 18]",In the continuing saga of the Corleone crime f...,[The continuing saga of the Corleone crime fam...,30.191804,1974-12-20,The Godfather: Part II,8.4,4249
4,452522,tt0278784,"[18, 9648]","[80, 18, 9648, 53]",Standalone version of the series pilot with an...,"[When beautiful, young Laura Palmer is found b...",5.969249,1989-12-31,Twin Peaks,8.4,123


In [152]:
# Persist the merged TMDB + IMDb table for later use.
# NOTE(review): unlike the earlier movie_df.csv export (index=False), this call
# keeps pandas' default of writing the row index as an extra unnamed first
# column — confirm that is intended.
movie_df.to_csv('data/movie_df_with_imdb.csv')