In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imdb import IMDb

In [3]:
ia = IMDb()    #initializes IMDbPy with ia as an alias

In [4]:
ia.get_movie_infoset()   #pulling out the list of categories of information that's available via IMDbPy

['airing',
 'akas',
 'alternate versions',
 'awards',
 'connections',
 'crazy credits',
 'critic reviews',
 'episodes',
 'external reviews',
 'external sites',
 'faqs',
 'full credits',
 'goofs',
 'keywords',
 'list',
 'locations',
 'main',
 'misc sites',
 'news',
 'official sites',
 'parents guide',
 'photo sites',
 'plot',
 'quotes',
 'recommendations',
 'release dates',
 'release info',
 'reviews',
 'sound clips',
 'soundtrack',
 'synopsis',
 'taglines',
 'technical',
 'trivia',
 'tv schedule',
 'video clips',
 'vote details']

In [5]:
movie = ia.get_movie('2395427', info=['main'])    #pulls info for The Avengers: Age of Ultron to test

In [6]:
movie.infoset2keys       #displays the information contained in the 'main' key for the movie

{'main': ['localized title',
  'cast',
  'genres',
  'runtimes',
  'countries',
  'country codes',
  'language codes',
  'color info',
  'aspect ratio',
  'sound mix',
  'box office',
  'certificates',
  'original air date',
  'rating',
  'votes',
  'cover url',
  'imdbID',
  'plot outline',
  'languages',
  'title',
  'year',
  'kind',
  'directors',
  'writers',
  'producers',
  'composers',
  'cinematographers',
  'editors',
  'editorial department',
  'casting directors',
  'production designers',
  'art directors',
  'set decorators',
  'costume designers',
  'make up department',
  'production managers',
  'assistant directors',
  'art department',
  'sound department',
  'special effects',
  'visual effects',
  'stunts',
  'camera department',
  'animation department',
  'casting department',
  'costume departmen',
  'location management',
  'music department',
  'script department',
  'transportation department',
  'miscellaneous',
  'thanks',
  'akas',
  'writer',
  'director'

In [7]:
MovieBox = movie.get('box office')  #stores object under dict[main][box office]
MovieRev = movie.get('box office')['Cumulative Worldwide Gross']   #stores cumulative worldwide gross revenue

In [8]:
MovieBox
#Getting just the 'box office' information returns a dictionary of all included revenue values

{'Budget': '$250,000,000 (estimated)',
 'Opening Weekend United States': '$191,271,109, 01 May 2015',
 'Cumulative Worldwide Gross': '$1,405,413,868'}

In [9]:
MovieRev
#calling the .get() on 'box office' and also the dictionary key for 'Cumulative Worldwide Gross' returns the target value needed

'$1,405,413,868'

In [10]:
MovieRev = MovieRev.replace('$', '').replace(',','')
MovieRev = int(MovieRev)
print(type(MovieRev))
print(MovieRev)
#using the string replace method on the target value strips all '/$'' and ',' from the revenue numbers - this will work!

<class 'int'>
1405413868


### The next step here is to tinker and discover how I can do this in 'one go' for my entire list of imdb id's

Now I need to load in my boxoffice_data file and pull out the imdb_id and see what I can do to pull our box office revenue information



In [11]:
boxoffice = pd.read_csv(r'C:\Users\deann\Documents\Data\Box Office Prediction Data\boxoffice_data.csv')
boxoffice_2 = pd.read_csv(r"C:\Users\deann\Documents\Data\Box Office Prediction Data\test.csv")

In [12]:
boxoffice.imdb_id.head()   #since I stripped the 'tt' portion of the ID off pandas read the ID's in as integers

0    2637294
1     368933
2    2582802
3    1821480
4    1380152
Name: imdb_id, dtype: int64

In [13]:
boxoffice_2.imdb_id = boxoffice_2.imdb_id.str.replace('tt', '')
boxoffice_2.imdb_id.head()

0    1226251
1    0051380
2    0118556
3    1255953
4    0418753
Name: imdb_id, dtype: object

In [14]:
values = boxoffice.imdb_id.values
print(type(values))

<class 'numpy.ndarray'>


In [15]:
#  movie_info = ia.get_movie(values, info=['main'])      #Okay, here I see that I need to create a function that passes each value


In [16]:

def movie_info(x):
    """Creates a simple function that gets the movie info I need for a single film
        I plan to vectorize this over a numpy array of IMDB values"""
    error_list = []
    error_count = 0
    try: 
        # storing the main dictionary for the film with imdb_id = x
        info = ia.get_movie(x, info=['main'])
        # trying to return the cumulative worldwide gross revenue for film with imdb_id = x
        MovieRev = info.get('box office')['Opening Weekend United States']
        # transforming the string value to a clean integer            
        MovieRev = MovieRev.replace('$', '').replace(',','')
        MovieRev = int(MovieRev)
        # returns the integer value for the cumulative worldwide gross revenue for film with imdb_id = x
        return MovieRev     
    except:
        error_list.append(str(x))
        error_count += 1
        return np.nan
    print(error_count)

In [17]:
#testing the function on the same Age of Ultron film ID I used above for both string and integer inputs

UltronRev = movie_info('2395427')
UltronRev

nan

In [18]:
apply = boxoffice.imdb_id.apply(movie_info)

apply

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
2995   NaN
2996   NaN
2997   NaN
2998   NaN
2999   NaN
Name: imdb_id, Length: 3000, dtype: float64

Action items:

1/) Search for this api in github

2/) search documentation again, this SHOULD be in their website

In [19]:
none_ap = apply.replace('NaN', np.nan)
print(none_ap)

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
2995   NaN
2996   NaN
2997   NaN
2998   NaN
2999   NaN
Name: imdb_id, Length: 3000, dtype: float64


In [20]:
missing_train=none_ap.isnull().sum()    # checking to see how many nully values were pulled from IMDB
print('You have ' + str(missing_train) + ' unusable films in your training dataset')

You have 2988 unusable films in your training dataset


In [21]:
apply_test = boxoffice_2.imdb_id.apply(movie_info)

apply_test

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
4393   NaN
4394   NaN
4395   NaN
4396   NaN
4397   NaN
Name: imdb_id, Length: 4398, dtype: float64

In [22]:
none_test = apply_test.replace('NaN', np.nan)
print(none_test)

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
4393   NaN
4394   NaN
4395   NaN
4396   NaN
4397   NaN
Name: imdb_id, Length: 4398, dtype: float64


In [23]:
missing_test=none_test.isnull().sum()
print('You have ' + str(missing_test) + ' unusable films in your test dataset')

You have 4374 unusable films in your test dataset


In [24]:
(4398-3004) + (3000-2062)    #Total films for which I have revenue information

2332

Here I plan to combine the revenue values back into the original boxoffice data frames

In [25]:
boxoffice.revenue = apply

boxoffice.head()  #adding revenue to the training data

Unnamed: 0,belongs_to_collection,budget,genres,imdb_id,original_language,original_title,overview,popularity,production_companies,production_countries,...,spoken_languages,tagline,title,Keywords,cast,crew,revenue,Overview_length,Tag_length,collectionbool
0,Hot Tub Time Machine Collection,14000000,['Comedy'],2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,"['Paramount Pictures', 'United Artists', 'Metr...",['United States of America'],...,['English'],The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"['time travel', 'sequel', 'hot tub', 'duringcr...","['Rob Corddry', 'Craig Robinson', 'Clark Duke'...","['Kelly Cantley', 'Steve Pink', 'Josh Heald', ...",,155.0,52.0,True
1,The Princess Diaries Collection,40000000,"['Comedy', 'Drama', 'Family', 'Romance']",368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,['Walt Disney Pictures'],['United States of America'],...,['English'],It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"['coronation', 'duty', 'marriage', 'falling in...","['Anne Hathaway', 'Julie Andrews', 'H√©ctor El...","['Garry Marshall', 'Charles Minsky', 'John Deb...",,393.0,60.0,True
2,,3300000,['Drama'],2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,"['Bold Films', 'Blumhouse Productions', 'Right...",['United States of America'],...,['English'],The road to greatness can take you to the edge.,Whiplash,"['jazz', 'obsession', 'conservatory', 'music t...","['Miles Teller', 'J.K. Simmons', 'Melissa Beno...","['Terri Taylor', 'Richard Henderson', 'Jeffrey...",,130.0,47.0,False
3,,1200000,"['Thriller', 'Drama']",1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,,['India'],...,"['English', 'हिन्दी']",,Kahaani,"['mystery', 'bollywood', 'police corruption', ...","['Vidya Balan', 'Nawazuddin Siddiqui', 'Paramb...","['Sujoy Ghosh', 'Sujoy Ghosh', 'Sujoy Ghosh']",,581.0,,False
4,,0,"['Action', 'Thriller']",1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,,['South Korea'],...,['한국어/조선말'],,Marine Boy,,"['Kim Kang-woo', 'Jo Jae-hyeon', 'Park Si-yeon...","['Jong-seok Yoon', 'Jong-seok Yoon']",,168.0,,False


In [26]:
boxoffice_2['revenue'] = apply_test

boxoffice_2.head()     # Adding revenue to the test data

Unnamed: 0.1,Unnamed: 0,belongs_to_collection,budget,genres,imdb_id,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,tagline,title,Keywords,cast,crew,Overview_length,Tag_length,revenue
0,0,Pokémon Collection,0,"['Adventure', 'Animation', 'Family', 'Fantasy']",1226251,ja,ディアルガVSパルキアVSダークライ,Ash and friends (this time accompanied by newc...,3.851534,,...,90.0,"['English', '日本語']",Somewhere Between Time & Space... A Legend Is ...,Pokémon: The Rise of Darkrai,"['pok√©mon', 'pocket monsters']","['Craig Blair', 'Emily Bauer', 'Sarah Natochen...","['Kunihiko Yuyama', 'Hideki Sonoda']",335.0,51.0,
1,1,,88000,"['Horror', 'Science Fiction']",51380,en,Attack of the 50 Foot Woman,When an abused wife grows to giant size becaus...,3.559789,['Woolner Brothers Pictures Inc.'],...,65.0,['English'],A titanic beauty spreads a macabre wave of hor...,Attack of the 50 Foot Woman,"['revenge', 'alien', 'b movie', 'cheating husb...","['Allison Hayes', 'William Hudson', 'Yvette Vi...","['Edward Mann', 'Jacques R. Marquette', 'Jacqu...",162.0,96.0,
2,2,,0,"['Comedy', 'Romance']",118556,en,Addicted to Love,Good-natured astronomer Sam is devastated when...,8.085194,"['Warner Bros.', 'Outlaw Productions (I)', 'Mi...",...,100.0,['English'],A Comedy About Lost Loves And Last Laughs,Addicted to Love,"['jealousy', 'love', 'revenge', 'break-up']","['Meg Ryan', 'Matthew Broderick', 'Kelly Prest...","['Griffin Dunne', 'Robert F. Newmyer', 'Jeffre...",362.0,41.0,
3,3,,6800000,"['Drama', 'War', 'Mystery']",1255953,fr,Incendies,A mother's last wishes send twins Jeanne and S...,8.596012,"['TS Productions', 'Micro scope', 'Phi Group']",...,130.0,"['Français', 'العربية', 'English']",The search began at the opening of their mothe...,Incendies,"['prison', 'middle east', 'rape', 'muslim', 'm...","['Lubna Azabal', 'M√©lissa D√©sormeaux-Poulin'...","['Louis Craig', 'Lucie Robitaille', 'Gilles Sa...",302.0,55.0,
4,4,,2000000,"['History', 'Documentary']",418753,en,Inside Deep Throat,"In 1972, a seemingly typical shoestring budget...",3.21768,,...,92.0,['English'],It was filmed in 6 days for 25 thousand dollar...,Inside Deep Throat,"['usa', '1970s', 'sexual revolution', 'unsimul...","['Dennis Hopper', 'Peter Bart', 'Warren Beatty...","['Brian Grazer', 'Kim Roth', 'Ron Howard', 'Fe...",894.0,221.0,


In [27]:
BoxOffice = pd.concat([boxoffice, boxoffice_2], ignore_index=True)

BoxOffice.head()      # concatenating both dataframes and resetting the index

Unnamed: 0.1,belongs_to_collection,budget,genres,imdb_id,original_language,original_title,overview,popularity,production_companies,production_countries,...,tagline,title,Keywords,cast,crew,revenue,Overview_length,Tag_length,collectionbool,Unnamed: 0
0,Hot Tub Time Machine Collection,14000000,['Comedy'],2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,"['Paramount Pictures', 'United Artists', 'Metr...",['United States of America'],...,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"['time travel', 'sequel', 'hot tub', 'duringcr...","['Rob Corddry', 'Craig Robinson', 'Clark Duke'...","['Kelly Cantley', 'Steve Pink', 'Josh Heald', ...",,155.0,52.0,True,
1,The Princess Diaries Collection,40000000,"['Comedy', 'Drama', 'Family', 'Romance']",368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,['Walt Disney Pictures'],['United States of America'],...,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"['coronation', 'duty', 'marriage', 'falling in...","['Anne Hathaway', 'Julie Andrews', 'H√©ctor El...","['Garry Marshall', 'Charles Minsky', 'John Deb...",,393.0,60.0,True,
2,,3300000,['Drama'],2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,"['Bold Films', 'Blumhouse Productions', 'Right...",['United States of America'],...,The road to greatness can take you to the edge.,Whiplash,"['jazz', 'obsession', 'conservatory', 'music t...","['Miles Teller', 'J.K. Simmons', 'Melissa Beno...","['Terri Taylor', 'Richard Henderson', 'Jeffrey...",,130.0,47.0,False,
3,,1200000,"['Thriller', 'Drama']",1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,,['India'],...,,Kahaani,"['mystery', 'bollywood', 'police corruption', ...","['Vidya Balan', 'Nawazuddin Siddiqui', 'Paramb...","['Sujoy Ghosh', 'Sujoy Ghosh', 'Sujoy Ghosh']",,581.0,,False,
4,,0,"['Action', 'Thriller']",1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,,['South Korea'],...,,Marine Boy,,"['Kim Kang-woo', 'Jo Jae-hyeon', 'Park Si-yeon...","['Jong-seok Yoon', 'Jong-seok Yoon']",,168.0,,False,


In [28]:
BoxOffice.revenue.replace('NaN', None)     #replacing all "NaN" strings for revenue with null values


0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
7393   NaN
7394   NaN
7395   NaN
7396   NaN
7397   NaN
Name: revenue, Length: 7398, dtype: float64

In [33]:
# Creating a boolean mask for which films ahve revenue values

missing = BoxOffice.revenue.isnull()

print(missing)

49      False
538     False
932     False
951     False
983     False
1449    False
1620    False
2011    False
2697    False
2841    False
2898    False
2906    False
3067    False
3162    False
3222    False
3223    False
3297    False
3614    False
3625    False
3798    False
3951    False
4064    False
4748    False
4812    False
4943    False
5124    False
5784    False
5854    False
6104    False
6317    False
6365    False
6456    False
7051    False
7232    False
7235    False
7344    False
Name: revenue, dtype: bool


In [30]:
# Dropping all films which have no value for their revenue
# I tried to rerun this function collecting opening weekend revenue rather than cumulative, however there are only 36 films
# for which that information is available. 

BoxOffice.drop(BoxOffice[missing].index, inplace=True)

BoxOffice

Unnamed: 0.1,belongs_to_collection,budget,genres,imdb_id,original_language,original_title,overview,popularity,production_companies,production_countries,...,tagline,title,Keywords,cast,crew,revenue,Overview_length,Tag_length,collectionbool,Unnamed: 0
49,Star Trek: The Next Generation Collection,38000000,"['Science Fiction', 'Action', 'Adventure', 'Th...",111280,en,Star Trek: Generations,Captain Jean-Luc Picard and the crew of the En...,8.105708,['Paramount Pictures'],['United States of America'],...,Boldly go.,Star Trek: Generations,"['based on tv series', 'death', 'exploding pla...","['Patrick Stewart', 'Jonathan Frakes', 'Brent ...","['David Carson', 'Rick Berman', 'Peter Laurits...",23100000.0,248.0,10.0,True,
538,Indiana Jones Collection,48000000,"['Adventure', 'Action']",97576,en,Indiana Jones and the Last Crusade,When Dr. Henry Jones Sr. suddenly goes missing...,14.788987,"['Lucasfilm', 'Paramount Pictures']",['United States of America'],...,"The man with the hat is back. And this time, h...",Indiana Jones and the Last Crusade,"['saving the world', 'venice', 'holy grail', '...","['Harrison Ford', 'Sean Connery', 'Denholm Ell...","['George Lucas', 'George Lucas', 'George Lucas...",29400000.0,263.0,67.0,True,
932,,57000000,"['Crime', 'Action', 'Science Fiction']",106697,en,Demolition Man,"Simon Phoenix, a violent criminal cryogenicall...",11.626116,"['Silver Pictures', 'Warner Bros.']",['United States of America'],...,The 21st Century's most dangerous cop. The 21s...,Demolition Man,"['helicopter', 'martial arts', 'crime fighter'...","['Sylvester Stallone', 'Wesley Snipes', 'Sandr...","['David L. Snyder', 'Howard G. Kazanjian', 'Da...",14200000.0,350.0,81.0,False,
951,,25530000,"['Adventure', 'Family']",116322,en,Flipper,"Sandy Ricks is sent by his mom to Coral Key, a...",4.557006,"['Universal Pictures', 'The Bubble Factory', '...",['United States of America'],...,This summer it's finally safe to go back in th...,Flipper,"['dolphin', 'florida', 'florida keys', 'summer']","['Elijah Wood', 'Paul Hogan', 'Jonathan Banks'...","['Alan Shapiro', 'Alan Shapiro']",4200000.0,340.0,54.0,False,
983,,26000000,"['Drama', 'Horror', 'Science Fiction', 'Thrill...",99582,en,Flatliners,Five medical students want to find out if ther...,11.890138,"['Columbia Pictures Corporation', 'Stonebridge...",['United States of America'],...,Some lines shouldn't be crossed.,Flatliners,"['life and death', 'afterlife', 'swing', 'memo...","['Kiefer Sutherland', 'Julia Roberts', 'Kevin ...","['Joel Schumacher', 'Peter Filardi', 'Ve Neill...",10000000.0,263.0,32.0,False,
1449,,0,"['Fantasy', 'Horror', 'Thriller']",105428,en,Sleepwalkers,A mother-and-son team of strange supernatural ...,5.695003,"['Columbia Pictures', 'Victor & Grais Producti...",['United States of America'],...,They feast on your fear - and it's dinner time.,Sleepwalkers,"['killing', 'based on novel', 'black humor', '...","['Brian Krause', 'M√§dchen Amick', 'Alice Krig...","['Stephen King', 'Michael Grais', 'Mark Victor...",10000000.0,101.0,47.0,False,
1620,,55000000,"['Action', 'Adventure', 'Drama', 'Romance']",113071,en,First Knight,The timeless tale of King Arthur and the legen...,15.178338,"['Columbia Pictures Corporation', 'First Knigh...",['United States of America'],...,Their greatest battle would be for her love.,First Knight,"['camelot', 'knight', 'king arthur', 'excalibu...","['Sean Connery', 'Richard Gere', 'Julia Ormond...","['Jerry Zucker', 'Lorne Cameron', 'David Hosel...",10900000.0,329.0,44.0,False,
2011,Star Trek: The Original Series Collection,30000000,"['Science Fiction', 'Action', 'Adventure', 'Th...",98382,en,Star Trek V: The Final Frontier,Capt. Kirk and his crew must deal with Mr. Spo...,12.308007,['Paramount Pictures'],['United States of America'],...,Adventure and imagination will meet at the fin...,Star Trek V: The Final Frontier,"['federation', 'starfleet', 'uss enterprise-a'...","['William Shatner', 'Leonard Nimoy', 'DeForest...","['William Shatner', 'Harve Bennett', 'David Lo...",17300000.0,151.0,58.0,True,
2697,,2000000,['Science Fiction'],80421,en,Battle Beyond the Stars,A young farmer assembles a band of diverse mer...,2.921405,['New World Pictures'],['United States of America'],...,"A battle beyond time, beyond space.",Battle Beyond the Stars,"['clone', 'hitman', 'outer space', 'robot', 'e...","['Richard Thomas', 'Robert Vaughn', 'John Saxo...","['Jimmy T. Murakami', 'John Sayles', 'James Ho...",1700000.0,105.0,35.0,False,
2841,48 Hrs. Collection,38000000,"['Thriller', 'Action', 'Comedy', 'Crime', 'Dra...",99044,en,Another 48 Hrs.,"For the past four years, San Francisco cop Jac...",6.938921,['Paramount Pictures'],['United States of America'],...,The boys are back in town.,Another 48 Hrs.,"['prison', 'gas station', 'drug dealer', 'inve...","['Eddie Murphy', 'Nick Nolte', 'Brion James', ...","['Walter Hill', 'Roger Spottiswoode', 'James H...",19400000.0,288.0,26.0,True,


In [31]:
print(BoxOffice.revenue.isnull().sum())    # Final check to ensure that I have no missing values for my target variable

0


In [34]:
BoxOffice.to_csv(r'C:\Users\deann\Documents\Data\Box Office Prediction Data\BoxOfficeData_openingwknd.csv', index=False)