In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imdb import IMDb

In [2]:
ia = IMDb()    #initializes IMDbPy with ia as an alias

In [3]:
ia.get_movie_infoset()   #pulling out the list of categories of information that's available via IMDbPy

['airing',
 'akas',
 'alternate versions',
 'awards',
 'connections',
 'crazy credits',
 'critic reviews',
 'episodes',
 'external reviews',
 'external sites',
 'faqs',
 'full credits',
 'goofs',
 'keywords',
 'list',
 'locations',
 'main',
 'misc sites',
 'news',
 'official sites',
 'parents guide',
 'photo sites',
 'plot',
 'quotes',
 'recommendations',
 'release dates',
 'release info',
 'reviews',
 'sound clips',
 'soundtrack',
 'synopsis',
 'taglines',
 'technical',
 'trivia',
 'tv schedule',
 'video clips',
 'vote details']

In [4]:
movie = ia.get_movie('2395427', info=['main'])    #pulls info for The Avengers: Age of Ultron to test

In [5]:
movie.infoset2keys       #displays the information contained in the 'main' key for the movie

{'main': ['localized title',
  'cast',
  'genres',
  'runtimes',
  'countries',
  'country codes',
  'language codes',
  'color info',
  'aspect ratio',
  'sound mix',
  'box office',
  'certificates',
  'original air date',
  'rating',
  'votes',
  'cover url',
  'imdbID',
  'plot outline',
  'languages',
  'title',
  'year',
  'kind',
  'directors',
  'writers',
  'producers',
  'composers',
  'cinematographers',
  'editors',
  'editorial department',
  'casting directors',
  'production designers',
  'art directors',
  'set decorators',
  'costume designers',
  'make up department',
  'production managers',
  'assistant directors',
  'art department',
  'sound department',
  'special effects',
  'visual effects',
  'stunts',
  'camera department',
  'animation department',
  'casting department',
  'costume departmen',
  'location management',
  'music department',
  'script department',
  'transportation department',
  'miscellaneous',
  'thanks',
  'akas',
  'writer',
  'director'

In [6]:
MovieBox = movie.get('box office')  #stores object under dict[main][box office]
MovieRev = movie.get('box office')['Cumulative Worldwide Gross']   #stores cumulative worldwide gross revenue

In [7]:
MovieBox
#Getting just the 'box office' information returns a dictionary of all included revenue values

{'Budget': '$250,000,000 (estimated)',
 'Opening Weekend United States': '$191,271,109, 01 May 2015',
 'Cumulative Worldwide Gross': '$1,405,413,868'}

In [8]:
MovieRev
#calling the .get() on 'box office' and also the dictionary key for 'Cumulative Worldwide Gross' returns the target value needed

'$1,405,413,868'

In [9]:
MovieRev = MovieRev.replace('$', '').replace(',','')
MovieRev = int(MovieRev)
print(type(MovieRev))
print(MovieRev)
#using the string replace method on the target value strips all '/$'' and ',' from the revenue numbers - this will work!

<class 'int'>
1405413868


### The next step here is to tinker and discover how I can do this in 'one go' for my entire list of imdb id's

Now I need to load in my boxoffice_data file and pull out the imdb_id and see what I can do to pull our box office revenue information



In [33]:
boxoffice = pd.read_csv(r'C:\Users\deann\Documents\Data\Box Office Prediction Data\boxoffice_data.csv')
boxoffice_2 = pd.read_csv(r"C:\Users\deann\Documents\Data\Box Office Prediction Data\test.csv")

In [11]:
boxoffice.imdb_id.head()   #since I stripped the 'tt' portion of the ID off pandas read the ID's in as integers

0    2637294
1     368933
2    2582802
3    1821480
4    1380152
Name: imdb_id, dtype: int64

In [35]:
boxoffice_2.imdb_id = boxoffice_2.imdb_id.str.replace('tt', '')
boxoffice_2.imdb_id.head()

0    1226251
1    0051380
2    0118556
3    1255953
4    0418753
Name: imdb_id, dtype: object

In [12]:
values = boxoffice.imdb_id.values
print(type(values))

<class 'numpy.ndarray'>


In [13]:
#  movie_info = ia.get_movie(values, info=['main'])      #Okay, here I see that I need to create a function that passes each value


In [14]:

def movie_info(x):
    """Creates a simple function that gets the movie info I need for a single film
        I plan to vectorize this over a numpy array of IMDB values"""
    error_list = []
    error_count = 0
    try: 
        # storing the main dictionary for the film with imdb_id = x
        info = ia.get_movie(x, info=['main'])
        # trying to return the cumulative worldwide gross revenue for film with imdb_id = x
        MovieRev = info.get('box office')['Cumulative Worldwide Gross']
        # transforming the string value to a clean integer            
        MovieRev = MovieRev.replace('$', '').replace(',','')
        MovieRev = int(MovieRev)
        # returns the integer value for the cumulative worldwide gross revenue for film with imdb_id = x
        return MovieRev     
    except:
        error_list.append(str(x))
        error_count = error_count+1
        print(error_count)
        return "NaN"

In [15]:
#testing the function on the same Age of Ultron film ID I used above for both string and integer inputs

UltronRev = movie_info('2395427')
UltronRev

1405413868

In [16]:
apply = boxoffice.imdb_id.apply(movie_info)

apply

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


2021-04-29 15:19:02,038 CRITICAL [imdbpy] C:\Users\deann\Anaconda3\lib\site-packages\imdb\_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt1855199/reference', 'proxy': '', 'exception type': 'IOError', 'original exception': <HTTPError 500: 'Internal Server Error'>},); kwds: {}
Traceback (most recent call last):
  File "C:\Users\deann\Anaconda3\lib\site-packages\imdb\parser\http\__init__.py", line 220, in retrieve_unicode
    response = uopener.open(url)
  File "C:\Users\deann\Anaconda3\lib\urllib\request.py", line 531, in open
    response = meth(req, response)
  File "C:\Users\deann\Anaconda3\lib\urllib\request.py", line 640, in http_response
    response = self.parent.error(
  File "C:\Users\deann\Anaconda3\lib\urllib\request.py", line 569, in error
    return self._call_chain(*args)
  File "C:\Users\deann\Anaconda3\lib\urllib\request.py", line 502, in _call_chain
    result = func(*args)
  File "C

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


0             NaN
1       134734481
2        48982041
3             NaN
4             NaN
          ...    
2995          NaN
2996          NaN
2997          NaN
2998    171963386
2999     86648359
Name: imdb_id, Length: 3000, dtype: object

Action items:

1/) Search for this api in github

2/) search documentation again, this SHOULD be in their website

In [25]:
none_ap = apply.replace('NaN', np.nan)
print(none_ap)

0               NaN
1       134734481.0
2        48982041.0
3               NaN
4               NaN
           ...     
2995            NaN
2996            NaN
2997            NaN
2998    171963386.0
2999     86648359.0
Name: imdb_id, Length: 3000, dtype: float64


In [38]:
missing_train=none_ap.isnull().sum()
print('You have ' + str(missing_train) + ' unusable films in your training dataset')

You have 2062 unusable films in your training dataset


In [36]:
apply_test = boxoffice_2.imdb_id.apply(movie_info)

apply_test

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


0       42496749
1            NaN
2            NaN
3       12597210
4            NaN
          ...   
4393         NaN
4394    71274967
4395         NaN
4396         NaN
4397         NaN
Name: imdb_id, Length: 4398, dtype: object

In [40]:
none_test = apply_test.replace('NaN', np.nan)
print(none_test)

0       42496749.0
1              NaN
2              NaN
3       12597210.0
4              NaN
           ...    
4393           NaN
4394    71274967.0
4395           NaN
4396           NaN
4397           NaN
Name: imdb_id, Length: 4398, dtype: float64


In [41]:
missing_test=none_test.isnull().sum()
print('You have ' + str(missing_test) + ' unusable films in your test dataset')

You have 3004 unusable films in your test dataset


In [42]:
(4398-3004) + (3000-2062)

2332