In [None]:
'''
CLASS: Getting Data from APIs
What is an API?
- Application Programming Interface
- Structured way to expose specific functionality and data access to users
- Web APIs usually follow the "REST" architectural style
How to interact with a REST API:
- Make a "request" to a specific URL (an "endpoint"), and get the data back in a "response"
- Most relevant request method for us is GET (other methods: POST, PUT, DELETE)
- Response is often JSON format
- Web console is sometimes available (allows you to explore an API)
'''

In [2]:
# read IMDb data into a DataFrame: it has no year column, so we will get the years from an API
import pandas as pd
import os
path = "https://github.com/JamesByers/GA-SEA-DAT2/raw/master/data/"
# build the full URL of the CSV first, then load it and preview the first rows
imdb_csv_url = path + 'imdb_1000.csv'
movies = pd.read_csv(imdb_csv_url)
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [3]:
# use requests library to interact with a URL
import requests
# pass the query string via `params` so requests URL-encodes the spaces in the title
# (a list of pairs keeps the parameters in this order)
r = requests.get('http://www.omdbapi.com/',
                 params=[('t', 'the shawshank redemption'), ('r', 'json'), ('type', 'movie')])

In [4]:
# check the status: 200 means success, 4xx means client error, 5xx means server error
r.status_code

200

In [5]:
# view the raw response text
r.text

u'{"Title":"The Shawshank Redemption","Year":"1994","Rated":"R","Released":"14 Oct 1994","Runtime":"142 min","Genre":"Crime, Drama","Director":"Frank Darabont","Writer":"Stephen King (short story \\"Rita Hayworth and Shawshank Redemption\\"), Frank Darabont (screenplay)","Actors":"Tim Robbins, Morgan Freeman, Bob Gunton, William Sadler","Plot":"Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.","Language":"English","Country":"USA","Awards":"Nominated for 7 Oscars. Another 14 wins & 21 nominations.","Poster":"http://ia.media-imdb.com/images/M/MV5BODU4MjU4NjIwNl5BMl5BanBnXkFtZTgwMDU2MjEyMDE@._V1_SX300.jpg","Metascore":"80","imdbRating":"9.3","imdbVotes":"1,626,900","imdbID":"tt0111161","Type":"movie","Response":"True"}'

In [6]:
# decode the JSON response body into a dictionary
r.json()

{u'Actors': u'Tim Robbins, Morgan Freeman, Bob Gunton, William Sadler',
 u'Awards': u'Nominated for 7 Oscars. Another 14 wins & 21 nominations.',
 u'Country': u'USA',
 u'Director': u'Frank Darabont',
 u'Genre': u'Crime, Drama',
 u'Language': u'English',
 u'Metascore': u'80',
 u'Plot': u'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
 u'Poster': u'http://ia.media-imdb.com/images/M/MV5BODU4MjU4NjIwNl5BMl5BanBnXkFtZTgwMDU2MjEyMDE@._V1_SX300.jpg',
 u'Rated': u'R',
 u'Released': u'14 Oct 1994',
 u'Response': u'True',
 u'Runtime': u'142 min',
 u'Title': u'The Shawshank Redemption',
 u'Type': u'movie',
 u'Writer': u'Stephen King (short story "Rita Hayworth and Shawshank Redemption"), Frank Darabont (screenplay)',
 u'Year': u'1994',
 u'imdbID': u'tt0111161',
 u'imdbRating': u'9.3',
 u'imdbVotes': u'1,626,900'}

In [7]:
# extracting the year from the dictionary
r.json()['Year']

u'1994'

In [9]:
# request a title the API will not recognize, then inspect the HTTP status code
unknown_title_url = 'http://www.omdbapi.com/?t=blahblahblah&r=json&type=movie'
r = requests.get(unknown_title_url)
r.status_code

200

In [10]:
r.json()

{u'Error': u'Movie not found!', u'Response': u'False'}

In [11]:
# define a function to return the year
def get_movie_year(title):
    """Look up a movie's release year on the OMDb API.

    Parameters
    ----------
    title : str
        Movie title to search for (sent as the ``t`` query parameter).

    Returns
    -------
    int or None
        The response's ``Year`` field converted to ``int``, or ``None``
        when the API does not report ``Response == 'True'`` (for example
        when the title is not found).
    """
    # Pass the query through `params` so requests URL-encodes the title;
    # concatenating it into the URL breaks on titles containing '&', '#' or '+'.
    query = [('t', title), ('r', 'json'), ('type', 'movie')]
    r = requests.get('http://www.omdbapi.com/', params=query)
    info = r.json()
    # .get() so a payload without a 'Response' key yields None instead of KeyError
    if info.get('Response') == 'True':
        return int(info['Year'])
    return None

In [13]:
# test the function on a known title
# (print as a function call: same output on Python 2, and valid on Python 3)
print(get_movie_year('The Shawshank Redemption'))

1994


In [14]:
# an unrecognized title should print None
# (print as a function call: same output on Python 2, and valid on Python 3)
print(get_movie_year('blahblahblah'))

None


In [15]:
# work on an independent copy of the first five rows while testing
top_movies = movies.head(5).copy()

In [18]:
# write a for loop to build a list of years (one API call per title)
from time import sleep
years = []
for title in top_movies.title:
    years.append(get_movie_year(title))
    sleep(1)  # wait one second between requests

# print as a function call: same output on Python 2, and valid on Python 3
print(years)

[1994, 1972, 1974, 2008, 1994]


In [19]:
# sanity check: there must be exactly one year per row of the DataFrame
assert len(top_movies) == len(years)

In [20]:
# save that list as a new column
top_movies['year'] = years

In [None]:
'''
Bonus content: Updating the DataFrame as part of a loop
'''

# enumerate allows you to access the item location while iterating
letters = ['a', 'b', 'c']
for index, letter in enumerate(letters):
    # format explicitly so the line reads "0 a" on both Python 2 and Python 3
    print('%s %s' % (index, letter))

In [21]:
# iterrows method for DataFrames is similar: it yields (index label, row) pairs
for index, row in top_movies.iterrows():
    # format explicitly so the line reads "<index> <title>" on both Python 2 and Python 3
    print('%s %s' % (index, row.title))

0 The Shawshank Redemption
1 The Godfather
2 The Godfather: Part II
3 The Dark Knight
4 Pulp Fiction


In [23]:
# create a new column and set a default value (-1 is a placeholder)
movies['year'] = -1

# print as a function call: same output on Python 2, and valid on Python 3
print(movies.head())

   star_rating                     title content_rating   genre  duration  \
0          9.3  The Shawshank Redemption              R   Crime       142   
1          9.2             The Godfather              R   Crime       175   
2          9.1    The Godfather: Part II              R   Crime       200   
3          9.0           The Dark Knight          PG-13  Action       152   
4          8.9              Pulp Fiction              R   Crime       154   

                                         actors_list  year  
0  [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...    -1  
1    [u'Marlon Brando', u'Al Pacino', u'James Caan']    -1  
2  [u'Al Pacino', u'Robert De Niro', u'Robert Duv...    -1  
3  [u'Christian Bale', u'Heath Ledger', u'Aaron E...    -1  
4  [u'John Travolta', u'Uma Thurman', u'Samuel L....    -1  


In [24]:
# loc method allows you to access a DataFrame element by 'label'
movies.loc[0, 'year'] = 1994

In [25]:
# write a for loop to update the year for the first three movies
# head(3) restricts the iteration up front, so there is no need to walk the
# frame and break out once the index reaches 3
for index, row in movies.head(3).iterrows():
    # index is the row label, so .loc writes back into the full DataFrame
    movies.loc[index, 'year'] = get_movie_year(row.title)
    sleep(1)  # wait one second between requests

# print as a function call: same output on Python 2, and valid on Python 3
print(movies.head())

   star_rating                     title content_rating   genre  duration  \
0          9.3  The Shawshank Redemption              R   Crime       142   
1          9.2             The Godfather              R   Crime       175   
2          9.1    The Godfather: Part II              R   Crime       200   
3          9.0           The Dark Knight          PG-13  Action       152   
4          8.9              Pulp Fiction              R   Crime       154   

                                         actors_list  year  
0  [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...  1994  
1    [u'Marlon Brando', u'Al Pacino', u'James Caan']  1972  
2  [u'Al Pacino', u'Robert De Niro', u'Robert Duv...  1974  
3  [u'Christian Bale', u'Heath Ledger', u'Aaron E...    -1  
4  [u'John Travolta', u'Uma Thurman', u'Samuel L....    -1  


In [None]:
'''
Other considerations when accessing APIs:
- Most APIs require you to have an access key (which you should store outside your code)
- Most APIs limit the number of API calls you can make (per day, hour, minute, etc.)
- Not all APIs are free
- Not all APIs are well-documented
- Pay attention to the API version
Python wrapper is another option for accessing an API:
- Set of functions that "wrap" the API code for ease of use
- Potentially simplifies your code
- But, wrapper could have bugs or be out-of-date or poorly documented
'''
Status API Training Shop Blog About