In [25]:
%matplotlib inline
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup

# Using a Web API

The objective for class today is to augment the blockbuster box office data with an inflation-adjusted price.  Rather than using the spreadsheet of data we found previously, today we will use a web API that does the conversion work for us.

But before we dive into that, let's take a step back and think about the following questions and concepts:

        ○ Transferring data across the web 
			§ Data representation  -- numbers / floats / strings
			§ Serialization  
		○ Two main serialization technologies
			§ XML
			§ JSON
		○ JSON is the clear winner
		○ JSON and Python get along great
		○ Dictionaries are the key!
		
		○ What do we mean by an API ?
			§ Some website has data we want… Some website has the ability to compute something for us
			§ How do we ask the website to do some work for us?
			§ Query String
			§ REST (Representational State Transfer)  -- Next week


In [65]:
data= {'year': [1954, 1955, 2001], 'price': [0.38, 0.44, 2.59]}

In [67]:
data['price']

[0.38, 0.44, 2.59]

In [68]:
x = json.dumps(data)

In [69]:
type(x)

str

In [71]:
data = None

In [73]:
data = json.loads(x)


In [75]:
type(data)

dict

In [76]:
data.keys()

dict_keys(['year', 'price'])

In [77]:
data['year']

[1954, 1955, 2001]

In [17]:
class Foo:
    """Tiny example class used to show that an instance's attributes live in ``__dict__``.

    The three constructor arguments are stored unchanged as the instance
    attributes ``x``, ``y`` and ``z``.
    """

    def __init__(self, x, y, z):
        self.x = x
        self.y = y
        self.z = z

    def bar(self):
        """Do nothing and return ``None``.

        Declared with ``self`` so it can be called on an instance; without it,
        ``t.bar()`` raises ``TypeError`` because Python passes the instance as
        the first positional argument.
        """
        pass

t = Foo(1,2,3)

In [23]:
t.__dict__

{'x': 1, 'y': 2, 'z': 3}

In [18]:
json.dumps(t.__dict__)

'{"y": 2, "x": 1, "z": 3}'

###  Using requests to get some json data


In [92]:
r = requests.get("https://www.statbureau.org/get-data-json?country=united-states")

In [93]:
dir(r)

['__attrs__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_content',
 '_content_consumed',
 'apparent_encoding',
 'close',
 'connection',
 'content',
 'cookies',
 'elapsed',
 'encoding',
 'headers',
 'history',
 'is_permanent_redirect',
 'is_redirect',
 'iter_content',
 'iter_lines',
 'json',
 'links',
 'ok',
 'raise_for_status',
 'raw',
 'reason',
 'request',
 'status_code',
 'text',
 'url']

We can look at the results with either ``r.text`` or EVEN BETTER ``r.json()`` which will parse the JSON text into a real data object.

In [96]:
r.json()

[{'Country': 0,
  'InflationRate': -0.16221508641790922,
  'InflationRateFormatted': '-0.16',
  'InflationRateRounded': -0.16,
  'Month': '/Date(1467345600000)/',
  'MonthFormatted': '2016-07-01'},
 {'Country': 0,
  'InflationRate': 0.3338383922476232,
  'InflationRateFormatted': '0.33',
  'InflationRateRounded': 0.33,
  'Month': '/Date(1464753600000)/',
  'MonthFormatted': '2016-06-01'},
 {'Country': 0,
  'InflationRate': 0.40750477512005717,
  'InflationRateFormatted': '0.41',
  'InflationRateRounded': 0.41,
  'Month': '/Date(1462075200000)/',
  'MonthFormatted': '2016-05-01'},
 {'Country': 0,
  'InflationRate': 0.4741067979104026,
  'InflationRateFormatted': '0.47',
  'InflationRateRounded': 0.47,
  'Month': '/Date(1459483200000)/',
  'MonthFormatted': '2016-04-01'},
 {'Country': 0,
  'InflationRate': 0.4306000143392757,
  'InflationRateFormatted': '0.43',
  'InflationRateRounded': 0.43,
  'Month': '/Date(1456808400000)/',
  'MonthFormatted': '2016-03-01'},
 {'Country': 0,
  'Inflat

In [82]:
inflation = json.loads(r.text)

In [91]:
inflation[-1]

{'Country': 0,
 'InflationRate': 0,
 'InflationRateFormatted': '0.00',
 'InflationRateRounded': 0,
 'Month': '/Date(-1796065200000)/',
 'MonthFormatted': '1913-02-01'}

In [15]:
y = json.loads(x)
type(y)

dict

In [106]:
adjusted = requests.get("https://www.statbureau.org/calculate-inflation-price-json?country=united-states&start=1954/1/1&end=2016/8/1&amount=0.32")

In [107]:
adjusted.status_code


200

In [119]:
type(adjusted.json())

str

In [121]:
import math
math.nan

nan

In [135]:
def get_adjusted_price(year, price, end="2016/8/1", timeout=10):
    """Return ``price`` from January 1 of ``year`` adjusted for inflation up to ``end``.

    Calls the statbureau.org ``calculate-inflation-price-json`` endpoint for
    the United States. The decoded JSON body is expected to be a string holding
    a formatted amount, so ``$``, ``,`` and spaces are stripped before the
    value is converted to ``float``.

    Parameters
    ----------
    year : int
        Start year; the start date is sent as ``<year>/1/1``.
    price : int or float
        Amount to convert.
    end : str, optional
        End date in ``year/month/day`` form. Defaults to ``"2016/8/1"``, the
        date this function always used before the parameter existed.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot hang
        a long ``DataFrame.apply`` run.

    Returns
    -------
    float
        The adjusted amount, or ``math.nan`` when the request fails, the
        status code is not 200, or the body cannot be parsed as a number.
    """
    req_url = "https://www.statbureau.org/calculate-inflation-price-json?country=united-states&start={}/1/1&end={}&amount={}"
    try:
        resp = requests.get(req_url.format(year, end, price), timeout=timeout)
    except requests.RequestException:
        # Network trouble gets the same "no value" sentinel as a bad status,
        # so one failed lookup does not abort a whole column computation.
        return math.nan

    if resp.status_code != 200:
        return math.nan

    try:
        return float(resp.json().replace('$', '').replace(',', '').replace(' ', ''))
    except (ValueError, AttributeError):
        # ValueError: body is not JSON or not a number after cleaning.
        # AttributeError: decoded JSON is not a string (no ``.replace``).
        return math.nan

print(get_adjusted_price(1990,1.00))

1.91


### Don't work too hard

Concatenating strings with ``+`` is hard and error-prone.  Use the Python tools for formatted strings!

In [115]:
"hello %s  :::::: %d" % ('world', 324234)

'hello world  :::::: 324234'

In [116]:
"hello {} :::: {}".format('world', 123123123)

'hello world :::: 123123123'

## Now let's get the box office data

In [27]:
page = requests.get('http://www.the-numbers.com/movie/records/All-Time-Domestic-Box-Office')
soup = BeautifulSoup(page.text,"lxml")

In [44]:
def unformat_money(s):
    '''
    Convert a dollar-formatted string such as "$1,234" into a number.

    Dollar signs and commas are dropped; the remainder is parsed as a float
    when it contains a decimal point and as an int otherwise.
    '''
    cleaned = "".join(ch for ch in s if ch not in "$,")
    to_number = float if "." in cleaned else int
    return to_number(cleaned)


In [132]:
# Collect one list per output column; each table row appends one value to every list.
d = {
    'year': [],
    'title': [],
    'domestic': [],
    'international': [],
    'worldwide': [],
}
rows = soup.find_all('tr')
# The first <tr> is skipped (presumably the header row - confirm against the page).
for row in rows[1:]:
    cells = row.find_all('td')
    d['year'].append(int(cells[1].text))
    d['title'].append(cells[2].find('a').text)
    # Cells 3-5 are the three dollar-formatted columns, in this order.
    for position, column in enumerate(('domestic', 'international', 'worldwide'), start=3):
        d[column].append(unformat_money(cells[position].text))


In [133]:
df = pd.DataFrame(d)
df.head()

Unnamed: 0,domestic,international,title,worldwide,year
0,936662225,1122000000,Star Wars Ep. VII: The Force Awakens,2058662225,2015
1,760507625,2023411357,Avatar,2783918982,2009
2,658672302,1548943366,Titanic,2207615668,1997
3,652198010,1018130015,Jurassic World,1670328025,2015
4,623279547,896200000,The Avengers,1519479547,2012


The function we wrote above won't work directly, because we want to use the ``apply`` method to iterate over each row of the dataframe and extract the year and domestic gross from each row.  ``apply`` expects a function that takes a SINGLE parameter, not TWO.  That single parameter corresponds to a row or a column, depending on the value of the ``axis`` parameter (``axis=1`` passes rows).  So our wrapper has to unpack the row, call the ``get_adjusted_price`` function, and simply pass on the return value.

In [136]:
def wrapper(the_row):
    """Adapt ``get_adjusted_price`` to the one-argument form ``DataFrame.apply`` needs.

    Reads the ``year`` and ``domestic`` fields of ``the_row`` and returns
    whatever ``get_adjusted_price`` returns for them.
    """
    release_year, domestic_gross = the_row.year, the_row.domestic
    return get_adjusted_price(release_year, domestic_gross)

df['adj_domestic'] = df.apply(wrapper, axis=1)

In [130]:
df.head()

Unnamed: 0,domestic,international,title,worldwide,year,adj_domestic
0,936662225,1122000000,Star Wars Ep. VII: The Force Awakens,2058662225,2015,959938000.0
1,760507625,2023411357,Avatar,2783918982,2009,870549500.0
2,658672302,1548943366,Titanic,2207615668,1997,999416900.0
3,652198010,1018130015,Jurassic World,1670328025,2015,668404900.0
4,623279547,896200000,The Avengers,1519479547,2012,664638700.0


In [131]:
df.sort_values(by=['adj_domestic'],ascending=False)

Unnamed: 0,domestic,international,title,worldwide,year,adj_domestic
8,460998007,325600000,Star Wars Ep. IV: A New Hope,786598007,1977,1.906148e+09
89,260000000,210700000,Jaws,470700000,1975,1.205553e+09
12,435110554,357854772,ET: The Extra-Terrestrial,792965326,1982,1.113915e+09
2,658672302,1548943366,Titanic,2207615668,1997,9.994169e+08
0,936662225,1122000000,Star Wars Ep. VII: The Force Awakens,2058662225,2015,9.599380e+08
73,290271960,243900000,Star Wars Ep. V: The Empire Strikes Back,534171960,1980,9.107311e+08
1,760507625,2023411357,Avatar,2783918982,2009,8.705495e+08
56,309205079,263500000,Star Wars Ep. VI: Return of the Jedi,572705079,1983,7.623901e+08
15,422780140,564700000,The Lion King,987480140,1994,6.978105e+08
7,474544677,552500000,Star Wars Ep. I: The Phantom Menace,1027044677,1999,6.967526e+08
