In [1]:
import numpy as np
import pandas as pd
from bqplot import pyplot as plt
from bqplot import *
import requests
import re
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook

# Scraping

In [2]:
# Politico page holding the 2016 primary/caucus results for every state.
url = 'https://www.politico.com/mapdata-2016/2016-election/primary/results/map/president/'
# Always pass a timeout: without one, requests waits indefinitely on a
# stalled connection and the notebook appears to hang.
r = requests.get(url, timeout=30)

# Stop here on any HTTP error status instead of parsing an error page.
r.raise_for_status()

html = r.text

In [3]:
# Parse the downloaded HTML text so elements can be searched by tag and class.
soup = BeautifulSoup(html, 'html.parser')

All the useful data are contained in the `<article>` elements with class `timeline-group`:

In [4]:
# Collect every <article class="timeline-group"> element from the parsed page.
articles = soup.find_all('article', class_='timeline-group')

Run a for loop to store the information for each state:

In [5]:
# Column order of the final results table.
result_columns = ['electiontype', 'state', 'date',
                  'party', 'candidate', 'votes', '%votes', 'delegates']
# Maps the party heading text to the CSS class carried by that party's rows.
party_dict = {'Democratic': 'type-democrat',
              'Republican': 'type-republican'}

# Collect one dict per candidate row and build the DataFrame once at the end.
# Appending to a DataFrame inside the loop copies the whole frame each time,
# and DataFrame.append no longer exists in pandas >= 2.0.
records = []
for article in tqdm_notebook(articles):
    # Election type: second word of the <h4> text with its last character
    # stripped, then normalised to the singular form.
    election_type = article.find('h4').get_text().split()[1][:-1]
    election_type = election_type.replace('Primaries', 'Primary')
    election_type = election_type.replace('Caucuses', 'Caucus')
    # State name and election date.
    state = article.find('h3').get_text().strip()
    election_date = article.find('p').get_text().strip()
    # Each <h5> names a party; its candidate rows carry the matching class.
    for party in article.find_all('h5'):
        party_name = party.get_text()
        rows = article.find_all('tr', attrs={'class': party_dict[party_name]})
        for cand in rows:
            cand_name = cand.find(
                'th', attrs={'class': 'results-name'}).get_text()
            cand_name = cand_name.replace('Winner ', '')
            percentage = cand.find(
                'td', attrs={'class': 'results-percentage'}).get_text()
            votes_text = cand.find(
                'td', attrs={'class': 'results-popular'}).get_text()
            # Vote counts use thousands separators; strip them before int().
            votes = int(votes_text.replace(',', ''))
            # Some rows have no delegates cell: record NaN for those rather
            # than swallowing every possible exception.
            delegates_cell = cand.find('td', attrs={'class': 'delegates-cell'})
            if delegates_cell is None:
                delegates = np.nan
            else:
                delegates = delegates_cell.get_text()
            records.append({
                'electiontype': election_type,
                'state': state,
                'date': election_date,
                'party': party_name,
                'candidate': cand_name,
                'votes': votes,
                '%votes': percentage,
                'delegates': delegates
            })

# Passing columns= keeps the expected schema even when nothing was scraped.
df_results = pd.DataFrame(records, columns=result_columns)

HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




In [6]:
# Index the results by (election type, state, date, party) and sort the index
# so label-based slicing on the MultiIndex works downstream.
index_levels = ['electiontype', 'state', 'date', 'party']
df_results = df_results.set_index(index_levels).sort_index()

# Keep a local copy of the scraped table.
df_results.to_pickle('df_results.pkl')

In [7]:
# Preview the first rows of the indexed results table.
df_results.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,candidate,votes,%votes,delegates
electiontype,state,date,party,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Caucus,Alaska,"March 1, 2016",Democratic,B. Sanders,440,81.6%,14.0
Caucus,Alaska,"March 1, 2016",Democratic,H. Clinton,99,18.4%,4.0
Caucus,Alaska,"March 1, 2016",Democratic,R. De La Fuente,0,0.0%,
Caucus,Alaska,"March 1, 2016",Democratic,Uncommitted,0,0.0%,2.0
Caucus,Alaska,"March 1, 2016",Republican,T. Cruz,7973,36.4%,12.0


# Visualization

In [8]:
# Extract data for each party, only including Primary data.
# Index levels are (electiontype, state, date, party). The column axis is
# spelled out (", :") so the 4-tuple is unambiguously a row selector and all
# four index levels are kept; later cells rely on level 1 being the state.
idx = pd.IndexSlice
dem_df = df_results.loc[idx['Primary', :, :, 'Democratic'], :]
gop_df = df_results.loc[idx['Primary', :, :, 'Republican'], :]

In [9]:
# Load the lookup table that connects each state's FIPS code and name.
# Later cells read its 'Name' and 'FIPS' columns.
codes = pd.read_csv('state_codes.csv')

## GOP

Obtain the winner for each state:

In [10]:
# For every state (index level 1), take the candidate with the most votes.
gop_top = gop_df.groupby(level=1).apply(
    lambda grp: grp.sort_values('votes', ascending=False).iloc[0]['candidate'])

# Attach the winner to the state lookup table; states with no matching
# result keep a missing 'winner'.
gop_winner = codes.set_index('Name').join(gop_top.rename('winner'))

gop_winner.head()

Unnamed: 0_level_0,FIPS,USPS,winner
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,1,AL,D. Trump
Alaska,2,AK,
Arizona,4,AZ,D. Trump
Arkansas,5,AR,D. Trump
California,6,CA,D. Trump


In [11]:
# Scales shared by the bar chart: ordinal for candidates, linear for votes.
x_ord = OrdinalScale()
y_sc = LinearScale()

# One horizontal bar per candidate.
bar_gop = Bars(
    scales={'x': x_ord, 'y': y_sc},
    orientation='horizontal',
    colors=['#00CC00'],
)
# Value labels; the scales are swapped relative to the bars, presumably
# because the bars are drawn horizontally.
label_gop = Label(
    scales={'x': y_sc, 'y': x_ord},
    colors=['#000000'],
    update_on_move=True,
)
ax_x_gop = Axis(scale=x_ord, orientation='vertical')
ax_y_gop = Axis(scale=y_sc, label='# Votes')

# Figure used as the hover tooltip of the Republican map.
bar_fig_gop = Figure(
    marks=[bar_gop, label_gop],
    axes=[ax_x_gop, ax_y_gop],
    padding_x=0.025,
    padding_y=0.025,
    title='General Election - State Polls',
)

In [12]:
# Flat (FIPS, candidate, votes) table: move the state level into a column,
# match it against the lookup table's 'Name', and keep only what the hover
# callback needs.
gop_by_state = gop_df.reset_index(level=1)
gop_with_codes = gop_by_state.merge(codes, left_on='state', right_on='Name')
gop_fips_result = gop_with_codes[['FIPS', 'candidate', 'votes']]

def hover_callback_gop(name, value):
    """Refresh the Republican tooltip bar chart for the hovered state.

    Parameters
    ----------
    name : unused; supplied by the map's hover event.
    value : event payload; ``value['data']['id']`` is read as the state code
        (presumably the FIPS id from the map data - confirm against bqplot).

    The function returns ``None``; it updates ``bar_fig_gop``, ``bar_gop`` and
    ``label_gop`` in place.
    """
    state_code = value['data']['id']
    # Guard first: looking up the name of an unknown code would raise KeyError.
    if state_code not in gop_winner['FIPS'].values:
        return
    bar_fig_gop.title = codes.set_index('FIPS').loc[state_code, 'Name']
    # Filter and sort once; candidates are ordered by ascending vote count.
    state_rows = gop_fips_result[
        gop_fips_result['FIPS'] == state_code].sort_values(by='votes')
    x = state_rows['candidate'].values
    y = state_rows['votes'].values
    bar_gop.x, bar_gop.y = x, y
    # Labels sit at zero on the vote axis, one per candidate, showing the count.
    label_gop.x, label_gop.y, label_gop.text = np.zeros(len(x)), x, y

In [13]:
# Map projection and one colour per candidate listed in the domain.
sc_geo_gop = AlbersUSA(scale_factor=1000)
color_gop = OrdinalColorScale(
    domain=['D. Trump', 'T. Cruz', 'J. Kasich'],
    colors=['#d65454', '#eabc3b', '#3ca0a0'],
)

# Colour each state (keyed by FIPS) by its winner; the grey default is
# configured for the remaining states.
map_styles_gop = {
    'color': dict(zip(gop_winner['FIPS'], gop_winner['winner'])),
    'scales': {'projection': sc_geo_gop, 'color': color_gop},
    'colors': {'default_color': '#E2E2E2'},
}
axis_gop = ColorAxis(scale=color_gop)

# The bar-chart figure is shown as the tooltip of the map.
states_map_gop = Map(
    map_data=topo_load('map_data/USStatesMap.json'),
    tooltip=bar_fig_gop,
    **map_styles_gop
)
map_fig_gop = Figure(
    marks=[states_map_gop],
    axes=[axis_gop],
    title='Election Polls - Republican',
)
# Refresh the tooltip chart whenever a state is hovered.
states_map_gop.on_hover(hover_callback_gop)
map_fig_gop

Figure(axes=[ColorAxis(scale=OrdinalColorScale(colors=['#d65454', '#eabc3b', '#3ca0a0'], domain=['D. Trump', '…

## DEM

In [14]:
# For every state (index level 1), take the candidate with the most votes.
dem_top = dem_df.groupby(level=1).apply(
    lambda grp: grp.sort_values('votes', ascending=False).iloc[0]['candidate'])

# Attach the winner to the state lookup table; states with no matching
# result keep a missing 'winner'.
dem_winner = codes.set_index('Name').join(dem_top.rename('winner'))

In [15]:
# Fresh scales for the Democratic bar chart: ordinal for candidates, linear
# for votes.
x_ord = OrdinalScale()
y_sc = LinearScale()

# One horizontal bar per candidate.
bar_dem = Bars(
    scales={'x': x_ord, 'y': y_sc},
    orientation='horizontal',
    colors=['#00CC00'],
)
# Value labels; the scales are swapped relative to the bars, presumably
# because the bars are drawn horizontally.
label_dem = Label(
    scales={'x': y_sc, 'y': x_ord},
    colors=['#000000'],
    update_on_move=True,
)
ax_x_dem = Axis(scale=x_ord, orientation='vertical')
ax_y_dem = Axis(scale=y_sc, label='# Votes')

# Figure used as the hover tooltip of the Democratic map.
bar_fig_dem = Figure(
    marks=[bar_dem, label_dem],
    axes=[ax_x_dem, ax_y_dem],
    padding_x=0.025,
    padding_y=0.025,
    title='General Election - State Polls',
)

In [16]:
# Display the Democratic bar-chart figure on its own; its bars are only given
# data inside the hover callback.
bar_fig_dem

Figure(axes=[Axis(orientation='vertical', scale=OrdinalScale()), Axis(label='# Votes', scale=LinearScale())], …

In [17]:
# Flat (FIPS, candidate, votes) table: move the state level into a column,
# match it against the lookup table's 'Name', and keep only what the hover
# callback needs.
dem_by_state = dem_df.reset_index(level=1)
dem_with_codes = dem_by_state.merge(codes, left_on='state', right_on='Name')
dem_fips_result = dem_with_codes[['FIPS', 'candidate', 'votes']]

def hover_callback_dem(name, value):
    """Refresh the Democratic tooltip bar chart for the hovered state.

    Parameters
    ----------
    name : unused; supplied by the map's hover event.
    value : event payload; ``value['data']['id']`` is read as the state code
        (presumably the FIPS id from the map data - confirm against bqplot).

    The function returns ``None``; it updates ``bar_fig_dem``, ``bar_dem`` and
    ``label_dem`` in place.
    """
    state_code = value['data']['id']
    # Guard first: looking up the name of an unknown code would raise KeyError.
    if state_code not in dem_winner['FIPS'].values:
        return
    bar_fig_dem.title = codes.set_index('FIPS').loc[state_code, 'Name']
    # Filter and sort once; candidates are ordered by ascending vote count.
    state_rows = dem_fips_result[
        dem_fips_result['FIPS'] == state_code].sort_values(by='votes')
    x = state_rows['candidate'].values
    y = state_rows['votes'].values
    bar_dem.x, bar_dem.y = x, y
    # Labels sit at zero on the vote axis, one per candidate, showing the count.
    label_dem.x, label_dem.y, label_dem.text = np.zeros(len(x)), x, y

In [18]:
# Inspect the per-state Democratic winners; 'winner' is missing for states
# with no matching row in the Primary-only results.
dem_winner

Unnamed: 0_level_0,FIPS,USPS,winner
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,1,AL,H. Clinton
Alaska,2,AK,
Arizona,4,AZ,H. Clinton
Arkansas,5,AR,H. Clinton
California,6,CA,H. Clinton
Colorado,8,CO,
Connecticut,9,CT,H. Clinton
Delaware,10,DE,H. Clinton
District of Columbia,11,DC,
Florida,12,FL,H. Clinton


In [19]:
# Map projection and one colour per candidate listed in the domain.
sc_geo_dem = AlbersUSA(scale_factor=1000)
color_dem = OrdinalColorScale(domain=['B. Sanders', 'H. Clinton'], colors=[
                              '#83bc5e', '#5fa0d6'])

# Colour each state (keyed by FIPS) by its winner; the grey default is
# configured for the remaining states.
map_styles_dem = {'color': dict(zip(dem_winner['FIPS'], dem_winner['winner'])),
              'scales': {'projection': sc_geo_dem, 'color': color_dem}, 'colors': {'default_color': '#E2E2E2'}}
axis_dem = ColorAxis(scale=color_dem)

# The bar-chart figure is shown as the tooltip of the map.
states_map_dem = Map(map_data=topo_load(
    'map_data/USStatesMap.json'), tooltip=bar_fig_dem, **map_styles_dem)
# Title fixed: this figure shows the Democratic results (it was previously
# labelled "Republican", copied from the GOP cell).
map_fig_dem = Figure(marks=[states_map_dem], axes=[axis_dem],
                     title='Election Polls - Democratic')
# Refresh the tooltip chart whenever a state is hovered.
states_map_dem.on_hover(hover_callback_dem)
map_fig_dem

Figure(axes=[ColorAxis(scale=OrdinalColorScale(colors=['#83bc5e', '#5fa0d6'], domain=['B. Sanders', 'H. Clinto…

## Visualization for GOP and DEM

In [20]:
from ipywidgets import VBox

# Stack the Republican map above the Democratic map in a single widget.
VBox([map_fig_gop, map_fig_dem])

VBox(children=(Figure(axes=[ColorAxis(scale=OrdinalColorScale(colors=['#d65454', '#eabc3b', '#3ca0a0'], domain…