Items used during the workshop:
https://tinyurl.com/photographer-density-in-the-us # Python notebook from the workshop.
https://www.crummy.com/software/BeautifulSoup/bs4/doc/ # Beautiful Soup Documentation.
https://elitedatascience.com/python-web-scraping-libraries # Different web scraping libraries in Python.


Environment setup

In [None]:
# Import pertinent packages
import requests                                              # use to retrieve data from the web
from bs4 import BeautifulSoup                                # use to convert raw HTML into a useable programming object
import time, re                                              # use time to calculate elapsed time code takes to use
                                                             # use re for some natural language processing
import pandas as pd                                          # use pandas for data manipulation
from plotly.offline import init_notebook_mode, iplot         # use plotly for sweet visuals

Create URLs for each state

In [None]:
# Base URL of the photography directory project's US listings; each state's
# page address is built by appending to this string
url = 'https://www.photographydirectoryproject.com/professional-photographers/usa/'

# Names of all 50 United States in alphabetical order, grouped by first letter
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas',
    'California', 'Colorado', 'Connecticut',
    'Delaware',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky',
    'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
    'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina', 'South Dakota',
    'Tennessee', 'Texas',
    'Utah',
    'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

# Sanity check: the notebook displays this value, which should be 50
len(states)

Count number of photographers for the 50 United States

In [None]:
# Make a list of the 50 unique urls, one per state, in the same order as `states`.
# Each link is the base url followed by the state name in lowercase with spaces
# replaced by hyphens ('New York' -> 'new-york') and a trailing slash.
# Single-word names contain no spaces, so the same expression handles them too.
states_links = [url + state.replace(' ', '-').lower() + '/' for state in states]

# Make sure it worked: the notebook displays the first five links
states_links[:5]

Visualize data in a table

In [None]:
# Number of photographers found for each state, keyed by state name
photographers_by_state = {}

# Pattern for the results counter this scrape looks for in each page:
#     <span class="ia-sorting__found">Listings found: 123</span>
# Only digits (plus optional thousands separators) are captured, so the match
# cannot run on into other markup that happens to share the same line.
listings_found = re.compile(
    r'<span class="ia-sorting__found">Listings found:\s*(\d[\d,]*)\s*</span>'
)

# Start a timer (perf_counter is the clock intended for measuring elapsed time)
s = time.perf_counter()

# Loop through each state's url, request the HTML, turn the HTML into a
# BeautifulSoup object, and pull the number of listed photographers out of it.
# A page without the counter is recorded as 0 photographers for that state.
for state, unique_url in zip(states, states_links):
    try:
        # The timeout keeps one unresponsive page from hanging the whole loop
        html = requests.get(unique_url, timeout=30)
    except requests.RequestException as error:
        # A failed request is not the same thing as "no photographers":
        # report it, record 0 so every state still has an entry, and carry on.
        print('Could not retrieve {}: {}'.format(unique_url, error))
        photographers_by_state[state] = 0
        continue

    soup = BeautifulSoup(html.content, 'lxml')
    found = listings_found.search(str(soup))
    if found:
        photographers_by_state[state] = int(found.group(1).replace(',', ''))
    else:
        photographers_by_state[state] = 0

# Print how long it took for this code to execute
print('It took {:.2f} seconds to run this code.'.format(time.perf_counter()-s))

Get the code for each of the 50 United States and add it as a new column in the table

In [None]:
# Convert the {state: count} dictionary into a two-column table with one
# (state, total) row per dictionary entry
state_totals = list(photographers_by_state.items())
photographers_table = pd.DataFrame(data=state_totals, columns=['state', 'total'])

# Preview the first rows to confirm the table was built correctly
photographers_table.head()

In [None]:
# Load a public plotly dataset of US state data; only its state codes are used here
temperary_table = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_us_ag_exports.csv')

# Look up each state's code by state name instead of copying the column by row
# position, so the codes stay correct even if the CSV lists the states in a
# different order than our table does.
# NOTE(review): assumes the CSV has a 'state' column holding full state names
# (stripped here in case they are padded with whitespace) - confirm.
code_by_state = pd.Series(temperary_table['code'].values,
                          index=temperary_table['state'].str.strip())
photographers_table['code'] = photographers_table['state'].map(code_by_state)

# Make sure the column was attached correctly
photographers_table.head()

Visualize data in a graphic

In [None]:
# Initialise plotly's offline notebook mode before drawing anything
init_notebook_mode()

# Six-stop colour scale: each entry pairs a position in [0, 1] with an rgb
# colour, running from a pale lavender at 0.0 to a deep purple at 1.0
scale = [
    [0.0, 'rgb(242,240,247)'],
    [0.2, 'rgb(218,218,235)'],
    [0.4, 'rgb(188,189,220)'],
    [0.6, 'rgb(158,154,200)'],
    [0.8, 'rgb(117,107,177)'],
    [1.0, 'rgb(84,39,143)'],
]

# A single choropleth trace: states are located by their code, shaded by their
# photographer total, and labelled with the state name
data = [{
    'type': 'choropleth',
    'colorscale': scale,
    'autocolorscale': False,
    'locations': photographers_table['code'],
    'z': photographers_table['total'].astype(float),
    'locationmode': 'USA-states',
    'text': photographers_table['state'],
    'marker': {'line': {'color': 'rgb(255,255,255)', 'width': 2}},
    'colorbar': {'title': "Total Photographers"},
}]

# Figure title plus a map restricted to the USA with white lakes
layout = {
    'title': 'Photographer Density in the United States as of 15 Mar 2018<br>(Hover for breakdown)',
    'geo': {
        'scope': 'usa',
        'projection': {'type': 'albers usa'},
        'showlakes': True,
        'lakecolor': 'rgb(255, 255, 255)',
    },
}

# Assemble the figure and render it in the notebook
fig = {'data': data, 'layout': layout}
iplot(fig)

