# Scrape the Location Counts

In [2]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

In [3]:
import requests

# Bound the wait so an unresponsive server cannot hang the kernel indefinitely.
response = requests.get(
    "https://www.menuism.com/restaurant-locations/starbucks-coffee-39564",
    timeout=30,
)

# Make a blocked or failed request visible here; an error page would otherwise
# just be parsed downstream as if it were the store listing.
if not response.ok:
    print(f"Request failed with HTTP status {response.status_code}")

In [4]:
# Parse the raw response bytes into a navigable tree using the "html.parser" backend.
soup = BeautifulSoup(markup=response.content, features="html.parser")

In [5]:
# Build a {state name: location count} mapping from the page's list items.
# A usable link is expected to read "<State> Starbucks Coffee locations (<count>)".
data = {}
for item in soup.find_all('li', class_=''):
    link = item.find('a')

    # Skip list items that carry no <a> tag.
    if not link:
        continue

    link_text = link.get_text()
    parts = link_text.split(' Starbucks Coffee locations ')

    # Text that does not split into exactly "<state>" and "(<count>)" is not a state entry.
    if len(parts) != 2:
        continue

    state = parts[0].strip()
    # Trim whitespace *before* removing the parentheses and drop thousands
    # separators, so text such as "(2,362)" or "(73) " still parses.
    count_text = parts[1].strip().strip('()').replace(',', '')
    try:
        count = int(count_text)
        data[state] = count
    except ValueError:
        # Report the offending text instead of silently dropping the state.
        print(f"Unable to extract count from: {link_text}")

data

{}

In [6]:
# Lookup table keyed by case-folded name; built once rather than on every call.
# Includes the District of Columbia under the spellings it commonly appears as.
_STATE_ABBREVIATIONS = {
    name.casefold(): abbreviation
    for name, abbreviation in {
        'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
        'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
        'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
        'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
        'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO',
        'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
        'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
        'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
        'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
        'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
        'District of Columbia': 'DC', 'Washington, D.C.': 'DC', 'Washington D.C.': 'DC',
        'Washington DC': 'DC',
    }.items()
}


def stateabb(state_name):
    """
    Convert a state name to its state abbreviation.

    Matching ignores letter case and leading, trailing, or repeated internal
    whitespace, so "  new   york " resolves the same as "New York".

    Parameters
    ----------
    state_name : str
        Full state name, e.g. "California". The District of Columbia is
        accepted as "District of Columbia" or "Washington, D.C.".

    Returns
    -------
    str
        The two-letter abbreviation, or "Unknown" when the name is not
        recognised or is not a string.
    """
    # Non-string input (None, NaN, ...) cannot name a state.
    if not isinstance(state_name, str):
        return "Unknown"

    # Collapse whitespace runs and case-fold so scraped text matches the table keys.
    normalized = " ".join(state_name.split()).casefold()
    return _STATE_ABBREVIATIONS.get(normalized, "Unknown")

In [7]:
import pandas as pd

# One record per scraped state: its abbreviation and its location count.
data_list = [
    {"state_abbreviation": stateabb(state), "location_count": count}
    for state, count in data.items()
]

# Convert the records into a DataFrame for display and further analysis.
# The columns are named explicitly so the frame keeps its schema even when
# the scrape returned nothing and `data_list` is empty.
df = pd.DataFrame(data_list, columns=["state_abbreviation", "location_count"])
df

In [38]:
from pathlib import Path

# Locate the prepared dataset relative to the current user's home directory
# instead of a hardcoded /Users/<name>/... path, so the notebook also runs on
# other machines. For the original author this presumably resolves to the same
# Downloads location as before.
scraped_csv_path = Path.home() / "Downloads" / "df_lab4_scraped (1).csv"

df = pd.read_csv(scraped_csv_path)
df.head()

Unnamed: 0,State,Coffee Chain,State Abbreviation,Location Count,Population
0,Alaska,Starbucks,AK,24,733391
1,Alabama,Starbucks,AL,73,5024279
2,Arkansas,Starbucks,AR,33,3011524
3,Arizona,Starbucks,AZ,279,7151502
4,California,Starbucks,CA,2362,39538223


In [43]:
# Four-way regional grouping of state names (appears to follow the U.S. Census
# Bureau regions). The District of Columbia is listed under both common
# spellings so its rows are not left without a region, whichever one the
# dataset uses.
region_mapping = {
    'Northeast': ['Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Island','Vermont', 'New Jersey', 'New York', 'Pennsylvania'],
    'Midwest': ['Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin', 'Iowa', 'Kansas', 'Minnesota', 'Missouri', 'Nebraska', 'North Dakota', 'South Dakota'],
    'South': ['Delaware', 'Florida', 'Georgia', 'Maryland', 'North Carolina', 'South Carolina', 'Virginia', 'Washington, D.C.', 'District of Columbia', 'West Virginia', 'Alabama', 'Kentucky', 'Mississippi', 'Tennessee', 'Arkansas', 'Louisiana', 'Oklahoma', 'Texas'],
    'West': ['Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico', 'Utah', 'Wyoming', 'Alaska', 'California', 'Hawaii', 'Oregon', 'Washington']
}

# Invert to {state: region} so the State column can be mapped directly.
state_to_region = {state: region for region, states in region_mapping.items() for state in states}

# States absent from the mapping get NaN here and are excluded by a later groupby("Region").
df["Region"] = df["State"].map(state_to_region)
df.head()

Unnamed: 0,State,Coffee Chain,State Abbreviation,Location Count,Population,Region,Revenue($1k)
0,Alaska,Starbucks,AK,24,733391,West,21600
1,Alabama,Starbucks,AL,73,5024279,South,65700
2,Arkansas,Starbucks,AR,33,3011524,South,29700
3,Arizona,Starbucks,AZ,279,7151502,West,251100
4,California,Starbucks,CA,2362,39538223,West,2125800


# Analyze

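Starbucks generated over $900K in revenue per store in 2022. The cell below applies that per-store figure to every chain's location count to estimate revenue, expressed in thousands of dollars.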

In [44]:
# Estimated revenue in thousands of dollars: every location, regardless of
# chain, is assigned the same per-store figure (900, i.e. $900K).
revenue_per_store_k = 900
df['Revenue($1k)'] = revenue_per_store_k * df['Location Count']

In [45]:
# Preview the first five rows to confirm the new revenue column is present.
df.head(n=5)

Unnamed: 0,State,Coffee Chain,State Abbreviation,Location Count,Population,Region,Revenue($1k)
0,Alaska,Starbucks,AK,24,733391,West,21600
1,Alabama,Starbucks,AL,73,5024279,South,65700
2,Arkansas,Starbucks,AR,33,3011524,South,29700
3,Arizona,Starbucks,AZ,279,7151502,West,251100
4,California,Starbucks,CA,2362,39538223,West,2125800


In [47]:
# Rank every (state, chain) row from highest to lowest estimated revenue.
df.sort_values("Revenue($1k)", ascending=False)

Unnamed: 0,State,Coffee Chain,State Abbreviation,Location Count,Population,Region,Revenue($1k)
4,California,Starbucks,CA,2362,39538223,West,2125800
222,California,McDonald's,CA,1623,39538223,West,1460700
261,Texas,McDonald's,TX,1303,29145505,South,1172700
227,Florida,McDonald's,FL,1142,21538187,South,1027800
68,Massachusetts,Dunkin' Donuts,MA,1101,7029917,Northeast,990900
...,...,...,...,...,...,...,...
215,North Carolina,The Coffee Bean & Tea Leaf,NC,1,10439388,South,900
107,Delaware,Tim Horton's,DE,1,989948,South,900
102,Pennsylvania,Peet's Coffee & Tea,PA,1,13002700,Northeast,900
51,Alabama,Dunkin' Donuts,AL,1,5024279,South,900


In [48]:
# Total estimated revenue per region, summed across all states and chains.
df.groupby("Region")[["Revenue($1k)"]].sum()

Unnamed: 0_level_0,Revenue($1k)
Region,Unnamed: 1_level_1
Midwest,7661700
Northeast,7647300
South,10719000
West,7881300


In [56]:
# Total estimated revenue for each (state, chain) pair, largest first, with a
# fresh 0..n-1 index after sorting.
revenue_by_state_chain = (
    df.groupby(["State", "Coffee Chain"])[["Revenue($1k)"]]
    .sum()
    .sort_values("Revenue($1k)", ascending=False)
    .reset_index()
)
revenue_by_state_chain

Unnamed: 0,State,Coffee Chain,Revenue($1k)
0,California,Starbucks,2125800
1,California,McDonald's,1460700
2,Texas,McDonald's,1172700
3,Florida,McDonald's,1027800
4,Massachusetts,Dunkin' Donuts,990900
...,...,...,...
264,Louisiana,Panera Bread,900
265,Maine,Au Bon Pain,900
266,Virginia,Tim Horton's,900
267,Missouri,Tim Horton's,900


We assume that each store generated roughly 900k in revenue in 2022, so the revenue figures above are simply location counts scaled by a constant. The number of coffee-chain locations in a state is usually consistent with the size of its population. In reality, stores with higher traffic usually generate greater revenue. Based on the tables above, McDonald's and Starbucks are the leading coffee chains in the U.S. in terms of prevalence and revenue. Au Bon Pain is the smallest of these coffee chains; however, it is more common in the Northeast region. The South has the highest total revenue of any region, since it includes more states.