## Web Scraping Using Beautiful Soup and pandas

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# in this demo, we will be scraping a list of the top cards in the mobile game Clash Royale
# this list will contain card names, win rates, usage rates, and rates of change for both metrics

url = 'https://statsroyale.com/top/cards'

# a timeout keeps this cell from hanging indefinitely if the site never responds
page = requests.get(url, timeout = 30)

# fail loudly on a 4xx/5xx response instead of silently parsing an error page
page.raise_for_status()

# parse the returned HTML with the lxml parser
html = BeautifulSoup(page.text, 'lxml')

# To quote the Reddit user zanderman12 on r/datascience:
# "Beautiful Soup can do a lot more scraping wise than pd.read_html in my experience.
# pd.read_html is great when there is a table already, but with Beautiful Soup you can grab lots of different information. 
# Just as an example I’ve found the info I need in the alt text of images that I looped through and then formed into a dataframe."

In [2]:
# locate the div that wraps the "Most Popular Cards" table
table = html.find('div', attrs = {'class': 'popularCards__table'})

In [3]:
# every caption div that makes up the table's header row
headers_row = table.find_all('div', attrs = {'class': 'popularCards__headerCaption'})
headers = []

# print the raw elements so we can see what we are working with
for caption in headers_row:
    print(caption)

# one popularCards__headerCaption div has no text (it only creates space between 'Card' and 'Win rate')
# it has to be filtered out before the headers become our df column names,
# otherwise we'd end up with a second column that has no name

<div class="popularCards__headerCaption">Card</div>
<div class="popularCards__headerCaption"></div>
<div class="popularCards__headerCaption">Win rate</div>
<div class="popularCards__headerCaption">Usage</div>


In [4]:
# rebuild the list from scratch so re-running this cell cannot append duplicate names;
# captions that are empty (or whitespace only) are the spacer div and are skipped
headers = [caption.text.strip() for caption in headers_row if caption.text.strip()]

# empty frame whose columns are the scraped header names
df = pd.DataFrame(columns = headers)
df

# we got our headers set
# but Stats Royale also lists % change for win rate and usage of each card
# We're gonna scrape those too, so let's add columns for those values

Unnamed: 0,Card,Win rate,Usage


In [5]:
# one extra (still empty) column per metric to hold its % change
for change_column in ('Win rate change', 'Usage change'):
    df[change_column] = ''
df

Unnamed: 0,Card,Win rate,Usage,Win rate change,Usage change


In [6]:
def _rate_and_change(box):
    """Return the (rate, change) strings held by a win-rate or usage div.

    Assumes the layout seen when this notebook was written: the rate is the
    div's first direct child and the change is its fourth direct child.
    Re-check the indexes if the site's markup changes.
    """
    # .contents lists an element's direct children; the rate is the first one
    rate = box.contents[0].text.strip()

    # the change text includes a % sign and whitespace everywhere, including between
    # the value and the %; split + join removes all whitespace, [:-1] drops the trailing %
    change = ''.join(box.contents[3].text.split())[:-1]
    return rate, change


cards = table.find_all('div', class_ = 'popularCards__card')

# collect plain lists first and build the frame once at the end:
# cheaper than growing the frame row by row, and re-running this cell
# replaces the rows instead of appending duplicates
rows = []
for card in cards:
    # the first popularCards__row holds the card's png image, so the name is at index [1]
    name = card.find_all('div', class_ = 'popularCards__row')[1].text

    winrate, winrate_change = _rate_and_change(
        card.find_all('div', class_ = 'popularCards__winrate')[0]
    )

    # usage stats use the same HTML structure as the win rate
    usage, usage_change = _rate_and_change(
        card.find_all('div', class_ = 'popularCards__usage')[0]
    )

    # order must match the df columns
    rows.append([name, winrate, usage, winrate_change, usage_change])

df = pd.DataFrame(rows, columns = df.columns)

In [7]:
# display the scraped table (the values are still strings at this point)
df

Unnamed: 0,Card,Win rate,Usage,Win rate change,Usage change
0,The Log,62.7,36.3,+4.48,–4.64
1,Goblin Barrel,64.7,32.9,+4.71,+0.72
2,Fireball,66.2,32.8,+4.68,+2.4
3,Mini P.E.K.K.A,67.8,32.0,+4.41,+3.84
4,Skeleton Army,64.8,31.7,+4.09,–1.44
...,...,...,...,...,...
104,Heal Spirit,40.4,0.3,–10.41,–0.08
105,Three Musketeers,44.7,0.2,–6.4,+0.08
106,Monk,48.2,0.2,–7.6,–0.24
107,Goblin Drill,63.2,0.2,+9.31,–0.08


In [8]:
df.info()

# every column is dtype `object`: all our numeric values are strings, so we have to convert them

<class 'pandas.core.frame.DataFrame'>
Index: 109 entries, 0 to 108
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Card             109 non-null    object
 1   Win rate         109 non-null    object
 2   Usage            109 non-null    object
 3   Win rate change  109 non-null    object
 4   Usage change     109 non-null    object
dtypes: object(5)
memory usage: 5.1+ KB


In [9]:
# the plain rate columns hold clean numeric strings, so a direct float cast works
for rate_column in ('Win rate', 'Usage'):
    df[rate_column] = df[rate_column].astype(float)

def change_cleaner(value):
    """Convert a signed change value into a float.

    Parameters
    ----------
    value : str or number
        Text such as '+4.48' or a dash-prefixed negative like '\u20134.64'.
        Unsigned text ('1.5', '0') is treated as non-negative. A value that
        is already numeric is returned as a float, so the conversion can be
        applied to an already-cleaned column without error.

    Returns
    -------
    float
        The signed numeric value.

    Raises
    ------
    ValueError
        If the text that remains after removing the sign is not a number.
    """
    # already numeric (e.g. the cell was re-run): nothing to clean
    if isinstance(value, (int, float)):
        return float(value)

    # en dash, em dash, true minus sign and ASCII hyphen all mean "negative";
    # float() only understands the ASCII hyphen, so the sign is handled by hand
    negative_signs = ('\u2013', '\u2014', '\u2212', '-')

    text = value.strip()
    sign = text[:1]

    if sign == '+':
        return float(text[1:])
    if sign in negative_signs:
        return -abs(float(text[1:]))

    # no sign character at all: keep every digit and treat the value as non-negative
    return float(text)
    
# the change values carry positive and negative signs that have to be
# stripped and turned into properly signed floats
# Stats Royale writes negatives with a dash rather than a minus sign (– vs -),
# so casting those strings straight to float would raise an error
for change_column in ('Win rate change', 'Usage change'):
    df[change_column] = df[change_column].apply(change_cleaner)

In [10]:
# display the table again after the numeric conversion
df

Unnamed: 0,Card,Win rate,Usage,Win rate change,Usage change
0,The Log,62.7,36.3,4.48,-4.64
1,Goblin Barrel,64.7,32.9,4.71,0.72
2,Fireball,66.2,32.8,4.68,2.40
3,Mini P.E.K.K.A,67.8,32.0,4.41,3.84
4,Skeleton Army,64.8,31.7,4.09,-1.44
...,...,...,...,...,...
104,Heal Spirit,40.4,0.3,-10.41,-0.08
105,Three Musketeers,44.7,0.2,-6.40,0.08
106,Monk,48.2,0.2,-7.60,-0.24
107,Goblin Drill,63.2,0.2,9.31,-0.08


In [11]:
df.info()

# nice and clean: the four numeric columns are float64 now, only `Card` is still object

<class 'pandas.core.frame.DataFrame'>
Index: 109 entries, 0 to 108
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Card             109 non-null    object 
 1   Win rate         109 non-null    float64
 2   Usage            109 non-null    float64
 3   Win rate change  109 non-null    float64
 4   Usage change     109 non-null    float64
dtypes: float64(4), object(1)
memory usage: 5.1+ KB
