# HTML parsing with Beautiful Soup: A Mini Project
 * Scrape data from a public website
 * Organize the data as a pandas DataFrame
 * Export the data as an Excel sheet

In [72]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [12]:
# Page to fetch: the BBC Sport Premier League top-scorers listing.
url = "https://www.bbc.com/sport/football/premier-league/top-scorers"


In [21]:
# Download the page and parse it into `soup`.
# An explicit timeout is passed because requests otherwise waits indefinitely
# on a stalled connection, which would hang the notebook cell.
try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
except requests.RequestException as e:
    # Base class of requests' own errors: connection failures, timeouts and
    # the HTTPError raised by raise_for_status(). Anything else is a real bug
    # and is allowed to propagate.
    print(e)
else:
    # Only reached when the request succeeded; `soup` stays undefined otherwise.
    soup = BeautifulSoup(response.content,'html.parser')
    print(soup.find('tbody').find_all('tr', class_="ssrcss-dhlz6k-TableRowBody e1icz100"))

[<tr class="ssrcss-dhlz6k-TableRowBody e1icz100"><td class="ssrcss-fvkmzs-StyledTableData ef9ipf1"><div class="ssrcss-qg0qvg-CellWrapper ef9ipf0" value="1"><div class="ssrcss-wjp01q-RankAndBadgeWrapper e1n6h4mf2"><span class="ssrcss-1xljqjf-Rank e1n6h4mf1">1</span><div class="ssrcss-833zc8-BadgeWrapper e1n6h4mf0"><div class="ssrcss-bdya4g-MobileOnlyWrapper e1v3gc7i1"><div class="ssrcss-1ij97kg-BadgeContainer ezmsq4q1" data-testid="badge-container-liverpool" size="20"><img alt="" aria-hidden="true" class="ssrcss-1knyx38-BadgeImage ezmsq4q0" data-testid="badge-img-liverpool" src="https://static.files.bbci.co.uk/core/website/assets/static/sport/football/liverpool.0d2ced3f9a.svg"/></div></div><div class="ssrcss-dngukv-DesktopOnlyWrapper e1v3gc7i0"><div class="ssrcss-1b9x5pa-BadgeContainer ezmsq4q1" data-testid="badge-container-liverpool" size="24"><img alt="" aria-hidden="true" class="ssrcss-1knyx38-BadgeImage ezmsq4q0" data-testid="badge-img-liverpool" src="https://static.files.bbci.co.uk

In [22]:
# Count how many scorer rows the table body contains.
table_body = soup.find('tbody')
scorer_rows = table_body.find_all('tr', class_="ssrcss-dhlz6k-TableRowBody e1icz100")
print(len(scorer_rows))

26


In [77]:
# Collect one <tr> per player from the table body.
# NOTE(review): these class names look auto-generated by the site's build and
# may change between deployments -- confirm the selectors still match before
# re-running.
players = soup.find('tbody').find_all('tr', class_="ssrcss-dhlz6k-TableRowBody e1icz100")

# One list per output column. They are filled in lockstep, so index i in every
# list refers to the same table row.
player_names, team_names, goals, assists, num_matches, shots = [], [], [], [], [], []

for player in players:
    try:
        # Parse every field before touching the lists, so a bad row never
        # leaves a partial entry behind.
        player_name = player.find('div', class_='ssrcss-m6ah29-PlayerName e1n8xy5b1').get_text(strip=True)
        team_name = player.find('div', class_="ssrcss-qvpga1-TeamsSummary e1n8xy5b0").get_text(strip=True)
        goals_score = int(player.find('div', class_='ssrcss-8k20kk-CellWrapper ef9ipf0').get_text(strip=True))

        # Several numeric cells share one class, so they are picked by position.
        # NOTE(review): the positions (0, 2, -3) are assumed from the page
        # layout at scrape time -- verify if the table's columns change.
        stats = player.find_all('div', class_='ssrcss-150z8d-CellWrapper ef9ipf0')
        assists_made = int(stats[0].get_text(strip=True))
        matches_played = int(stats[2].get_text(strip=True))
        shots_taken = int(stats[-3].get_text(strip=True))
    except (AttributeError, IndexError, ValueError) as e:
        # AttributeError: an expected <div> was missing (find() returned None).
        # IndexError:     the row has fewer stat cells than expected.
        # ValueError:     a cell's text is not an integer.
        # Any other exception is an actual bug and is left to propagate.
        print(f" Skipped a row due to error: {e}")
    else:
        # Append to lists only once the whole row parsed cleanly.
        player_names.append(player_name)
        team_names.append(team_name)
        goals.append(goals_score)
        assists.append(assists_made)
        num_matches.append(matches_played)
        shots.append(shots_taken)

# Create DataFrame; key order here fixes the column order of the table.
data = {
    'player': player_names,
    'team': team_names,
    'matches': num_matches,
    'goals': goals,
    'assists': assists,
    'shots': shots
}

df_players = pd.DataFrame(data)


In [79]:
# Display the assembled player table.
df_players

Unnamed: 0,player,team,matches,goals,assists,shots
0,Mohamed Salah,Liverpool,30,27,17,107
1,E. Haaland,Man City,28,21,3,102
2,A. Isak,Newcastle,26,20,5,72
3,C. Wood,Nottm Forest,29,18,3,53
4,B. Mbeumo,Brentford,30,16,5,65
5,C. Palmer,Chelsea,29,14,7,104
6,Y. Wissa,Brentford,27,14,2,66
7,O. Watkins,Aston Villa,31,13,6,72
8,Matheus Cunha,Wolves,26,13,4,86
9,J. Mateta,Crystal Palace,29,13,2,57


In [80]:
# Print the frame's index range, column dtypes, non-null counts and memory usage.
df_players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   player   26 non-null     object
 1   team     26 non-null     object
 2   matches  26 non-null     int64 
 3   goals    26 non-null     int64 
 4   assists  26 non-null     int64 
 5   shots    26 non-null     int64 
dtypes: int64(4), object(2)
memory usage: 1.3+ KB


In [81]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df_players.describe()

Unnamed: 0,matches,goals,assists,shots
count,26.0,26.0,26.0,26.0
mean,27.576923,12.153846,4.615385,64.153846
std,2.435949,4.847204,3.200961,19.27214
min,21.0,8.0,0.0,38.0
25%,26.25,9.0,3.0,49.75
50%,28.0,10.5,4.5,58.5
75%,29.0,13.75,5.75,72.0
max,31.0,27.0,17.0,107.0


In [85]:
# Save the table as an Excel workbook; index=False leaves the row index out of the sheet.
df_players.to_excel(excel_writer='EPL Top Scorer.xlsx', index=False)

In [78]:
# Sanity check: every column list should have the same length.
print(*(len(column) for column in (player_names, team_names, goals, assists, num_matches, shots)))


26 26 26 26 26 26


In [61]:
# Inspect the scraped goal counts.
goals

[27,
 21,
 20,
 18,
 16,
 14,
 14,
 13,
 13,
 13,
 12,
 12,
 11,
 10,
 9,
 9,
 9,
 9,
 9,
 9,
 8,
 8,
 8,
 8,
 8,
 8]

In [57]:
# Inspect the scraped team names.
team_names

['Liverpool',
 'Man City',
 'Newcastle',
 'Nottm Forest',
 'Brentford',
 'Chelsea',
 'Brentford',
 'Aston Villa',
 'Wolves',
 'Crystal Palace',
 'Bournemouth',
 'Ipswich',
 'Wolves',
 'Fulham',
 'Tottenham',
 'Chelsea',
 'Liverpool',
 'Arsenal',
 'Tottenham',
 'Bournemouth',
 'Man Utd',
 'Brighton',
 'Aston Villa',
 'West Ham',
 'Brighton',
 'Liverpool']

In [44]:
# Inspect the scraped player names.
player_names

['Mohamed Salah',
 'E. Haaland',
 'A. Isak',
 'C. Wood',
 'B. Mbeumo',
 'C. Palmer',
 'Y. Wissa',
 'O. Watkins',
 'Matheus Cunha',
 'J. Mateta',
 'J. Kluivert',
 'L. Delap',
 'J. Strand Larsen',
 'R. Jiménez',
 'J. Maddison',
 'N. Jackson',
 'L. Díaz',
 'K. Havertz',
 'B. Johnson',
 'Evanilson',
 'Bruno Fernandes',
 'João Pedro',
 'M. Rogers',
 'J. Bowen',
 'D. Welbeck',
 'C. Gakpo']

In [69]:
# Inspect the scraped assist counts.
assists

[17, 3, 5, 3, 5, 7, 2, 6, 4, 2, 6, 2, 2, 3, 5, 5, 5, 3, 2, 0, 9, 6, 6, 5, 4, 3]

In [70]:
# Inspect the scraped matches-played counts.
num_matches

[30,
 28,
 26,
 29,
 30,
 29,
 27,
 31,
 26,
 29,
 28,
 30,
 29,
 30,
 28,
 24,
 29,
 21,
 27,
 24,
 29,
 25,
 30,
 27,
 24,
 27]

In [71]:
# Inspect the scraped shot counts.
shots

[107,
 102,
 72,
 53,
 65,
 104,
 66,
 72,
 86,
 57,
 56,
 60,
 44,
 79,
 38,
 65,
 57,
 52,
 49,
 56,
 78,
 43,
 48,
 71,
 44,
 44]