In [68]:
import json
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd
import numpy as np

In [75]:
# Read the scraper configuration. The JSON document must contain a
# "source_url" entry and a "statistics" entry; both are exposed as
# notebook-level constants for the cells below.
with open(".json", "r") as config_file:
    source_dict = json.load(config_file)

SOURCE_URL = source_dict["source_url"]
STATISTICS = source_dict["statistics"]

In [76]:
# Scrape the "standard" statistics table into a DataFrame indexed by
# (player, team), keeping only the columns listed in the configuration.
url, columns = STATISTICS["standard"]["url"], STATISTICS["standard"]["columns"]

# Bound the request so a stalled connection cannot hang the kernel, and fail
# immediately on an HTTP error status instead of parsing an error page and
# crashing later with an opaque AttributeError on None.
response = requests.get(SOURCE_URL.format(url), timeout=30)
response.raise_for_status()
html = BeautifulSoup(response.text, features="html.parser")

# The table markup sits inside an HTML comment within the container div, so
# the comment text is extracted and parsed as a second document.
stats_div = html.find("body").find(id="wrap").find(id="content").find(id="all_stats_standard")
comment = stats_div.find(string=lambda text: isinstance(text, Comment))
table = BeautifulSoup(comment, features="html.parser").find(id="div_stats_standard").find("table")

# Rows carrying a class attribute are skipped (presumably repeated header or
# separator rows -- confirm against the page markup).
standard_stats_list = []
tr_list = table.find("tbody").find_all("tr")
tr_list = [tr for tr in tr_list if not tr.has_attr("class")]
for tr in tr_list:
    # One record per row: data-stat name -> raw cell text, restricted to the
    # configured columns.
    player_stats = {td.get("data-stat"): td.text for td in tr.find_all("td") if td.get("data-stat") in columns}
    standard_stats_list.append(player_stats)

standard_stats = pd.DataFrame(standard_stats_list)
standard_stats = standard_stats.set_index(["player", "team"])
standard_stats

Unnamed: 0_level_0,Unnamed: 1_level_0,nationality,position,age,birth_year,minutes,goals_assists,goals_pens,npxg_xg_assist,goals_per90,assists_per90,goals_assists_per90,goals_pens_per90,goals_assists_pens_per90,xg_per90,xg_assist_per90,xg_xg_assist_per90,npxg_per90,npxg_xg_assist_per90
player,team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Max Aarons,Bournemouth,eng ENG,DF,23,2000,1237,1,0,0.9,0.00,0.07,0.07,0.00,0.07,0.00,0.06,0.06,0.00,0.06
Joshua Acheampong,Chelsea,eng ENG,DF,17,2006,6,0,0,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
Bénie Adama Traore,Sheffield Utd,ci CIV,"FW,MF",20,2002,387,0,0,0.8,0.00,0.00,0.00,0.00,0.00,0.06,0.13,0.19,0.06,0.19
Tyler Adams,Bournemouth,us USA,MF,24,1999,121,0,0,0.1,0.00,0.00,0.00,0.00,0.00,0.00,0.06,0.06,0.00,0.06
Tosin Adarabioyo,Fulham,eng ENG,DF,25,1997,1617,2,2,0.8,0.11,0.00,0.11,0.11,0.11,0.04,0.01,0.05,0.04,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Nicolò Zaniolo,Aston Villa,it ITA,"FW,MF",24,1999,839,2,2,3.6,0.21,0.00,0.21,0.21,0.21,0.28,0.11,0.39,0.28,0.39
Anass Zaroury,Burnley,ma MAR,"FW,MF",22,2000,152,0,0,0.2,0.00,0.00,0.00,0.00,0.00,0.10,0.02,0.12,0.10,0.12
Oleksandr Zinchenko,Arsenal,ua UKR,DF,26,1996,1722,3,1,3.1,0.05,0.10,0.16,0.05,0.16,0.03,0.13,0.16,0.03,0.16
Kurt Zouma,West Ham,fr FRA,DF,28,1994,2838,3,3,2.0,0.10,0.00,0.10,0.10,0.10,0.06,0.01,0.06,0.06,0.06


In [77]:
# Keep only the upper-case country code. The raw value looks like "eng ENG",
# so deleting the lower-case letters alone leaves a leading space (" ENG");
# strip it so equality checks and group-bys on the code behave as expected.
standard_stats["nationality"] = (
    standard_stats["nationality"]
    .str.replace(r"[a-z]", "", regex=True)
    .str.strip()
)
# Remove thousands separators so the column can be cast to a number later.
# regex=False makes the literal match explicit regardless of pandas version.
standard_stats["minutes"] = standard_stats["minutes"].str.replace(",", "", regex=False)
standard_stats[["nationality", "minutes"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,nationality,minutes
player,team,Unnamed: 2_level_1,Unnamed: 3_level_1
Max Aarons,Bournemouth,ENG,1237
Joshua Acheampong,Chelsea,ENG,6
Bénie Adama Traore,Sheffield Utd,CIV,387
Tyler Adams,Bournemouth,USA,121
Tosin Adarabioyo,Fulham,ENG,1617
...,...,...,...
Nicolò Zaniolo,Aston Villa,ITA,839
Anass Zaroury,Burnley,MAR,152
Oleksandr Zinchenko,Arsenal,UKR,1722
Kurt Zouma,West Ham,FRA,2838


In [78]:
# Every column except the two text columns is cast to float.
numerical_columns = standard_stats.columns.difference(["nationality", "position"])
# Blank cells are mapped to NaN first: astype(float) raises ValueError on an
# empty string. This mirrors how the goalkeeper table is converted.
standard_stats[numerical_columns] = (
    standard_stats[numerical_columns]
    .replace("", np.nan)
    .astype(float)
)
standard_stats.dtypes

nationality                  object
position                     object
age                         float64
birth_year                  float64
minutes                     float64
goals_assists               float64
goals_pens                  float64
npxg_xg_assist              float64
goals_per90                 float64
assists_per90               float64
goals_assists_per90         float64
goals_pens_per90            float64
goals_assists_pens_per90    float64
xg_per90                    float64
xg_assist_per90             float64
xg_xg_assist_per90          float64
npxg_per90                  float64
npxg_xg_assist_per90        float64
dtype: object

In [79]:
# Retain only the rows whose "minutes" value is at least 90.
standard_stats = standard_stats.loc[lambda frame: frame["minutes"].ge(90)]
standard_stats.shape

(494, 18)

In [81]:
# Scrape the goalkeeping statistics table into a DataFrame indexed by
# (player, team), keeping only the columns listed in the configuration.
url, columns = STATISTICS["goal_keeping"]["url"], STATISTICS["goal_keeping"]["columns"]

# Bound the request so a stalled connection cannot hang the kernel, and fail
# immediately on an HTTP error status instead of parsing an error page and
# crashing later with an opaque AttributeError on None.
response = requests.get(SOURCE_URL.format(url), timeout=30)
response.raise_for_status()
html = BeautifulSoup(response.text, features="html.parser")

# The table markup sits inside an HTML comment within the container div, so
# the comment text is extracted and parsed as a second document.
stats_div = html.find("body").find(id="wrap").find(id="content").find(id="all_stats_keeper")
comment = stats_div.find(string=lambda text: isinstance(text, Comment))
table = BeautifulSoup(comment, features="html.parser").find(id="div_stats_keeper").find("table")

# Rows carrying a class attribute are skipped (presumably repeated header or
# separator rows -- confirm against the page markup).
keeper_stats_list = []
tr_list = table.find("tbody").find_all("tr")
tr_list = [tr for tr in tr_list if not tr.has_attr("class")]
for tr in tr_list:
    # One record per row: data-stat name -> raw cell text, restricted to the
    # configured columns.
    player_stats = {td.get("data-stat"): td.text for td in tr.find_all("td") if td.get("data-stat") in columns}
    keeper_stats_list.append(player_stats)

keeper_stats = pd.DataFrame(keeper_stats_list)
keeper_stats = keeper_stats.set_index(["player", "team"])
print(keeper_stats.shape)
keeper_stats.head()

(40, 16)


Unnamed: 0_level_0,Unnamed: 1_level_0,gk_games,gk_goals_against,gk_goals_against_per90,gk_shots_on_target_against,gk_saves,gk_save_pct,gk_wins,gk_ties,gk_losses,gk_clean_sheets,gk_clean_sheets_pct,gk_pens_att,gk_pens_allowed,gk_pens_saved,gk_pens_missed,gk_pens_save_pct
player,team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Alisson,Liverpool,28,30,1.07,109,80,73.4,16,8,4,8,28.6,1,1,0,0,0.0
Alphonse Areola,West Ham,31,53,1.77,188,136,74.5,11,9,11,4,12.9,7,5,2,0,28.6
Daniel Bentley,Wolves,5,7,1.64,15,8,53.3,1,0,2,1,33.3,0,0,0,0,
Martin Dúbravka,Newcastle Utd,23,42,1.9,130,87,70.8,9,4,9,5,22.7,5,4,1,0,20.0
Ederson,Manchester City,33,27,0.87,82,56,70.7,24,6,3,10,30.3,3,3,0,0,0.0


In [82]:
# Blank cells become NaN so that the whole frame can be cast to float.
keeper_stats = (
    keeper_stats
    .replace(to_replace="", value=np.nan)
    .astype("float64")
)
keeper_stats.dtypes

gk_games                      float64
gk_goals_against              float64
gk_goals_against_per90        float64
gk_shots_on_target_against    float64
gk_saves                      float64
gk_save_pct                   float64
gk_wins                       float64
gk_ties                       float64
gk_losses                     float64
gk_clean_sheets               float64
gk_clean_sheets_pct           float64
gk_pens_att                   float64
gk_pens_allowed               float64
gk_pens_saved                 float64
gk_pens_missed                float64
gk_pens_save_pct              float64
dtype: object

In [83]:
# Left-join the keeper columns onto the standard table by (player, team);
# rows without a matching keeper entry keep NaN in the keeper columns.
stats = standard_stats.merge(
    keeper_stats,
    how="left",
    on=["player", "team"],
)
stats.shape

(494, 34)