In [4]:
from google.colab import files
uploaded = files.upload()

Saving Player_Stats.html to Player_Stats.html


In [17]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

class EuropeanFootballAnalysis:
    def __init__(self, filename="Player_Stats.html"):
        self.file = filename
        self.raw_data = None
        self.cleaned_data = None

        self.position_map = {
            'GK': 'Goalkeeper',
            'DF': 'Defender',
            'MF': 'Midfielder',
            'FW': 'Forward'
        }

        self.colors = {
            'Premier League': '#38003c',
            'La Liga': '#eda132',
            'Bundesliga': '#d3010c',
            'Serie A': '#008fd8',
            'Ligue 1': '#091c3e'
        }

        self.team_colors = {
            'Manchester City': '#6CABDD',
            'Arsenal': '#EF0107',
            'Liverpool': '#C8102E',
            'Manchester United': '#DA291C',
            'Tottenham': '#FFFFFF',
            'Chelsea': '#034694',
            'Newcastle United': '#241F20',
            'Brighton': '#0057B8',
            'Aston Villa': '#95BFE5',
            'West Ham': '#7A263A',

            'Real Madrid': '#FEBE10',
            'Barcelona': '#A50044',
            'Atletico Madrid': '#CB3524',
            'Sevilla': '#FFFFFF',
            'Real Betis': '#00954C',
            'Villarreal': '#FFE667',
            'Real Sociedad': '#00529F',
            'Athletic Club': '#EE2523',

            'Inter': '#010E80',
            'Milan': '#FB090B',
            'Juventus': '#000000',
            'Napoli': '#12A0D7',
            'Roma': '#8E1F2F',
            'Lazio': '#FFFFFF',
            'Atalanta': '#1D5EA8',
            'Fiorentina': '#482E92',

            'Bayern Munich': '#0066B3',
            'Dortmund': '#FDE100',
            'RB Leipzig': '#DD0741',
            'Leverkusen': '#E32219',
            'Eintracht Frankfurt': '#E21A23',
            'Wolfsburg': '#65FF00',
            'Freiburg': '#E30613',
            "M'Gladbach": '#000000',

            'Paris S-G': '#004170',
            'Marseille': '#00B9F1',
            'Lyon': '#D61C28',
            'Monaco': '#E30613',
            'Lille': '#E21A23',
            'Rennes': '#E21A23',
            'Lens': '#FFDC00',
            'Nice': '#000000'
        }

    def scrape(self):
        with open(self.file, "r", encoding="utf-8") as f:
            html = f.read()

        page = BeautifulSoup(html, "lxml")

        table = None
        for t in page.find_all("table"):
            cap = t.find("caption")
            if cap and "Player Standard Stats" in cap.get_text():
                table = t
                break

        if table is None:
            raise ValueError("Player Standard Stats table not found")

        thead = table.find("thead")
        header_rows = thead.find_all("tr")
        headers = [th.get_text() for th in header_rows[-1].find_all(["th", "td"])]

        rows = []
        tbody = table.find("tbody")
        for tr in tbody.find_all("tr"):
            if tr.get("class") and "thead" in tr.get("class"):
                continue

            cells = [c.get_text() for c in tr.find_all(["th", "td"])]
            if not cells:
                continue

            if len(cells) > len(headers):
                cells = cells[-len(headers):]
            if len(cells) < len(headers):
                cells += [""] * (len(headers) - len(cells))

            rows.append(cells)

        self.raw_data = pd.DataFrame(rows, columns=headers)
        return self.raw_data

    def clean_data(self):
      if self.raw_data is None:
        self.scrape()
      df = self.raw_data.copy()

      df = df.loc[:, ~df.columns.duplicated()]
      df.replace("", np.nan, inplace=True)

      if "Nation" in df.columns:
        df["Nation"] = df["Nation"].astype(str).str.extract(r"([A-Z]{3})", expand=False)

      if "Comp" in df.columns:
        df["League"] = df["Comp"].astype(str).str.replace(r"^[a-z]{2}\s+", "", regex=True)

      if "Pos" in df.columns:
        df["Primary_Position"] = df["Pos"].astype(str).apply(self._extract_primary_position)

      if "Min" in df.columns:
        df["Min"] = pd.to_numeric(df["Min"].astype(str).str.replace(",", "", regex=False), errors="coerce")

      text_cols = ["Player", "Squad", "Comp", "League", "Pos", "Primary_Position", "Nation"]

      for c in df.columns:
        if c not in text_cols:
            df[c] = pd.to_numeric(df[c], errors="coerce")

      df.fillna(0, inplace=True)

      self.cleaned_data = df
      return self.cleaned_data



    def _extract_primary_position(self, pos_str):
        s = str(pos_str)
        if "," in s:
          s = s.split(",")[0]
        for p in ["GK", "DF", "MF", "FW"]:
          if s.startswith(p):
            return p
        return "Unknown"

    def add_derived_metrics(self):
        df = self.cleaned_data.copy()
        df["Minutes_per_Game"] = np.where(df["MP"] > 0, df["Min"] / df["MP"], 0)

        if "G+A" in df.columns:
            ga = pd.to_numeric(df["G+A"], errors="coerce").fillna(0)
        else:
            ga = pd.to_numeric(df.get("Gls", 0), errors="coerce").fillna(0) + pd.to_numeric(df.get("Ast", 0), errors="coerce").fillna(0)

        df["Goal_Contribution_Rate"] = np.where(df["90s"] > 0, ga / df["90s"], 0)
        self.cleaned_data = df
        return self.cleaned_data

    def find_top_scorer(self):
        df = self.cleaned_data
        m = df["Gls"].max()
        return df.loc[df["Gls"] == m, ["Player", "Squad", "Gls", "League"]]

    def find_playmaker(self):
        df = self.cleaned_data
        m = df["Ast"].max()
        return df.loc[df["Ast"] == m, ["Player", "Squad", "Ast", "League"]]

    def find_ironman(self):
        df = self.cleaned_data
        m = df["Min"].max()
        return df.loc[df["Min"] == m, ["Player", "Squad", "Min", "MP"]]

    def find_efficient_striker(self):
        df = self.cleaned_data
        d = df[(df["Primary_Position"] == "FW") & (df["Min"] > 1000)].copy()
        d["Goals_per_Minute"] = np.where(d["Min"] > 0, d["Gls"] / d["Min"], 0)
        m = d["Goals_per_Minute"].max()
        return d.loc[d["Goals_per_Minute"] == m, ["Player", "Squad", "Gls", "Min", "Goals_per_Minute"]]

    def find_most_disciplined(self):
        df = self.cleaned_data
        d = df[df["Min"] > 1000].copy()
        d["Discipline_Score"] = np.where(d["90s"] > 0, (d.get("CrdY", 0) + d.get("CrdR", 0)) / d["90s"], 0)
        m = d["Discipline_Score"].max()
        return d.loc[d["Discipline_Score"] == m, ["Player", "Squad", "CrdY", "CrdR", "Discipline_Score", "Primary_Position"]]

    def data_quality_report(self):
      df = self.cleaned_data if self.cleaned_data is not None else self.raw_data
      missing = df.isna().sum()
      dp_players = df["Player"].value_counts()
      dp_players = dp_players[dp_players > 1]
      return missing, dp_players

    def handle_transfers(self):
      df = self.cleaned_data.copy()

      transferred = df[df["Player"].duplicated(keep=False)]
      sum_cols = ["Min", "MP", "Starts", "Gls", "Ast", "G+A", "CrdY", "CrdR"]
      sum_cols = [c for c in sum_cols if c in df.columns]

      rate_cols = [c for c in df.columns if "/90" in c]

      agg = {c: "sum" for c in sum_cols}
      agg.update({c: "mean" for c in rate_cols})
      agg.update({c: "first" for c in df.columns if c not in agg})

      result = transferred.groupby("Player", as_index=False).agg(agg)
      return result


In [9]:
class1 = EuropeanFootballAnalysis()
class1.scrape()
df = class1.clean_data()

df[["Pos", "Primary_Position"]].head(50)


Unnamed: 0,Pos,Primary_Position
0,DF,DF
1,"MF,FW",MF
2,MF,MF
3,FW,FW
4,DF,DF
5,MF,MF
6,DF,DF
7,MF,MF
8,"FW,MF",FW
9,DF,DF


In [None]:
pd.set_option("display.max_columns", None)
class1 = EuropeanFootballAnalysis()
class1.scrape()
df = class1.clean_data()
df.head(100)



Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,Min,90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,G+A-PK,Matches,League,Primary_Position
0,1,Max Aarons,ENG,DF,Bournemouth,eng Premier League,23,2000,20,13.0,1237,13.7,0.0,1,1,0,0,0,1,0,0.07,0.0,eng Premier League,DF
1,2,Brenden Aaronson,USA,"MF,FW",Union Berlin,de Bundesliga,22,2000,30,14.0,1267,14.1,2.0,2,4,2,0,0,3,1,0.28,0.0,Bundesliga,MF
2,3,Paxten Aaronson,USA,MF,Eintracht Frankfurt,de Bundesliga,19,2003,7,1.0,101,1.1,0.0,1,1,0,0,0,0,0,0.89,0.0,Bundesliga,MF
3,4,Keyliane Abdallah,FRA,FW,Marseille,fr Ligue 1,17,2006,1,0.0,4,0.0,0.0,0,0,0,0,0,0,0,0.00,0.0,Ligue 1,FW
4,5,Yunis Abdelhamid,MAR,DF,Reims,fr Ligue 1,35,1987,31,31.0,2781,30.9,4.0,0,4,3,1,1,5,0,0.10,0.0,Ligue 1,DF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,Zeki Amdouni,SUI,FW,Burnley,eng Premier League,22,2000,34,27.0,1953,21.7,5.0,1,6,4,1,1,2,0,0.23,0.0,eng Premier League,FW
96,97,Kelvin Amian,FRA,DF,Nantes,fr Ligue 1,25,1998,12,11.0,798,8.9,0.0,1,1,0,0,0,2,0,0.11,0.0,Ligue 1,DF
97,98,Bruno Amione,ARG,DF,Hellas Verona,it Serie A,21,2002,10,7.0,579,6.4,0.0,0,0,0,0,0,2,0,0.00,0.0,Serie A,DF
98,99,Michael Amir Murillo,PAN,"DF,MF",Marseille,fr Ligue 1,27,1996,16,10.0,997,11.1,3.0,2,5,3,0,0,3,0,0.45,0.0,Ligue 1,DF


In [None]:
pd.set_option("display.max_columns", None)

pd.set_option("display.max_rows", None)
df


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,Min,90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,G+A-PK,Matches,League,Primary_Position
0,1,Max Aarons,ENG,DF,Bournemouth,eng Premier League,23,2000,20,13.0,1237,13.7,0.0,1,1,0,0,0,1,0,0.07,0.0,eng Premier League,DF
1,2,Brenden Aaronson,USA,"MF,FW",Union Berlin,de Bundesliga,22,2000,30,14.0,1267,14.1,2.0,2,4,2,0,0,3,1,0.28,0.0,Bundesliga,MF
2,3,Paxten Aaronson,USA,MF,Eintracht Frankfurt,de Bundesliga,19,2003,7,1.0,101,1.1,0.0,1,1,0,0,0,0,0,0.89,0.0,Bundesliga,MF
3,4,Keyliane Abdallah,FRA,FW,Marseille,fr Ligue 1,17,2006,1,0.0,4,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,Ligue 1,FW
4,5,Yunis Abdelhamid,MAR,DF,Reims,fr Ligue 1,35,1987,31,31.0,2781,30.9,4.0,0,4,3,1,1,5,0,0.1,0.0,Ligue 1,DF
5,6,Salis Abdul Samed,GHA,MF,Lens,fr Ligue 1,23,2000,27,17.0,1519,16.9,0.0,0,0,0,0,0,2,0,0.0,0.0,Ligue 1,MF
6,7,Nabil Aberdin,FRA,DF,Getafe,es La Liga,20,2002,2,2.0,180,2.0,0.0,0,0,0,0,0,0,0,0.0,0.0,La Liga,DF
7,8,Laurent Abergel,FRA,MF,Lorient,fr Ligue 1,30,1993,33,32.0,2860,31.8,2.0,1,3,2,0,0,4,0,0.09,0.0,Ligue 1,MF
8,9,Matthis Abline,FRA,"FW,MF",Nantes,fr Ligue 1,20,2003,22,12.0,1044,11.6,5.0,0,5,5,0,0,1,0,0.43,0.0,Ligue 1,MF
9,10,Abner,BRA,DF,Real Betis,es La Liga,23,2000,23,15.0,1400,15.6,0.0,1,1,0,0,0,3,0,0.06,0.0,La Liga,DF


In [None]:
df[df.isna().any(axis=1)]


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,Min,90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,G+A-PK,Matches,League,Primary_Position


In [10]:
class1 = EuropeanFootballAnalysis()

class1.scrape()          # load raw_data
class1.clean_data()      # clean + prepare data
class1.add_derived_metrics()  # add calculated columns


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,PK,PKatt,CrdY,CrdR,G+A-PK,Matches,League,Primary_Position,Minutes_per_Game,Goal_Contribution_Rate
0,1,Max Aarons,ENG,DF,Bournemouth,eng Premier League,23,2000,20,13.0,...,0,0,1,0,0.07,0.0,eng Premier League,DF,61.850000,0.072993
1,2,Brenden Aaronson,USA,"MF,FW",Union Berlin,de Bundesliga,22,2000,30,14.0,...,0,0,3,1,0.28,0.0,Bundesliga,MF,42.233333,0.283688
2,3,Paxten Aaronson,USA,MF,Eintracht Frankfurt,de Bundesliga,19,2003,7,1.0,...,0,0,0,0,0.89,0.0,Bundesliga,MF,14.428571,0.909091
3,4,Keyliane Abdallah,FRA,FW,Marseille,fr Ligue 1,17,2006,1,0.0,...,0,0,0,0,0.00,0.0,Ligue 1,FW,4.000000,0.000000
4,5,Yunis Abdelhamid,MAR,DF,Reims,fr Ligue 1,35,1987,31,31.0,...,1,1,5,0,0.10,0.0,Ligue 1,DF,89.709677,0.129450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2847,2848,Lovro Zvonarek,CRO,MF,Bayern Munich,de Bundesliga,18,2005,5,1.0,...,0,0,0,0,0.55,0.0,Bundesliga,MF,32.600000,0.555556
2848,2849,Martin Ødegaard,NOR,MF,Arsenal,eng Premier League,24,1998,35,35.0,...,2,2,2,0,0.47,0.0,eng Premier League,MF,88.314286,0.524781
2849,2850,Milan Đurić,BIH,FW,Hellas Verona,it Serie A,33,1990,20,13.0,...,1,3,2,0,0.37,0.0,Serie A,FW,60.200000,0.447761
2850,2851,Milan Đurić,BIH,FW,Monza,it Serie A,33,1990,17,13.0,...,0,0,2,0,0.36,0.0,Serie A,FW,73.941176,0.357143


In [11]:
class1.find_top_scorer()

Unnamed: 0,Player,Squad,Gls,League
1294,Harry Kane,Bayern Munich,36.0,Bundesliga


In [12]:
class1.find_playmaker()

Unnamed: 0,Player,Squad,Ast,League
179,Alex Baena,Villarreal,14,La Liga


In [13]:
class1.find_ironman()

Unnamed: 0,Player,Squad,Min,MP
840,Wladimiro Falcone,Lecce,3420,38
955,Paulo Gazzaniga,Girona,3420,38
1334,Max Kilman,Wolves,3420,38
1464,Bernd Leno,Fulham,3420,38
1937,Jan Oblak,Atlético Madrid,3420,38
1971,André Onana,Manchester Utd,3420,38
2121,Jordan Pickford,Everton,3420,38
2317,William Saliba,Arsenal,3420,38
2695,Guglielmo Vicario,Tottenham Hotspur,3420,38


In [14]:
class1.find_efficient_striker()

Unnamed: 0,Player,Squad,Gls,Min,Goals_per_Minute
1061,Serhou Guirassy,Stuttgart,28.0,2208,0.012681


In [15]:
class1.find_most_disciplined()

Unnamed: 0,Player,Squad,CrdY,CrdR,Discipline_Score,Primary_Position
66,Iván Alejo,Cádiz,17,0,0.904255,MF


In [18]:
class1 = EuropeanFootballAnalysis()
class1.scrape()
class1.clean_data()

transfers = class1.handle_transfers()
transfers.head()

Unnamed: 0,Min,MP,Starts,Gls,Ast,G+A,CrdY,CrdR,Rk,Player,...,Age,Born,90s,G-PK,PK,PKatt,G+A-PK,Matches,League,Primary_Position
0,933,28,6.0,1.0,0,1,1,0,832,Abde Ezzalzouli,...,21,2001,0.8,0,0,0,0.0,0.0,La Liga,FW
1,1088,20,11.0,0.0,0,0,2,0,1649,Adam Masina,...,29,1994,0.7,0,0,0,0.0,0.0,Serie A,MF
2,1005,16,13.0,0.0,2,2,0,0,2715,Alan Virginius,...,20,2003,0.8,0,0,0,0.0,0.0,Ligue 1,MF
3,85,14,0.0,1.0,0,1,1,0,2684,Alejo Véliz,...,19,2003,0.6,1,0,0,1.8,0.0,eng Premier League,FW
4,973,21,10.0,0.0,2,2,4,0,2815,Alessandro Zanoli,...,22,2000,10.2,0,0,0,0.2,0.0,Serie A,MF


In [19]:
missing, dup_players = class1.data_quality_report()
missing, dup_players.head()

(Rk                  0
 Player              0
 Nation              0
 Pos                 0
 Squad               0
 Comp                0
 Age                 0
 Born                0
 MP                  0
 Starts              0
 Min                 0
 90s                 0
 Gls                 0
 Ast                 0
 G+A                 0
 G-PK                0
 PK                  0
 PKatt               0
 CrdY                0
 CrdR                0
 G+A-PK              0
 Matches             0
 League              0
 Primary_Position    0
 dtype: int64,
 Player
 Vitinha            3
 Sergio Reguilón    2
 Stefan Mitrović    2
 Fernando           2
 Adam Masina        2
 Name: count, dtype: int64)