In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from bs4 import BeautifulSoup
import requests
from itertools import chain
from tqdm import tqdm_notebook as tqdm
import itertools
from functools import reduce

In [None]:
def modifyCols(df, drop_cols, rename_cols):
    """Drop some columns and strip a trailing ``_x`` from the names of others.

    Args:
        df: DataFrame to clean up.
        drop_cols: Column labels to remove.
        rename_cols: Column labels whose trailing ``_x`` should be removed.
            Labels that do not end in ``_x`` are left unchanged.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    suffix = "_x"
    # Only a *trailing* "_x" is stripped. Splitting on every "_x" would
    # truncate names that contain it elsewhere, e.g. "expected_xg_x" would
    # become "expected" instead of "expected_xg".
    rename = {
        col: col[:-len(suffix)] if col.endswith(suffix) else col
        for col in rename_cols
    }
    return df.drop(drop_cols, axis=1).rename(columns=rename)

In [None]:
def preprocess_df(df):
  """Normalise the label columns and turn numeric-looking strings into floats.

  The frame is modified in place and also returned.

  Steps:
    * ``position``: the combined labels "FW,MF", "FW,DF" and "MF,DF" are
      rewritten to "MF,FW", "DF,FW" and "DF,MF".
    * ``nationality``: only the last space-separated token is kept.
    * ``comp_level``: the first space-separated token is dropped.
    * every other column: string cells are parsed as floats after removing
      thousands separators (","); cells that are not strings, or that do not
      parse as a number, are left untouched.
    * ``minutes_90s`` is replaced by a single ``minutes_90`` column.

  Args:
    df: DataFrame with at least the columns ``position``, ``nationality``,
      ``comp_level`` and ``minutes_90s``.

  Returns:
    The same DataFrame object, after modification.
  """
  def _to_number(value):
    # With duplicate column labels ``df[col]`` is a DataFrame, so ``apply``
    # passes whole Series; recurse to reach the individual cells.
    if isinstance(value, pd.Series):
      return value.apply(_to_number)
    # The original check ``type(x) == "str"`` compared a type with a string
    # and was therefore never true, so nothing was ever converted.
    if not isinstance(value, str):
      return value
    try:
      return float(value.replace(',', ''))
    except ValueError:
      # Keep text that is not a plain number (e.g. "" or "25-123") as it is
      # instead of aborting the whole preprocessing step.
      return value

  df.position = df.position.replace({"FW,MF": "MF,FW", "FW,DF":"DF,FW", "MF,DF":"DF,MF"})
  df.nationality = df.nationality.apply(lambda x: x.split(" ")[-1])
  df.comp_level = df.comp_level.apply(lambda x: ' '.join(x.split(" ")[1:]))
  non_numeric_cols = ['player', 'position', 'nationality', 'team', 'comp_level']

  for col in df.columns:
    if col not in non_numeric_cols:
      df[col] = df[col].apply(_to_number)

  minutes_cols = df[["minutes_90s"]]
  # When the label is duplicated the second column is used, as before; a
  # frame with a single "minutes_90s" column no longer raises IndexError.
  position = 1 if minutes_cols.shape[1] > 1 else 0
  minutes = minutes_cols.iloc[:, position]
  del df["minutes_90s"]
  df["minutes_90"] = minutes
  return df

In [None]:
def persist_stats_for_a_year(year):
  """Scrape every stats table for one season, merge them and write a CSV.

  The URLs come from ``create_url_configs(year)``, a dictionary mapping a URL
  type to its address. Each page is parsed with ``parse_page`` and tidied with
  ``clean_columns``; the resulting tables are inner-joined on the player
  identity columns and missing values are filled with -1.

  Args:
    year: Season string such as "2021-2022", or ``None`` for the current
      season.

  Side effects:
    Writes ``<year>.csv`` (or ``CURRENT.csv`` when ``year`` is ``None``) to
    the working directory. The same label is stored in the ``year_of_stat``
    column.
  """
  merge_keys = ['player', 'nationality', 'position', 'team', 'comp_level', 'age', 'birth_year']

  def _merge_keep_left(left, right):
    # A stat that several tables share would get "_x"/"_y" suffixes on every
    # merge. Repeating that across the chain re-creates suffixed names that
    # already exist, which pandas warns about and newer versions reject.
    # Keeping only the first (left-hand) copy avoids the clash altogether.
    shared = [col for col in right.columns
              if col in left.columns and col not in merge_keys]
    return pd.merge(left, right.drop(columns=shared), on=merge_keys, how='inner')

  stats = [clean_columns(parse_page(url)) for url in create_url_configs(year).values()]
  df_merged = reduce(_merge_keep_left, stats).fillna(-1)

  # Decide the label first so the column and the file name agree; previously
  # the column was filled before the year was applied and always read "CURRENT".
  label = "CURRENT" if year is None else year
  df_merged["year_of_stat"] = label
  df_merged.to_csv(f'{label}.csv', index=False)


In [None]:
def create_url_configs(year:str):
  """Build the dictionary of FBref Big-5 player-stats URLs for one season.

  Args:
    year: Season string inserted into the URL path and page name, or ``None``
      to get the URLs without a season component.

  Returns:
    Dict mapping each config key (e.g. "STANDARD_STATS_URL") to its URL.
  """
  # (config key, URL path segment) pairs, in the order the keys are returned.
  stat_pages = (
    ("STANDARD_STATS_URL", "stats"),
    ("DEFENSE_STATS_URL", "defense"),
    ("GCA_STATS_URL", "gca"),
    ("MISC_STATS_URL", "misc"),
    ("SHOOTING_STATS_URL", "shooting"),
    ("PASSING_STATS_URL", "passing"),
    ("POSSESSION_STATS_URL", "possession"),
    ("PLAYING_TIME_URL", "playingtime"),
  )
  base = "https://fbref.com/en/comps/Big5"
  page_name = "Big-5-European-Leagues-Stats"

  config = {}
  for key, segment in stat_pages:
    if year is None:
      config[key] = f"{base}/{segment}/players/{page_name}"
    else:
      config[key] = f"{base}/{year}/{segment}/players/{year}-{page_name}"
  return config

In [None]:

def append_names(feature_list, head_tuple):
    """Prefix consecutive runs of feature names with their group name.

    ``head_tuple`` is walked in order; each entry claims the next
    ``int(entry[1])`` items of ``feature_list``. When the entry's name
    (``entry[0]``) is not empty, those items become ``"<name>_<feature>"``.

    Args:
        feature_list: List of feature names; modified in place.
        head_tuple: Iterable of ``(group_name, span)`` pairs, where ``span``
            is anything ``int()`` accepts.

    Returns:
        The same ``feature_list`` object.
    """
    offset = 0
    for head in head_tuple:
        group_name, span = head[0], head[1]
        stop = offset + int(span)
        if group_name != '':
            prefixed = []
            for feature in feature_list[offset:stop]:
                prefixed.append(group_name + "_" + feature)
            feature_list[offset:stop] = prefixed
        # The next group starts where this one ended, prefixed or not.
        offset = stop
    return feature_list

In [None]:
def clean_columns(df):
    """Remove every "header_" fragment from the column names and drop "matches".

    The frame is changed in place and returned.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        The same DataFrame object.
    """
    renamed = []
    for name in df.columns:
        if "header_" in name:
            name = name.replace("header_", "")
        renamed.append(name)
    df.columns = renamed

    # Checked after renaming, so only a column literally called "matches" goes.
    if "matches" in renamed:
        del df["matches"]
    return df

In [None]:
def parse_page(URL):
    """Fetch ``URL`` and turn the first HTML table on the page into a DataFrame.

    Column names are the ``data-stat`` attributes of the second header row,
    prefixed with the ``data-stat`` of the spanning group header in the first
    header row (see ``append_names``). The "ranker" header is dropped from the
    names, presumably because body rows hold it in their ``<th>`` cell rather
    than in one of the ``<td>`` cells that are read.

    Args:
        URL: Address of the page to download.

    Returns:
        DataFrame with one row per body row that has a ``<th scope="row">``
        cell; values are the stripped texts of that row's ``<td>`` cells.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page contains no ``<table>`` element.
    """
    page = requests.get(URL, timeout=30)
    # Fail on HTTP errors (e.g. rate limiting) rather than parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    tables = soup.find_all('table')
    if not tables:
        raise ValueError(f"No <table> found at {URL}")
    table = tables[0]

    header_rows = table.find('thead').find_all('tr')
    features = [col.attrs["data-stat"] for col in header_rows[1].find_all('th')]
    # A header cell without an explicit colspan covers exactly one column.
    header_name = [(col.attrs["data-stat"], col.attrs.get("colspan", 1))
                   for col in header_rows[0].find_all('th') if "data-stat" in col.attrs]

    # The group spans count the "ranker" column too, so prefixes must be
    # applied while it is still in the list. Removing it first shifted every
    # group by one column (e.g. goals ended up labelled "playing_goals").
    ranker_idx = features.index("ranker")
    features = append_names(features, header_name)
    del features[ranker_idx]

    pre_df_player = []
    for row in table.find('tbody').find_all('tr'):
        # Rows without a <th scope="row"> cell are not data rows; skip them.
        if row.find('th', {"scope": "row"}) is not None:
            data = [cell.text.strip() for cell in row.find_all('td')]
            pre_df_player.append(dict(zip(features, data)))

    df_player = pd.DataFrame(pre_df_player, columns=features)
    return df_player

In [None]:
# Seasons to scrape; None makes create_url_configs return the URLs without a
# season component, and the result is written to CURRENT.csv.
years_to_scrape = [None, "2021-2022", "2020-2021", "2019-2020"]
# One CSV is written per entry; tqdm only adds the progress bar.
# NOTE(review): the recorded output shows tqdm's notice asking for
# `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook` — the import cell is
# the place to change that.
for year in tqdm(years_to_scrape):
  persist_stats_for_a_year(year)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for year in tqdm(years_to_scrape):


  0%|          | 0/3 [00:00<?, ?it/s]

  df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['player', 'nationality', 'position', 'team', 'comp_level', 'age', 'birth_year'],


In [None]:
# Scrape one more season on its own; this writes "2018-2019.csv".
persist_stats_for_a_year("2018-2019")

  df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['player', 'nationality', 'position', 'team', 'comp_level', 'age', 'birth_year'],


REVIEW

In [None]:
# Read the current-season file from the working directory, which is where
# persist_stats_for_a_year writes it, instead of a machine-specific absolute path.
data = pd.read_csv('CURRENT.csv')

In [None]:
# Keep only the rows whose playing_minutes_pct is above 60.
regular = data.loc[data["playing_minutes_pct"] > 60]

In [None]:
# Restrict the regular players to the position labels "MF" and "DF,MF".
midfielders = regular[regular.position.isin(["MF", "DF,MF"])]

# Columns used for the creative-midfielder profile; "player" is listed first
# and removed again below before the values are extracted.
creative_mid_stats = ["player", "playing_minutes_pct","per90_goals_assists_pens_per90", "expected_xa", "expected_xg_per90", "per90_xg", "per90_npxg_xa_per90", "pressures_pressures_mid_3rd", "pressures_pressures_att_3rd", "interceptions", "starts_minutes_per_start", "sca_sca_per90", "gca_gca_per90", "sca_types_sca_dribbles", "gca_types_gca_dribbles", "passes_total_passes_progressive_distance", "passes_medium_passes_pct_medium", "passes_long_passes_pct_long","passes_into_final_third", "passes_into_penalty_area", "crosses_into_penalty_area", "team_success_plus_minus_per90", "touches_touches_mid_3rd", "touches_touches_att_3rd", "touches_touches_att_pen_area", "dribbles_dribbles_completed_pct", "dribbles_nutmegs"]

# Work on a copy so creative_mid_stats itself keeps its "player" entry.
# NOTE(review): the recorded run of this cell ended in a KeyError, so at least
# one name above was not a column of `midfielders` — compare the list against
# list(midfielders.columns) and correct the names before relying on this.
cols = creative_mid_stats[:]
cols.remove("player")
creative_values = midfielders[cols].values

KeyError: ignored

In [None]:
# Show every available column name as a plain list.
midfielders.columns.tolist()

['player',
 'nationality',
 'position',
 'team',
 'comp_level',
 'age',
 'birth_year',
 'games',
 'playing_games_starts',
 'playing_minutes',
 'playing_minutes_90s',
 'playing_goals',
 'performance_assists',
 'performance_goals_assists',
 'performance_goals_pens',
 'performance_pens_made',
 'performance_pens_att',
 'performance_cards_yellow',
 'performance_cards_red',
 'performance_xg',
 'expected_npxg',
 'expected_xg_assist',
 'expected_npxg_xg_assist',
 'expected_progressive_carries',
 'progression_progressive_passes',
 'progression_progressive_passes_received',
 'progression_goals_per90',
 'per90_assists_per90',
 'per90_goals_assists_per90',
 'per90_goals_pens_per90',
 'per90_goals_assists_pens_per90',
 'per90_xg_per90',
 'per90_xg_assist_per90',
 'per90_xg_xg_assist_per90',
 'per90_npxg_per90',
 'per90_npxg_xg_assist_per90',
 'per90_matches',
 'minutes_90s',
 'tackles',
 'tackles_tackles_won',
 'tackles_tackles_def_3rd',
 'tackles_tackles_mid_3rd',
 'tackles_tackles_att_3rd',
 'tac