In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from functools import reduce

In [3]:
def get_data(data_url):
    """Download one player-stats page and return its table as a cleaned DataFrame.

    The page is fetched with ``requests`` and parsed with BeautifulSoup. The
    table whose ``class`` attribute is exactly
    ``"min_width sortable stats_table min_width shade_zero"`` is located, the
    header labels from "Player" through "Matches" become the column names, and
    every ``<tr>`` that contains ``<td>`` cells becomes one row.

    Cleaning applied to the result:

    * Columns from position 5 up to (but excluding) the trailing "Matches"
      column are converted to numbers; blank cells become 0 and thousands
      separators (``"1,234"``) are removed first. The "Born" column is parsed
      as a year instead (blank cells stay missing).
    * The "Matches" column is dropped.
    * "Nation" keeps only its last whitespace-separated token and "Comp"
      drops its first token (see the helper functions below).

    Parameters
    ----------
    data_url : str
        Address of the page to scrape (coerced with ``str``).

    Returns
    -------
    pandas.DataFrame
        One row per table body row, with a fresh 0..n-1 index. Column names
        may repeat, because the page's own header labels are reused verbatim.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the table or its "Player".."Matches" header run cannot be found,
        or a body row does not have one cell per column.
    """
    data_url = str(data_url)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    data_page = requests.get(data_url, timeout=30)
    data_page.raise_for_status()
    data_soup = BeautifulSoup(data_page.text, "html")

    data_table = data_soup.find("table", class_="min_width sortable stats_table min_width shade_zero")
    if data_table is None:
        raise ValueError(f"No stats table found at {data_url}")

    # find_all("th") also returns grouping headers and per-row header cells,
    # so slice out the first "Player" .. "Matches" run explicitly.
    header_texts = [header.get_text().strip() for header in data_table.find_all("th")]
    try:
        front = header_texts.index("Player")
        back = header_texts.index("Matches", front) + 1
    except ValueError:
        raise ValueError(
            f"Could not find the 'Player'..'Matches' header run at {data_url}"
        ) from None
    data_cols = header_texts[front:back]

    # Header rows (initial or repeated inside the body) carry no <td> cells,
    # so they are recognised by content rather than by a fixed row stride.
    records = []
    for data_row in data_table.find_all("tr"):
        cells = [cell.get_text().strip() for cell in data_row.find_all("td")]
        if not cells:
            continue
        if len(cells) != len(data_cols):
            raise ValueError(
                f"Row has {len(cells)} cells but {len(data_cols)} columns were expected"
            )
        records.append(cells)

    # Work with positional integer labels while converting: the real header
    # labels can repeat, which makes label-based access ambiguous.
    last = len(data_cols) - 1
    data_df = pd.DataFrame(records, columns=list(range(len(data_cols))))
    for position in range(5, last):
        column = data_df[position]
        if data_cols[position] == "Born":
            data_df[position] = pd.to_datetime(column, format="%Y").dt.year
        else:
            without_separators = column.str.replace(",", "", regex=False)
            data_df[position] = pd.to_numeric(without_separators).fillna(0)

    # The last selected column is always "Matches" (it defines `back` above).
    data_df = data_df.drop(columns=last)
    data_df.columns = data_cols[:last]

    data_df["Nation"] = data_df["Nation"].apply(_last_token)
    data_df["Comp"] = data_df["Comp"].apply(_drop_first_token)

    return data_df


def _last_token(value):
    """Return the last whitespace-separated token of a string, else None."""
    tokens = value.split() if isinstance(value, str) else []
    return tokens[-1] if tokens else None


def _drop_first_token(value):
    """Return a string without its first whitespace-separated token, else None."""
    tokens = value.split() if isinstance(value, str) else []
    return " ".join(tokens[1:]) if tokens else None

In [4]:
# Scrape the 2023-2024 Big-5 "defense" player table, then give every column an
# explicit name (presumably because the scraped headers repeat -- TODO confirm).
def_df = get_data(
    "https://fbref.com/en/comps/Big5/2023-2024/defense/players/2023-2024-Big-5-European-Leagues-Stats"
)
def_df.columns = (
    # Shared identity columns, used later as merge keys.
    ["Player", "Nation", "Pos", "Squad", "Comp", "Age", "Born", "90s"]
    # "D_"/"Tkl_"-prefixed columns.
    + ["D_Tkl", "D_TklW", "Tkl_Def 3rd", "Tkl_Mid 3rd", "Tkl_Att 3rd"]
    # "Chal_"-prefixed columns.
    + ["Chal_Tkl", "Chal_Att", "Chal_Tkl%", "Chal_Lost"]
    # Remaining columns.
    + ["Blocks", "B_Sh", "B_Pass", "Int", "Tkl+Int", "Clr", "Err"]
)

In [5]:
# Scrape the 2023-2024 Big-5 "passing" player table, then give every column an
# explicit name (presumably because the scraped headers repeat -- TODO confirm).
pass_df = get_data(
    "https://fbref.com/en/comps/Big5/2023-2024/passing/players/2023-2024-Big-5-European-Leagues-Stats"
)
pass_df.columns = (
    # Shared identity columns, used later as merge keys.
    ["Player", "Nation", "Pos", "Squad", "Comp", "Age", "Born", "90s"]
    # "_Total"-suffixed columns plus the two distance columns.
    + ["Cmp_Total", "Att_Total", "Cmp%_Total", "TotDist", "PrgDist"]
    # "_Short" / "_Medium" / "_Long" suffixed triples.
    + ["Cmp_Short", "Att_Short", "Cmp%_Short"]
    + ["Cmp_Medium", "Att_Medium", "Cmp%_Medium"]
    + ["Cmp_Long", "Att_Long", "Cmp%_Long"]
    # Remaining columns keep their short labels.
    + ["Ast", "xAG", "xA", "A-xAG", "KP", "1/3", "PPA", "CrsPA", "PrgP"]
)

In [6]:
# Scrape the 2023-2024 Big-5 "shooting" player table. Unlike the other tables in
# this notebook, its columns are not renamed and keep the scraped header labels.
shot_df = get_data("https://fbref.com/en/comps/Big5/2023-2024/shooting/players/2023-2024-Big-5-European-Leagues-Stats")

In [7]:
# Scrape the 2023-2024 Big-5 "possession" player table, then give every column an
# explicit name (presumably because the scraped headers repeat -- TODO confirm).
poss_df = get_data(
    "https://fbref.com/en/comps/Big5/2023-2024/possession/players/2023-2024-Big-5-European-Leagues-Stats"
)
poss_df.columns = (
    # Shared identity columns, used later as merge keys.
    ["Player", "Nation", "Pos", "Squad", "Comp", "Age", "Born", "90s"]
    # "Touches" and the "T_"-prefixed columns.
    + ["Touches", "T_Def Pen", "T_Def 3rd", "T_Mid 3rd", "T_Att 3rd", "T_Att Pen", "T_Live"]
    # "Drib_"-prefixed columns.
    + ["Drib_Att", "Drib_Succ", "Drib_Succ%", "Drib_Tkld", "Drib_Tkld%"]
    # "Carries" and the "Carry_"-prefixed columns.
    + ["Carries", "Carry_TotDist", "Carry_PrgDist", "Carry_PrgNum"]
    + ["Carry_1/3", "Carry_PA", "Carry_Miscon", "Carry_Dispos"]
    # "Rec" columns.
    + ["Rec", "Rec_Prg"]
)

In [8]:
# Scrape the 2023-2024 Big-5 "gca" player table, then give every column an
# explicit name (presumably because the scraped headers repeat -- TODO confirm).
gsc_df = get_data(
    "https://fbref.com/en/comps/Big5/2023-2024/gca/players/2023-2024-Big-5-European-Leagues-Stats"
)
gsc_df.columns = (
    # Shared identity columns, used later as merge keys.
    ["Player", "Nation", "Pos", "Squad", "Comp", "Age", "Born", "90s"]
    # "SCA" columns.
    + ["SCA", "SCA90"]
    + ["SCA_PassLive", "SCA_PassDead", "SCA_Drib", "SCA_Sh", "SCA_Fld", "SCA_Def"]
    # "GCA" columns, mirroring the SCA group.
    + ["GCA", "GCA90"]
    + ["GCA_PassLive", "GCA_PassDead", "GCA_Drib", "GCA_Sh", "GCA_Fld", "GCA_Def"]
)

In [9]:
# Join the five per-topic tables into one wide frame. Each step is a default
# (inner) merge on the eight identity columns, folded left to right.
dfs = [shot_df, pass_df, gsc_df, poss_df, def_df]
df_final = reduce(
    lambda merged, addition: merged.merge(
        addition,
        on=["Player", "Nation", "Pos", "Squad", "Comp", "Age", "Born", "90s"],
    ),
    dfs,
)

In [10]:
# Show the merged table as the cell's rich output.
df_final

Unnamed: 0,Player,Nation,Pos,Squad,Comp,Age,Born,90s,Gls,Sh,...,Chal_Att,Chal_Tkl%,Chal_Lost,Blocks,B_Sh,B_Pass,Int,Tkl+Int,Clr,Err
0,Max Aarons,ENG,DF,Bournemouth,Premier League,23.0,2000.0,13.7,0,2,...,34,58.8,14,9,5,4,8,37,27,0
1,Brenden Aaronson,USA,"MF,FW",Union Berlin,Bundesliga,22.0,2000.0,14.1,2,18,...,32,50.0,16,26,1,25,2,34,4,0
2,Paxten Aaronson,USA,MF,Eint Frankfurt,Bundesliga,19.0,2003.0,1.1,0,2,...,1,100.0,0,2,0,2,0,2,0,0
3,Keyliane Abdallah,FRA,FW,Marseille,Ligue 1,17.0,2006.0,0.0,0,0,...,0,0.0,0,0,0,0,0,0,0,0
4,Yunis Abdelhamid,MAR,DF,Reims,Ligue 1,35.0,1987.0,30.9,4,21,...,45,57.8,19,51,32,19,39,103,109,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2847,Lovro Zvonarek,CRO,"FW,MF",Bayern Munich,Bundesliga,18.0,2005.0,1.8,1,1,...,5,0.0,5,3,0,3,0,0,2,0
2848,Martin Ødegaard,NOR,MF,Arsenal,Premier League,24.0,1998.0,34.3,8,75,...,58,29.3,41,23,1,22,15,64,5,0
2849,Milan Đurić,BIH,FW,Hellas Verona,Serie A,33.0,1990.0,13.4,5,22,...,3,0.0,3,0,0,0,1,2,13,0
2850,Milan Đurić,BIH,FW,Monza,Serie A,33.0,1990.0,14.0,4,26,...,4,50.0,2,6,2,4,0,5,27,0


In [11]:
# Three-letter nation codes grouped by continent; used with
# `df_final["Nation"].isin(...)` in the cells below.
# NOTE(review): the lists are hand-maintained; a nation missing from every list
# is silently left out of the per-continent counts -- verify coverage.

# South America
SA = [
    'BRA', 'CHI', 'COL', 'VEN', 'PER', 'ARG', 'URU', 'PAR', 'ECU', 'BOL',
    'GUY', 'SUR',
]

# Europe (duplicate entries removed; membership is unchanged)
EU = [
    'ESP', 'GER', 'ENG', 'ITA', 'FRA', 'POR', 'NED', 'BEL', 'RUS', 'TUR',
    'UKR', 'SCO', 'GRE', 'SUI', 'CRO', 'SRB', 'CZE', 'POL', 'SWE', 'DEN',
    'NOR', 'AUT', 'ROU', 'HUN', 'SVK', 'BUL', 'FIN', 'BIH', 'IRL', 'WAL',
    'ALB', 'MKD', 'ISL', 'LUX', 'MLT', 'EST', 'LIE', 'AND', 'GIB', 'FRO',
    'ARM', 'AZE', 'BLR', 'CYP', 'GEO', 'KAZ', 'KOS', 'LAT', 'LTU', 'MOL',
    'MON', 'MNE', 'SMR', 'SVN', 'VAT', 'NIR',
]

# Africa ('UG' corrected to the three-letter 'UGA'; a two-letter code can never
# match the three-letter values in the Nation column)
AF = [
    'EGY', 'NGA', 'ALG', 'MAR', 'TUN', 'CMR', 'CIV', 'GHA', 'SEN', 'COD',
    'ZAM', 'RSA', 'BUR', 'TOG', 'ANG', 'UGA', 'GAB', 'KEN',
]

# Asia (duplicate 'MDV' removed)
# NOTE(review): 'AUS' is listed here and in OC, so an Australian player is
# counted under both groups -- confirm which one is intended.
AS = [
    'JPN', 'KOR', 'IRN', 'AUS', 'KSA', 'UAE', 'IRQ', 'UZB', 'QAT', 'SYR',
    'JOR', 'OMA', 'PLE', 'LIB', 'YEM', 'KGZ', 'TJK', 'VIE', 'MYA', 'LAO',
    'CAM', 'PHI', 'INA', 'MAS', 'SIN', 'BRU', 'TLS', 'NEP', 'MDV', 'BHU',
    'PAK', 'SRI', 'TPE', 'HKG', 'MAC', 'MNG', 'PRK', 'TKM', 'AFG', 'KUW',
    'BHR', 'MLD', 'LBN',
]

# North America ('MTQ' and 'GLP' were previously fused into 'MTQGLP' by a
# missing comma, so neither code could match)
N_A = [
    'USA', 'MEX', 'CRC', 'HON', 'JAM', 'PAN', 'SLV', 'GUA', 'HAI', 'CAN',
    'TRI', 'SKN', 'VIN', 'LCA', 'DMA', 'GRN', 'CUW', 'ARU', 'BES', 'SXM',
    'MTQ', 'GLP', 'GUF',
]

# Oceania
OC = [
    'AUS', 'NZL', 'PNG', 'FIJ', 'VAN', 'SOL', 'NCL', 'TAH', 'SAM', 'KIR',
    'TUV', 'PLW', 'FSM', 'MHL', 'NRU', 'COK', 'WLF', 'NIU', 'TKL', 'PCN',
    'HMD', 'ATF', 'NFK', 'CXR', 'CCK', 'ASM', 'PYF', 'GUM', 'MNP', 'VIR',
]

In [12]:
# Per-column non-null counts for players whose Nation is in AF, whose Age is
# below 22 and whose "90s" * 90 (presumably minutes played) exceeds 500.
df_final.loc[
    df_final["Nation"].isin(AF)
    & df_final["Age"].lt(22)
    & df_final["90s"].mul(90).gt(500)
].count()

Player     37
Nation     37
Pos        37
Squad      37
Comp       37
           ..
B_Pass     37
Int        37
Tkl+Int    37
Clr        37
Err        37
Length: 102, dtype: int64

In [28]:
# Players whose Nation is not in EU, with Age below 21 and "90s" * 90
# (presumably minutes played) above 800.
non_eu_youngsters = df_final.loc[
    ~df_final["Nation"].isin(EU)
    & df_final["Age"].lt(21)
    & df_final["90s"].mul(90).gt(800)
]

# Number of those players per continent list, in a fixed label order.
continent_counts = {
    label: len(non_eu_youngsters[non_eu_youngsters["Nation"].isin(codes)])
    for label, codes in (("AF", AF), ("AS", AS), ("N_A", N_A), ("OC", OC), ("SA", SA))
}

# Two-column summary table: one row per continent label.
continent_counts_df = pd.DataFrame(
    {
        "Continent": list(continent_counts.keys()),
        "Player Count": list(continent_counts.values()),
    }
)
continent_counts_df

Unnamed: 0,Continent,Player Count
0,AF,17
1,AS,1
2,N_A,3
3,OC,0
4,SA,9


In [27]:
# Write the per-continent summary to a CSV in the working directory. With the
# default arguments the row index is written as an unnamed first column.
continent_counts_df.to_csv("continent_counts_df.csv")