In [1]:
#IMPORTS
import os
import pandas as pd
import numpy as np
import csv
from scipy.stats import norm, uniform, beta, multivariate_normal, stats
from datetime import datetime

#Libraries needed for ML
import torch
import sklearn.datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, DotProduct, WhiteKernel, Matern
from sklearn.gaussian_process import GaussianProcessRegressor


# Options for pandas
# Cap the default notebook repr at 10 columns and 20 rows; display_full()
# (defined further down) lifts these limits temporarily when a complete
# view of a frame is needed.
pd.options.display.max_columns = 10
pd.options.display.max_rows = 20

# Visualizations
import matplotlib.pyplot as plt
from tabulate import tabulate
import seaborn as sns
#%matplotlib inline
%matplotlib widget
#%config InlineBackend.figure_format = 'svg'
%config InlineBackend.figure_format = 'retina'
plt.style.use("default")

from IPython.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

In [2]:
# Show the directory the kernel is currently running in.
print(os.getcwd())
# NOTE(review): os.path.expanduser only expands a leading "~". This path has
# none, so the string is returned unchanged (see the cell output) and the
# working directory is not affected. If a directory change was intended,
# this call does not perform it.
os.path.expanduser("/Machine_Learning/Machine_Learning_Scripts/")

/Users/Ian_1/Desktop/Python/Machine_Learning/Machine_Learning_Scripts/Player_peformance_predictor


'/Machine_Learning/Machine_Learning_Scripts/'

In [22]:
# fbref match-log pages for Jude Bellingham, one URL per season.
# Only the season segment of the URL differs, so it is filled into a template.
_MATCH_LOG_URL = 'https://fbref.com/en/players/57d88cf9/matchlogs/{season}/Jude-Bellingham-Match-Logs'

path_2019_2020 = _MATCH_LOG_URL.format(season='2019-2020')
path_2020_2021 = _MATCH_LOG_URL.format(season='2020-2021')
path_2021_2022 = _MATCH_LOG_URL.format(season='2021-2022')
path_2022_2023 = _MATCH_LOG_URL.format(season='2022-2023')
path_2023_2024 = _MATCH_LOG_URL.format(season='2023-2024')

## Helper functions

In [4]:
def display_full(x):
    """
    Display ``x`` with pandas truncation switched off.

    While ``x`` is being displayed, the row, column and column-width limits
    are removed, the display width is set to 2000 and floats are formatted
    with ``'{:20,.2f}'``. The options are set through ``pd.option_context``,
    so they apply only inside this call.

    Author: Benjamin Ziepert, Karl Adler
    Source: https://stackoverflow.com/questions/25351968/how-can-i-display-full-non-truncated-dataframe-information-in-html-when-conver
    """
    untruncated_view = {
        'display.max_rows': None,
        'display.max_columns': None,
        'display.width': 2000,
        'display.float_format': '{:20,.2f}'.format,
        'display.max_colwidth': None,
    }
    # option_context expects a flat (name, value, name, value, ...) sequence.
    flat_options = [item for pair in untruncated_view.items() for item in pair]
    with pd.option_context(*flat_options):
        display(x)

In [5]:
#SPLIT COLUMN NAMES WITH TWO ENTRIES
#df['Age'] = df['Age'].str[:2]
#df['Position_2'] = df['Pos'].str[3:]
#df['Position'] = df['Pos'].str[:2]
#df['Nation'] = df['Nation'].str.split(' ').str.get(1)
#df['League'] = df['Comp'].str.split(' ').str.get(1)
#df['League_'] = df['Comp'].str.split(' ').str.get(2)
#df['League'] = df['League'] + ' ' + df['League_']
#df = df.drop(columns=['League_', 'Comp', 'Rk', 'Pos','Matches'])
#
#df['Position'] = df['Position'].replace({'MF': 'Midfielder', 'DF': 'Defender', 'FW': 'Forward', 'GK': 'Goalkeeper'})
#df['Position_2'] = df['Position_2'].replace({'MF': 'Midfielder', 'DF': 'Defender',
#                                                 'FW': 'Forward', 'GK': 'Goalkeeper'})
#df['League'] = df['League'].fillna('Bundesliga')

In [121]:
def extract_df(path):
    """
    Read one match-log table and convert it into an encoded feature frame.

    Parameters
    ----------
    path : str
        Passed straight to ``pd.read_html``; the first table found is used.
        The header is flattened by joining each column label's levels, so
        the table is expected to have a multi-row header.

    Returns
    -------
    tuple
        ``(df, column_names)``: the cleaned DataFrame and the list of its
        column labels.

    Notes
    -----
    Encodings applied to the frame:

    * ``Result``: 0 if the result string contains ``'L'``, otherwise 1.
      Draws therefore also become 1, i.e. the flag means "did not lose".
    * ``Venue``: 0 if the string contains ``'Away'``, otherwise 1.
    * ``Start``: 0 if the string contains ``'N'``, otherwise 1.
    * ``Pos`` is cut at fixed character offsets into ``Pos_1``/``Pos_2``/
      ``Pos_3`` (assumes two-letter codes separated by one character) and
      mapped to integers; codes missing from the mapping are left as-is and
      empty slots become 0.

    Rows containing any missing value are dropped before encoding.
    Helper columns listed for removal are skipped when a table does not
    contain them, instead of raising ``KeyError``.
    """
    df = pd.read_html(path)[0]
    # Flatten the multi-level header into single "<upper> <lower>" labels.
    df.columns = [' '.join(col).strip() for col in df.columns]
    df = df.reset_index(drop=True)
    # Labels whose upper level is a placeholder (contains 'level_0') keep
    # only their last word, e.g. "Unnamed: 0_level_0 Date" -> "Date".
    df.columns = [col.split()[-1] if 'level_0' in col else col for col in df.columns]

    dropped_columns = ['Day','Squad','Opponent','Comp','Round','Report','Performance PKatt']
    renamed_columns = {'Performance Gls':'G','Performance Ast':'Ass','Performance PK':'PK',
                      'Performance Sh':'Shots','Performance SoT':'SoT','Performance CrdY':'YC','Performance CrdR':'RC','Performance Touches':'Touches',
                      'Performance Tkl':'Tkl','Performance Int':'Int','Performance Blocks':'Blocks','Expected xG':'xG','Expected npxG':'npxG',
                      'Expected xAG':'xAG'}
    # errors='ignore': tolerate tables that lack one of the helper columns.
    df = df.drop(columns=dropped_columns, errors='ignore')
    df = df.dropna()
    df = df.rename(columns=renamed_columns)
    df = df.replace("On matchday squad, but did not play", 0)

    # Split the position string into up to three two-character codes,
    # placed right after the original 'Pos' column.
    df.insert(5, 'Pos_1', df['Pos'].str[:2])
    df.insert(6, 'Pos_2', df['Pos'].str[3:5])
    df.insert(7, 'Pos_3', df['Pos'].str[6:])

    # NOTE(review): anything without an 'L' (wins AND draws) is encoded as 1.
    df['Result'] = df['Result'].apply(lambda x: 0 if 'L' in x else 1) #0 if they lost, 1 otherwise
    df['Venue'] = df['Venue'].apply(lambda x: 0 if 'Away' in x else 1) #0 if away, 1 if home
    df['Start'] = df['Start'].apply(lambda x: 0 if 'N' in x else 1) #0 if didnt start, 1 if did
    df = df.drop('Pos', axis=1)

    mapping = {'CB':1,'LB':2,'DM':3,'RM':4,'LM':5,'CM':6,'RW':7,'LW':8,'AM':9,'FW':10}
    # One nested-dict replace encodes all three position columns at once.
    df = df.replace({"Pos_1": mapping, "Pos_2": mapping, "Pos_3": mapping})
    # Unused position slots (empty strings) and any remaining NaN become 0.
    df = df.replace("", 0)
    df = df.fillna(0)

    return df, list(df.columns.values)

In [122]:
# Load and encode the 2019-2020 season match log, using the season URL
# defined in the paths cell near the top of the notebook.
df, column_names = extract_df(path_2019_2020)

In [123]:
# Render the cleaned frame through display_full, i.e. without pandas'
# row/column truncation.
display_full(df)

Unnamed: 0,Date,Venue,Result,Start,Pos_1,Pos_2,Pos_3,Min,G,Ass,PK,Shots,SoT,YC,RC,Touches,Tkl,Int,Blocks,xG,npxG,xAG,SCA SCA,SCA GCA,Passes Cmp,Passes Att,Passes Cmp%,Passes PrgP,Carries Carries,Carries PrgC,Take-Ons Att,Take-Ons Succ
2,2019-08-25,0,0,0,10.0,0.0,0.0,15,0,0,0,0,0,1,0,4,0,0,0,0.0,0.0,0.0,0,0,1,2,50.0,0,2,0,1,0
3,2019-08-31,1,1,0,4.0,0.0,0.0,61,1,0,0,1,1,0,0,37,2,0,4,0.0,0.0,0.0,1,0,16,27,59.3,1,17,1,2,2
4,2019-09-14,0,1,1,5.0,10.0,0.0,71,1,0,0,1,1,0,0,25,1,0,0,0.1,0.1,0.0,0,0,12,18,66.7,2,15,1,0,0
5,2019-09-21,1,0,1,5.0,0.0,0.0,60,0,0,0,0,0,0,0,34,2,0,1,0.0,0.0,0.0,0,0,13,24,54.2,2,16,0,0,0
6,2019-09-28,0,0,0,6.0,0.0,0.0,15,0,0,0,1,0,0,0,24,1,0,1,0.0,0.0,0.0,1,0,14,19,73.7,2,14,2,1,1
7,2019-10-01,0,0,0,1.0,6.0,0.0,8,0,0,0,1,0,0,0,6,0,0,0,0.1,0.1,0.0,2,0,3,5,60.0,1,6,1,2,2
8,2019-10-04,1,1,1,6.0,0.0,0.0,73,0,0,0,2,0,0,0,47,2,2,2,0.1,0.1,0.2,4,0,29,34,85.3,3,28,0,4,3
9,2019-10-19,0,0,1,6.0,0.0,0.0,75,0,0,0,1,1,0,0,32,2,2,0,0.0,0.0,0.0,2,0,12,19,63.2,1,17,2,2,1
10,2019-10-22,1,1,1,6.0,0.0,0.0,61,0,0,0,1,1,0,0,36,0,0,1,0.0,0.0,0.0,0,0,24,31,77.4,3,29,0,0,0
11,2019-10-26,1,1,1,6.0,0.0,0.0,86,0,0,0,0,0,0,0,55,5,1,0,0.0,0.0,0.5,2,0,33,44,75.0,4,31,3,4,2


In [None]:
#row_0 = ( pd.DataFrame(df.iloc[0]) ).transpose()
#display_full(row_0)

In [None]:
# Echo the list of column labels returned by extract_df.
column_names

DataFrame.drop(labels=None, *, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')