Importing packages

In [76]:
# Import necessary packages and suppress warnings


import pandas as pd
import sqlite3
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Arc
import seaborn as sns
import warnings
import gzip
import json
import os
import time
import math
import requests

#Soccer specific packages
from statsbombpy import sb
from mplsoccer import Pitch
from mplsoccer import VerticalPitch

#modeling packages
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold, cross_val_predict
from imblearn.over_sampling import SMOTE
from sklearn import metrics 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, auc, log_loss
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import time
import requests

Scraping the standard stats data from FBRef

In [24]:
# List of base URLs for different football leagues
base_urls = [
    'https://fbref.com/en/comps/10/2022-2023/2022-2023-Championship-Stats',
    'https://fbref.com/en/comps/20/2022-2023/2022-2023-Bundesliga-Stats',
    'https://fbref.com/en/comps/33/2022-2023/2022-2023-2-Bundesliga-Stats',
    'https://fbref.com/en/comps/13/2022-2023/2022-2023-Ligue-1-Stats',
    'https://fbref.com/en/comps/60/2022-2023/2022-2023-Ligue-2-Stats',
    'https://fbref.com/en/comps/11/2022-2023/2022-2023-Serie-A-Stats',
    'https://fbref.com/en/comps/9/2022-2023/2022-2023-Premier-League-Stats',
    'https://fbref.com/en/comps/17/2022-2023/2022-2023-Segunda-Division-Stats',
    'https://fbref.com/en/comps/12/2022-2023/2022-2023-La-Liga-Stats',
    'https://fbref.com/en/comps/18/2022-2023/2022-2023-Serie-B-Stats',
]

# Define the request rate and delay to stay below 20 requests per minute
requests_per_minute = 20
delay = 60 / requests_per_minute

# Create an empty DataFrame to store the scraped data
all_seasons_data = pd.DataFrame()

# Initialize a dictionary to keep track of last request timestamps
last_request_time = {url: 0 for url in base_urls}

start_time = time.time()

# Loop through the last 20 seasons for each league
for base_url in base_urls:
    for season in range(2003, 2023):
        season_str = f'{season}-{season+1}'
        season_url = base_url.format(season, season_str)
        
        # Calculate time elapsed since the last request to the same URL
        elapsed_time = time.time() - last_request_time[base_url]
        
        # Check if it's necessary to wait to avoid exceeding the rate limit
        if elapsed_time < delay:
            wait_time = delay - elapsed_time
            time.sleep(wait_time)
        
        # Make the request
        response = requests.get(season_url)
        
        # Update the last request time for the URL
        last_request_time[base_url] = time.time()
        
        # Check if the request was successful
        if response.status_code == 200:
            season_data = pd.read_html(response.content)[0]

            # Extract the year from the season string and add it to the DataFrame
            year = season
            season_data['Year'] = year

            # Extract the league name from the URL using a slice from -18 to -6
            league_name = season_url[-18:-6]
            season_data['League'] = league_name

            # Append the data to the main DataFrame
            all_seasons_data = all_seasons_data.append(season_data, ignore_index=True)
            
            # Print a message when data is appended
            print(f"Appended data for {season_str}")
        else:
            print(f"Failed to retrieve data for {season_str}: Status Code {response.status_code}")

# Display information about the DataFrame
all_seasons_data.info()

end_time = time.time()

# Calculate the elapsed time in seconds
elapsed_time = end_time - start_time

print(f"The cell took {elapsed_time:.6f} seconds to run.")



Appended data for 2003-2004
Appended data for 2004-2005
Appended data for 2005-2006
Appended data for 2006-2007
Appended data for 2007-2008
Appended data for 2008-2009
Appended data for 2009-2010
Appended data for 2010-2011
Appended data for 2011-2012
Appended data for 2012-2013
Appended data for 2013-2014
Appended data for 2014-2015
Appended data for 2015-2016
Appended data for 2016-2017
Appended data for 2017-2018
Appended data for 2018-2019
Appended data for 2019-2020
Appended data for 2020-2021
Appended data for 2021-2022
Appended data for 2022-2023
Appended data for 2003-2004
Appended data for 2004-2005
Appended data for 2005-2006
Appended data for 2006-2007
Appended data for 2007-2008
Appended data for 2008-2009
Appended data for 2009-2010
Appended data for 2010-2011
Appended data for 2011-2012
Appended data for 2012-2013
Appended data for 2013-2014
Appended data for 2014-2015
Appended data for 2015-2016
Appended data for 2016-2017
Appended data for 2017-2018
Appended data for 20

Data Cleaning

In [82]:
all_seasons_data.head()

Unnamed: 0,Rk,Squad,MP,W,D,L,GF,GA,GD,Pts,...,Notes,Year,League,target,GPG,GCPG,G/PG,GC/PG,xG/PG,xGA/PG
0,1,Burnley,46,29,14,3,87,35,52,101,...,,2003,Championship,0,1.89,0.76,1.89,0.76,1.44,0.83
1,2,Sheffield Utd,46,28,7,11,73,39,34,91,...,,2003,Championship,0,1.59,0.85,1.59,0.85,1.56,0.96
2,3,Luton Town,46,21,17,8,57,39,18,80,...,,2003,Championship,0,1.24,0.85,1.24,0.85,1.27,0.95
3,4,Middlesbrough,46,22,9,15,84,56,28,75,...,,2003,Championship,0,1.83,1.22,1.83,1.22,1.58,1.05
4,5,Coventry City,46,18,16,12,58,46,12,70,...,,2003,Championship,1,1.26,1.0,1.26,1.0,1.37,1.19


In [83]:
all_seasons_data['League'].value_counts()

Championship    480
Segunda         440
Ligue_1         400
Ligue_2         400
Serie_A         400
EPL             400
La_Liga         400
Serie_B         400
Bundesliga      360
2_Bundesliga    360
Name: League, dtype: int64

In [84]:
# Create a dictionary to map the keys to the desired values
league_name_mapping = {
    'Championship': 'Championship',
    'nda-Division': 'Segunda',
    '2023-Ligue-1': 'Ligue_1',
    '2023-Ligue-2': 'Ligue_2',
    '2023-Serie-A': 'Serie_A',
    'emier-League': 'EPL',
    '2023-La-Liga': 'La_Liga',
    '2023-Serie-B': 'Serie_B',
    '3-Bundesliga': 'Bundesliga',
    '2-Bundesliga': '2_Bundesliga'
}

# Rename the 'League' column values using the mapping
all_seasons_data['League'] = all_seasons_data['League'].replace(league_name_mapping)


In [85]:
all_seasons_data['League'].value_counts()

Championship    480
Segunda         440
Ligue_1         400
Ligue_2         400
Serie_A         400
EPL             400
La_Liga         400
Serie_B         400
Bundesliga      360
2_Bundesliga    360
Name: League, dtype: int64

In [86]:
all_seasons_data.columns

Index(['Rk', 'Squad', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'Pts/MP',
       'xG', 'xGA', 'xGD', 'xGD/90', 'Top Team Scorer', 'Notes', 'Year',
       'League', 'target', 'GPG', 'GCPG', 'G/PG', 'GC/PG', 'xG/PG', 'xGA/PG'],
      dtype='object')

In [87]:
# Drop the 'Goalkeeper' and 'Attendance' columns
all_seasons_data = all_seasons_data.drop(['Goalkeeper', 'Attendance'], axis=1)

KeyError: "['Goalkeeper', 'Attendance'] not found in axis"

In [88]:
all_seasons_data['Notes'].value_counts()

    1900
Name: Notes, dtype: int64

In [89]:
all_seasons_data['Rk'].value_counts()

1     200
11    200
18    200
17    200
16    200
15    200
14    200
2     200
12    200
13    200
10    200
9     200
8     200
7     200
6     200
5     200
4     200
3     200
19    160
20    160
21     40
22     40
23     20
24     20
Name: Rk, dtype: int64

In [90]:
all_seasons_data['MP'].value_counts()

38    2400
34     720
46     480
42     440
Name: MP, dtype: int64

In [91]:
# Define a function to set the 'target' column based on the conditions
def set_target(row):
    if 1 <= row['Rk'] <= 4:
        return 0
    if (row['MP'] == 38 and 17 <= row['Rk'] <= 20) or \
       (row['MP'] == 34 and 15 <= row['Rk'] <= 18) or \
       (row['MP'] == 42 and 19 <= row['Rk'] <= 22):
        return 2
    return 1

# Apply the function to create the 'target' column in 'all_seasons_data'
all_seasons_data['target'] = all_seasons_data.apply(set_target, axis=1)

# Print the resulting DataFrame
all_seasons_data

Unnamed: 0,Rk,Squad,MP,W,D,L,GF,GA,GD,Pts,...,Notes,Year,League,target,GPG,GCPG,G/PG,GC/PG,xG/PG,xGA/PG
0,1,Burnley,46,29,14,3,87,35,52,101,...,,2003,Championship,0,1.89,0.76,1.89,0.76,1.44,0.83
1,2,Sheffield Utd,46,28,7,11,73,39,34,91,...,,2003,Championship,0,1.59,0.85,1.59,0.85,1.56,0.96
2,3,Luton Town,46,21,17,8,57,39,18,80,...,,2003,Championship,0,1.24,0.85,1.24,0.85,1.27,0.95
3,4,Middlesbrough,46,22,9,15,84,56,28,75,...,,2003,Championship,0,1.83,1.22,1.83,1.22,1.58,1.05
4,5,Coventry City,46,18,16,12,58,46,12,70,...,,2003,Championship,1,1.26,1.00,1.26,1.00,1.37,1.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4035,16,Brescia,38,9,13,16,36,57,-21,40,...,,2022,Serie_B,1,0.95,1.50,0.95,1.50,0.92,1.35
4036,17,Cosenza,38,9,13,16,30,53,-23,40,...,,2022,Serie_B,2,0.79,1.39,0.79,1.39,0.88,1.21
4037,18,Perugia,38,10,9,19,40,52,-12,39,...,,2022,Serie_B,2,1.05,1.37,1.05,1.37,1.24,1.12
4038,19,SPAL,38,8,14,16,41,51,-10,38,...,,2022,Serie_B,2,1.08,1.34,1.08,1.34,1.10,1.36


In [92]:
# Add a new column 'G/PG' to the 'all_seasons_data' DataFrame
all_seasons_data['G/PG'] = (all_seasons_data['GF'] / all_seasons_data['MP']).round(2)

# Add a new column 'GC/PG' to the 'all_seasons_data' DataFrame
all_seasons_data['GC/PG'] = (all_seasons_data['GA'] / all_seasons_data['MP']).round(2)

In [93]:
# Add a new column 'xG/PG' to the 'all_seasons_data' DataFrame
all_seasons_data['xG/PG'] = (all_seasons_data['xG'] / all_seasons_data['MP']).round(2)

# Add a new column 'GC/PG' to the 'all_seasons_data' DataFrame
all_seasons_data['xGA/PG'] = (all_seasons_data['xGA'] / all_seasons_data['MP']).round(2)

In [94]:
all_seasons_data.columns

Index(['Rk', 'Squad', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'Pts/MP',
       'xG', 'xGA', 'xGD', 'xGD/90', 'Top Team Scorer', 'Notes', 'Year',
       'League', 'target', 'GPG', 'GCPG', 'G/PG', 'GC/PG', 'xG/PG', 'xGA/PG'],
      dtype='object')

In [95]:
# Drop the 'Notes' and 'Year' columns from the DataFrame
df = all_seasons_data.drop(['Notes', 'Year', 'GPG', 'GCPG'], axis=1)

# Print the resulting DataFrame
df

Unnamed: 0,Rk,Squad,MP,W,D,L,GF,GA,GD,Pts,...,xGA,xGD,xGD/90,Top Team Scorer,League,target,G/PG,GC/PG,xG/PG,xGA/PG
0,1,Burnley,46,29,14,3,87,35,52,101,...,38.2,28.0,0.61,Nathan Tella - 17,Championship,0,1.89,0.76,1.44,0.83
1,2,Sheffield Utd,46,28,7,11,73,39,34,91,...,44.3,27.4,0.60,Iliman Ndiaye - 14,Championship,0,1.59,0.85,1.56,0.96
2,3,Luton Town,46,21,17,8,57,39,18,80,...,43.8,14.4,0.31,Carlton Morris - 20,Championship,0,1.24,0.85,1.27,0.95
3,4,Middlesbrough,46,22,9,15,84,56,28,75,...,48.3,24.3,0.53,Chuba Akpom - 28,Championship,0,1.83,1.22,1.58,1.05
4,5,Coventry City,46,18,16,12,58,46,12,70,...,54.7,8.2,0.18,Viktor Gyökeres - 21,Championship,1,1.26,1.00,1.37,1.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4035,16,Brescia,38,9,13,16,36,57,-21,40,...,51.2,-16.4,-0.43,Florian Ayé - 8,Serie_B,1,0.95,1.50,0.92,1.35
4036,17,Cosenza,38,9,13,16,30,53,-23,40,...,46.1,-12.6,-0.33,"Christian D'Urso, Marco Nasti - 4",Serie_B,2,0.79,1.39,0.88,1.21
4037,18,Perugia,38,10,9,19,40,52,-12,39,...,42.4,4.9,0.13,Tiago Casasola - 9,Serie_B,2,1.05,1.37,1.24,1.12
4038,19,SPAL,38,8,14,16,41,51,-10,38,...,51.7,-9.9,-0.26,Gabriele Moncini - 9,Serie_B,2,1.08,1.34,1.10,1.36


In [96]:
# Remove all non-numeric characters from the 'Top Team Scorer' column
df['Top_Team_Scorer'] = df['Top Team Scorer'].str.replace(r'\D', '', regex=True)

# Print the resulting DataFrame
df['Top_Team_Scorer']

0       17
1       14
2       20
3       28
4       21
        ..
4035     8
4036     4
4037     9
4038     9
4039     5
Name: Top_Team_Scorer, Length: 4040, dtype: object

In [101]:
df = df.drop(['Top Team Scorer'], axis=1)

Index(['Rk', 'Squad', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'Pts/MP',
       'xG', 'xGA', 'xGD', 'xGD/90', 'League', 'target', 'G/PG', 'GC/PG',
       'xG/PG', 'xGA/PG', 'Top_Team_Scorer'],
      dtype='object')

In [102]:
df.columns

Index(['Rk', 'Squad', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'Pts/MP',
       'xG', 'xGA', 'xGD', 'xGD/90', 'League', 'target', 'G/PG', 'GC/PG',
       'xG/PG', 'xGA/PG', 'Top_Team_Scorer'],
      dtype='object')

In [103]:
all_seasons_data.to_csv('FBrefData.csv', index=False)