# Data exploration of the football-data.co.uk dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns

In [2]:
# Lift pandas' display limits so DataFrames are shown with all columns and rows
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

## France Ligue 1

In [3]:
# Root folder of the football-data.co.uk CSV files (relative path)
DATA_FOLDER = "./../../data/soccer/football-data.co.uk"
# Sub-folder of DATA_FOLDER that is listed and read in the cells below
FOLDER_NAME_FRANCE = "F"
# File-name prefix used below to select the Ligue 1 season files
LIGUE_1 = "F1"

In [4]:
# File names of the France folder in reverse alphabetical order
files = sorted(os.listdir(f"{DATA_FOLDER}/{FOLDER_NAME_FRANCE}"), reverse=True)
files

['F2-2023-2024.csv',
 'F2-2022-2023.csv',
 'F2-2021-2022.csv',
 'F2-2020-2021.csv',
 'F2-2019-2020.csv',
 'F2-2018-2019.csv',
 'F2-2017-2018.csv',
 'F2-2016-2017.csv',
 'F2-2015-2016.csv',
 'F2-2014-2015.csv',
 'F2-2013-2014.csv',
 'F2-2012-2013.csv',
 'F2-2011-2012.csv',
 'F2-2010-2011.csv',
 'F2-2009-2010.csv',
 'F2-2008-2009.csv',
 'F2-2007-2008.csv',
 'F2-2006-2007.csv',
 'F2-2005-2006.csv',
 'F2-2004-2005.csv',
 'F2-2003-2004.csv',
 'F2-2002-2003.csv',
 'F2-2001-2002.csv',
 'F2-2000-2001.csv',
 'F2-1999-2000.csv',
 'F2-1998-1999.csv',
 'F2-1997-1998.csv',
 'F2-1996-1997.csv',
 'F1-2023-2024.csv',
 'F1-2022-2023.csv',
 'F1-2021-2022.csv',
 'F1-2020-2021.csv',
 'F1-2019-2020.csv',
 'F1-2018-2019.csv',
 'F1-2017-2018.csv',
 'F1-2016-2017.csv',
 'F1-2015-2016.csv',
 'F1-2014-2015.csv',
 'F1-2013-2014.csv',
 'F1-2012-2013.csv',
 'F1-2011-2012.csv',
 'F1-2010-2011.csv',
 'F1-2009-2010.csv',
 'F1-2008-2009.csv',
 'F1-2007-2008.csv',
 'F1-2006-2007.csv',
 'F1-2005-2006.csv',
 'F1-2004-200

In [5]:
# Aggregate all Ligue 1 seasons into a single DataFrame
files = sorted(os.listdir(f"{DATA_FOLDER}/{FOLDER_NAME_FRANCE}"), reverse=True)

# Read every season first and concatenate once: calling pd.concat inside the
# loop copies the accumulated frame again on every iteration and starts from
# an empty DataFrame.
season_frames = []
for file in files:
    if file.startswith(LIGUE_1):
        print(file)
        df = pd.read_csv(f"{DATA_FOLDER}/{FOLDER_NAME_FRANCE}/{file}")
        season_frames.append(df)

# join='outer' keeps the union of the columns found across the season files.
# ignore_index=True gives every row a unique label instead of repeating each
# file's own 0..n-1 index.
if season_frames:
    df_france_ligue_1 = pd.concat(season_frames, join='outer', ignore_index=True)
else:
    # No matching file: keep the original result (an empty DataFrame)
    df_france_ligue_1 = pd.DataFrame()

df_france_ligue_1

F1-2023-2024.csv
F1-2022-2023.csv
F1-2021-2022.csv
F1-2020-2021.csv
F1-2019-2020.csv
F1-2018-2019.csv
F1-2017-2018.csv
F1-2016-2017.csv
F1-2015-2016.csv
F1-2014-2015.csv
F1-2013-2014.csv
F1-2012-2013.csv
F1-2011-2012.csv
F1-2010-2011.csv
F1-2009-2010.csv
F1-2008-2009.csv
F1-2007-2008.csv
F1-2006-2007.csv
F1-2005-2006.csv
F1-2004-2005.csv
F1-2003-2004.csv
F1-2002-2003.csv
F1-2001-2002.csv
F1-2000-2001.csv
F1-1999-2000.csv
F1-1998-1999.csv
F1-1997-1998.csv
F1-1996-1997.csv
F1-1995-1996.csv
F1-1994-1995.csv
F1-1993-1994.csv


In [None]:
# Preview the first 10 rows of the aggregated Ligue 1 data
df_france_ligue_1.head(10)

In [None]:
# Summary statistics of the aggregated data
df_france_ligue_1.describe()

In [None]:
# Print pandas' structural summary of the aggregated frame
df_france_ligue_1.info()

In [None]:
# Number of missing values in each column
print(df_france_ligue_1.isnull().sum())

In [None]:
# Drop the columns whose name starts with "Unnamed"
is_unnamed = df_france_ligue_1.columns.str.contains('^Unnamed')
df_france_ligue_1 = df_france_ligue_1.loc[:, ~is_unnamed]

In [None]:
# Preview the first 20 rows after the column clean-up
df_france_ligue_1.head(20)

In [None]:
# Ensure that the date column is in datetime format.
# format='mixed' infers the format of every value separately; without
# dayfirst=True an ambiguous value such as 05/08/2023 is read month-first.
# NOTE(review): football-data.co.uk publishes day-first dates (dd/mm/yy or
# dd/mm/yyyy) - confirm against the raw CSV files.
df_france_ligue_1['Date'] = pd.to_datetime(df_france_ligue_1['Date'], format='mixed', dayfirst=True)

# Extract the calendar year from the date
df_france_ligue_1['year'] = df_france_ligue_1['Date'].dt.year

# Group by year and count non-null values for each column
df_count = df_france_ligue_1.groupby('year').count()


In [None]:

# One x position per dataset column, one stacked segment per year
df_count_transposed = df_count.T

# Stacked bar chart of the non-null counts
ax = df_count_transposed.plot(kind='bar', stacked=True, figsize=(30, 10))
ax.set_title('Number of non-null values per column and year')
ax.set_xlabel('Column')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Boolean frame marking the missing cells, whose 'year' column holds the
# match year so that it can be used as the grouping key
df_null = df_france_ligue_1.isnull().assign(year=df_france_ligue_1['Date'].dt.year)

# Summing the booleans per year gives the number of nulls per year and column
df_null_count = df_null.groupby('year').sum()

In [None]:
# Transpose the per-year null counts: one x position per dataset column
df_count_transposed = df_null_count.transpose()

# Plot. The title names *null* values because this chart is built from
# df_null_count, not from the non-null counts.
df_count_transposed.plot(kind='bar', stacked=True, figsize=(30, 10))
plt.title('Number of null values per column and year')
plt.xlabel('Column')
plt.ylabel('Count')
plt.show()

In [None]:
# Check whether the full-time result column 'FTR' is present
col_name = list(df_france_ligue_1.columns)
'FTR' in col_name

In [None]:
def _missing_pct(year_rows):
    """Return, for one year's rows, the percentage of missing values per column."""
    return year_rows.isnull().sum() / len(year_rows) * 100


# Year x column matrix of missing-value percentages, truncated to integers
df_france_ligue_1_null_mat = df_france_ligue_1.groupby('year').apply(_missing_pct).astype(int)
df_france_ligue_1_null_mat

In [None]:
# Quick image view of the missing-value percentage matrix
plt.imshow(df_france_ligue_1_null_mat)

In [None]:
# Heatmap of the missing-value percentages (rows: years, columns: dataset columns)
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df_france_ligue_1_null_mat, cmap='Blues', cbar=True, ax=ax)

ax.set_title('Percentage of missing values per year and column')
ax.set_xlabel('Column')
ax.set_ylabel('Year')
plt.show()

### 2006-2024 seasons L1 France

In [None]:
# Keep the matches whose calendar year lies in 2006..2024 (both bounds included)
in_period = df_france_ligue_1['year'].between(2006, 2024)
df_france_ligue_1_2006_2024 = df_france_ligue_1[in_period]

In [None]:
# Preview the first 10 rows of the 2006-2024 subset
df_france_ligue_1_2006_2024.head(10)

In [None]:
# Remove the columns with 5% or more missing values (keep those below 5%).
# .copy() makes the result an independent DataFrame, so that the columns
# assigned to it afterwards are not written to a slice of another frame
# (which pandas reports as SettingWithCopyWarning).
df_france_ligue_1_2006_2024 = df_france_ligue_1_2006_2024.loc[:, df_france_ligue_1_2006_2024.isnull().mean() < .05].copy()


In [None]:
# Print pandas' structural summary of the filtered subset
df_france_ligue_1_2006_2024.info()

In [None]:
# First 5 rows of the filtered subset
df_france_ligue_1_2006_2024.head(5)

In [None]:
# Last 5 rows of the filtered subset
df_france_ligue_1_2006_2024.tail(5)

In [None]:
# Number of nulls per column for each year of the subset
df_france_ligue_1_2006_2024.groupby('year').apply(lambda x: x.isnull().sum())


In [None]:
# Add a new column telling whether Bet365's favourite matched the result
# (1 if yes, 0 if no).
# The B365H/B365D/B365A columns hold odds, and the favourite is the outcome
# with the LOWEST odds: this is the same rule as calculate_win, which compares
# the inverse odds with '>'. Comparing the raw odds with '>' would instead
# flag the matches won by the least likely outcome.
_ftr = df_france_ligue_1_2006_2024['FTR']
_odd_h = df_france_ligue_1_2006_2024['B365H']
_odd_d = df_france_ligue_1_2006_2024['B365D']
_odd_a = df_france_ligue_1_2006_2024['B365A']

_favourite_won = (
    ((_ftr == 'H') & (_odd_h < _odd_d) & (_odd_h < _odd_a))
    | ((_ftr == 'A') & (_odd_a < _odd_d) & (_odd_a < _odd_h))
    | ((_ftr == 'D') & (_odd_d < _odd_h) & (_odd_d < _odd_a))
)
# Strict comparisons: a tie for the lowest odds counts as no prediction (0)
df_france_ligue_1_2006_2024['bet365_win'] = np.where(_favourite_won, 1, 0)


In [None]:
# List the columns that are left in the subset
print(list(df_france_ligue_1_2006_2024.columns))

In [None]:
def calculate_win(row, bookmaker="B365", result_by_outcome=False):
    """Tell whether the bookmaker's favourite matched the full-time result.

    The favourite is the outcome whose inverse odd (1 / odd) is strictly
    greater than the inverse odds of the two other outcomes; when two
    outcomes tie for the top, nothing is counted.

    Args:
        row: mapping-like match record providing "FTR" and the
            "<bookmaker>H", "<bookmaker>D" and "<bookmaker>A" odds.
        bookmaker: column prefix of the bookmaker to evaluate.
        result_by_outcome: when true, return the [home, draw, away] flags
            instead of their sum.

    Returns:
        A list of three 0/1 flags if ``result_by_outcome`` is true, otherwise
        their sum (1 if the favourite was the actual result, else 0).
    """
    outcome = row["FTR"]
    # Inverse odds, read in home / away / draw order
    implied = {
        side: 1 / float(row[f"{bookmaker}{side}"]) for side in ("H", "A", "D")
    }

    hits = []
    for side in ("H", "D", "A"):
        is_hit = outcome == side and all(
            implied[side] > implied[other] for other in implied if other != side
        )
        hits.append(1 if is_hit else 0)

    return hits if result_by_outcome else sum(hits)

In [None]:
# 'MEAN' pseudo-bookmaker: for each outcome (H, D, A), the arithmetic mean of
# the odds of the five bookmakers B365, BW, IW, WH and VC
for _outcome in ('H', 'D', 'A'):
    df_france_ligue_1_2006_2024[f'MEAN{_outcome}'] = (
        df_france_ligue_1_2006_2024[f'B365{_outcome}']
        + df_france_ligue_1_2006_2024[f'BW{_outcome}']
        + df_france_ligue_1_2006_2024[f'IW{_outcome}']
        + df_france_ligue_1_2006_2024[f'WH{_outcome}']
        + df_france_ligue_1_2006_2024[f'VC{_outcome}']
    ) / 5

In [None]:
bookmakers = ['B365', 'BW', 'IW', 'WH', 'VC', 'MEAN']
accuracy_overall = []
n_matches = len(df_france_ligue_1_2006_2024)
for bookmaker in bookmakers:
    win_column = f'{bookmaker}_win'
    # 1 when the bookmaker's favourite was the full-time result, else 0
    df_france_ligue_1_2006_2024[win_column] = df_france_ligue_1_2006_2024.apply(
        calculate_win, axis=1, args=(bookmaker,)
    )
    # Share of all matches for which the favourite was the actual result
    accuracy = df_france_ligue_1_2006_2024[win_column].sum() / n_matches
    print(f'{bookmaker} accuracy: {accuracy}')
    accuracy_overall.append(accuracy)

In [None]:
# Bar chart of the overall accuracy, one bar per bookmaker
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bookmakers, accuracy_overall)
ax.set(ylabel='Accuracy', title='Accuracy of bookmakers')
plt.show()

In [None]:
accuracy_home = []
accuracy_draw = []
accuracy_away = []

print('unbalanced accuracy')
total_matches = len(df_france_ligue_1_2006_2024)
for bookmaker in bookmakers:
    # One [home, draw, away] flag list per match, transposed into one
    # sequence per outcome
    hits_per_match = df_france_ligue_1_2006_2024.apply(calculate_win, axis=1, args=(bookmaker, True))
    home_hits, draw_hits, away_hits = zip(*hits_per_match)
    df_france_ligue_1_2006_2024[f'{bookmaker}_home_win'] = home_hits
    df_france_ligue_1_2006_2024[f'{bookmaker}_draw'] = draw_hits
    df_france_ligue_1_2006_2024[f'{bookmaker}_away_win'] = away_hits

    # Correct predictions of each outcome, as a share of ALL matches
    accuracy_home_ = df_france_ligue_1_2006_2024[f'{bookmaker}_home_win'].sum() / total_matches
    accuracy_draw_ = df_france_ligue_1_2006_2024[f'{bookmaker}_draw'].sum() / total_matches
    accuracy_away_ = df_france_ligue_1_2006_2024[f'{bookmaker}_away_win'].sum() / total_matches
    accuracy_home.append(accuracy_home_)
    accuracy_draw.append(accuracy_draw_)
    accuracy_away.append(accuracy_away_)

    print(f'{bookmaker} accuracy home: {accuracy_home_}')
    print(f'{bookmaker} accuracy draw: {accuracy_draw_}')
    print(f'{bookmaker} accuracy away: {accuracy_away_}')


In [None]:
# Stacked bars: home, draw and away shares piled on top of each other
fig, ax = plt.subplots(figsize=(10, 6))
stack_base = np.zeros(len(bookmakers))
for label, shares in (('Home', accuracy_home), ('Draw', accuracy_draw), ('Away', accuracy_away)):
    ax.bar(bookmakers, shares, label=label, bottom=stack_base)
    # The next series starts where this one ends
    stack_base = stack_base + np.array(shares)

ax.set_ylabel('Accuracy')
ax.set_title('Accuracy of bookmakers by outcome')
ax.legend()
plt.show()

In [None]:
# Baseline: bet on a uniformly random outcome for every match
outcomes = ['H', 'D', 'A']
df_france_ligue_1_2006_2024['random_bet'] = np.random.choice(outcomes, len(df_france_ligue_1_2006_2024))
# 1 when the random pick equals the full-time result, else 0
random_hit = df_france_ligue_1_2006_2024['random_bet'] == df_france_ligue_1_2006_2024['FTR']
df_france_ligue_1_2006_2024['random_win'] = random_hit.astype(int)
# Share of matches won by the random baseline
df_france_ligue_1_2006_2024['random_win'].sum()/len(df_france_ligue_1_2006_2024['random_win'])

In [None]:
# Baseline "always bet on the home team": 1 when the result is 'H', else 0
df_france_ligue_1_2006_2024['home_bet'] = (df_france_ligue_1_2006_2024['FTR'] == 'H').astype(int)
# Share of matches won by this baseline
df_france_ligue_1_2006_2024['home_bet'].sum()/len(df_france_ligue_1_2006_2024['home_bet'])

In [None]:
# Baseline "always bet on the away team": 1 when the result is 'A', else 0
df_france_ligue_1_2006_2024['away_bet'] = (df_france_ligue_1_2006_2024['FTR'] == 'A').astype(int)
# Share of matches won by this baseline
df_france_ligue_1_2006_2024['away_bet'].sum()/len(df_france_ligue_1_2006_2024['away_bet'])

In [None]:
# Baseline "always bet on the draw": 1 when the result is 'D', else 0
df_france_ligue_1_2006_2024['draw_bet'] = (df_france_ligue_1_2006_2024['FTR'] == 'D').astype(int)
# Share of matches won by this baseline
df_france_ligue_1_2006_2024['draw_bet'].sum()/len(df_france_ligue_1_2006_2024['draw_bet'])

In [None]:
# Share of matches whose full-time result is a home win ('H')
len(df_france_ligue_1_2006_2024[df_france_ligue_1_2006_2024['FTR'] == 'H'])/len(df_france_ligue_1_2006_2024)

In [None]:
print('balanced accuracy')
balanced_accuracy_home = []
balanced_accuracy_away = []
balanced_accuracy_draw = []

# Number of matches that actually ended with each result
full_time_result = df_france_ligue_1_2006_2024['FTR']
n_home_results = len(df_france_ligue_1_2006_2024[full_time_result == 'H'])
n_draw_results = len(df_france_ligue_1_2006_2024[full_time_result == 'D'])
n_away_results = len(df_france_ligue_1_2006_2024[full_time_result == 'A'])

for bookmaker in bookmakers:
    # One [home, draw, away] flag list per match, transposed into one
    # sequence per outcome
    hits_per_match = df_france_ligue_1_2006_2024.apply(calculate_win, axis=1, args=(bookmaker, True))
    home_hits, draw_hits, away_hits = zip(*hits_per_match)
    df_france_ligue_1_2006_2024[f'{bookmaker}_home_win'] = home_hits
    df_france_ligue_1_2006_2024[f'{bookmaker}_draw'] = draw_hits
    df_france_ligue_1_2006_2024[f'{bookmaker}_away_win'] = away_hits

    # Correct predictions of each outcome, divided by the number of matches
    # that ended with that outcome (not by the total number of matches)
    accuracy_home_ = df_france_ligue_1_2006_2024[f'{bookmaker}_home_win'].sum() / n_home_results
    accuracy_draw_ = df_france_ligue_1_2006_2024[f'{bookmaker}_draw'].sum() / n_draw_results
    accuracy_away_ = df_france_ligue_1_2006_2024[f'{bookmaker}_away_win'].sum() / n_away_results
    balanced_accuracy_home.append(accuracy_home_)
    balanced_accuracy_draw.append(accuracy_draw_)
    balanced_accuracy_away.append(accuracy_away_)

    print(f'{bookmaker} accuracy home: {accuracy_home_}')
    print(f'{bookmaker} accuracy draw: {accuracy_draw_}')
    print(f'{bookmaker} accuracy away: {accuracy_away_}')

In [None]:
# Display the list of evaluated bookmakers
bookmakers

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))
# Bar width
bar_width = 0.3

# Bar positions: one group per bookmaker, the three outcomes side by side
r1 = np.arange(len(bookmakers))
r2 = r1 + bar_width
r3 = r2 + bar_width

grouped_series = (
    (r1, balanced_accuracy_home, 'Home'),
    (r2, balanced_accuracy_draw, 'Draw'),
    (r3, balanced_accuracy_away, 'Away'),
)
for positions, values, label in grouped_series:
    ax.bar(positions, values, width=bar_width, label=label)

# Put each tick on the middle (draw) bar of its group
ax.set_xticks(r2)
ax.set_xticklabels(bookmakers)

ax.set_ylabel('Accuracy')
ax.set_title('Accuracy of bookmakers by outcome')
ax.legend()
plt.show()

In [None]:
# Final list of columns, including the indicator columns added above
df_france_ligue_1_2006_2024.columns