In [1]:
import csv
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OrdinalEncoder

from constants import *

# Embeds graphs in Jupyter notebook (instead of pop-ups)
%matplotlib inline

# pd.set_option('display.max_columns', None) # show all columns

In [2]:
# Country codes that prefix the raw input file names
COUNTRIES = [
    'ENG',
    'SPA',
    'ITA',
    'GER',
    'FRA',
]

In [3]:
# Read every available raw score file. Frames are keyed by "<year>_<country>";
# a missing file is reported and skipped.
individual_score_dfs = {}

for league in COUNTRIES:
    for season in range(2019, 2024):
        scores_path = f"./data/scores/raw/{league}{season}_scores.csv"
        try:
            season_scores = pd.read_csv(scores_path, encoding='utf-8')
        except FileNotFoundError as e:
            print(e)
        else:
            individual_score_dfs[f"{season}_{league}"] = season_scores

# Stack all loaded files into one frame with a fresh 0..n-1 index
raw_scores_df = pd.concat(individual_score_dfs.values(), ignore_index=True)

In [4]:
# Load one team-stats file per country and year, remove the columns that are
# entirely empty in that file, then stack everything into `main_df`.
dropped_cols = []    # all-null columns removed from at least one file, first-seen order
individual_dfs = {}  # "<year>_<country>" -> cleaned per-file frame

for country in COUNTRIES:
    for year in range(2018, 2023):
        teams_path = f"./data/teams/raw/{country}{year}_teams_for_{year+1}.csv"
        try:
            team_stats = pd.read_csv(teams_path, encoding='utf-8')
        except FileNotFoundError:
            # Report the country and path as well: the year alone is ambiguous
            # because every country is scanned over the same range of years.
            print(f"File not found for {country} {year}: {teams_path}")
            continue

        cols_to_drop = team_stats.columns[team_stats.isnull().all()].tolist()
        individual_dfs[f"{year}_{country}"] = team_stats.drop(columns=cols_to_drop)
        for col in cols_to_drop:
            if col not in dropped_cols:
                dropped_cols.append(col)

if individual_dfs:
    # join="inner" keeps only the columns shared by every loaded file
    main_df = pd.concat(
        individual_dfs.values(),
        join="inner",
        ignore_index=True
    )
    print(f"Dropped cols: {dropped_cols}")
else:
    # NOTE: `main_df` stays undefined in this case, so later cells cannot run
    print("No dataframes found for any year")

Dropped cols: []


In [5]:
# Team names used by the stats table vs. the names used by the scores table
teams_perf = set(main_df['squad'])
teams_scores = set(raw_scores_df['home_team']) | set(raw_scores_df['away_team'])

# Names found on one side only (stats-only first, then scores-only)
old_names = sorted(teams_perf.difference(teams_scores))
new_names = sorted(teams_scores.difference(teams_perf))

for unmatched_names in (old_names, new_names):
    print(f'{len(unmatched_names)} conflicts: {unmatched_names}')

34 conflicts: ['Ajaccio', 'Alavés', 'Almería', 'Arminia', 'Athletic Club', 'Atlético Madrid', 'Clermont Foot', 'Cádiz', 'Darmstadt 98', 'Düsseldorf', 'Eint Frankfurt', 'Granada', 'Greuther Fürth', 'Hellas Verona', 'Hertha BSC', 'Köln', 'Leeds United', 'Leganés', 'Leicester City', 'Leverkusen', 'Luton Town', "M'Gladbach", 'Mainz 05', 'Milan', 'Newcastle Utd', 'Norwich City', "Nott'ham Forest", 'Nîmes', 'Paderborn 07', 'Paris S-G', 'Roma', 'SPAL', 'Saint-Étienne', 'Schalke 04']
39 conflicts: ['AC Ajaccio', 'AC Milan', 'AS Roma', 'Alaves', 'Almeria', 'Arminia Bielefeld', 'Ath Bilbao', 'Atl. Madrid', 'B. Monchengladbach', 'Bayer Leverkusen', 'Cadiz CF', 'Clermont', 'Darmstadt', 'Dusseldorf', 'Eintracht Frankfurt', 'FC Koln', 'Granada CF', 'Grenoble', 'Greuther Furth', 'Hamburger SV', 'Hertha Berlin', 'Holstein Kiel', 'Leeds', 'Leganes', 'Leicester', 'Luton', 'Mainz', 'Newcastle', 'Nimes', 'Norwich', 'Nottingham', 'PSG', 'Paderborn', 'Paris FC', 'Schalke', 'Sochaux', 'Spal', 'St Etienne', '

In [6]:
# Stats-table squad name -> spelling used in the scores data.
# Keys are listed in sorted order.
name_mapping = {
    "Ajaccio": "AC Ajaccio",
    "Alavés": "Alaves",
    "Almería": "Almeria",
    "Arminia": "Arminia Bielefeld",
    "Athletic Club": "Ath Bilbao",
    "Atlético Madrid": "Atl. Madrid",
    "Clermont Foot": "Clermont",
    "Cádiz": "Cadiz CF",
    "Darmstadt 98": "Darmstadt",
    "Düsseldorf": "Dusseldorf",
    "Eint Frankfurt": "Eintracht Frankfurt",
    "Granada": "Granada CF",
    "Greuther Fürth": "Greuther Furth",
    "Hellas Verona": "Verona",
    "Hertha BSC": "Hertha Berlin",
    "Köln": "FC Koln",
    "Leeds United": "Leeds",
    "Leganés": "Leganes",
    "Leicester City": "Leicester",
    "Leverkusen": "Bayer Leverkusen",
    "Luton Town": "Luton",
    "M'Gladbach": "B. Monchengladbach",
    "Mainz 05": "Mainz",
    "Milan": "AC Milan",
    "Newcastle Utd": "Newcastle",
    "Norwich City": "Norwich",
    "Nott'ham Forest": "Nottingham",
    "Nîmes": "Nimes",
    "Paderborn 07": "Paderborn",
    "Paris S-G": "PSG",
    "Roma": "AS Roma",
    "SPAL": "Spal",
    "Saint-Étienne": "St Etienne",
    "Schalke 04": "Schalke",
}

UNRESOLVED CONFLICTS FOR TEAMS: 

['Grenoble', 'Hamburger SV', 'Holstein Kiel', 'Paris FC', 'Sochaux'] from `raw_scores_df` → These teams took part in promotion playoffs, and their playoff results ended up mixed in with the regular matchday scores. For example, https://www.flashscore.com/football/france/ligue-1-2020-2021/results/ lists such results for Grenoble and Paris FC. These matches are not league fixtures and should be filtered out.

These teams did not make it to the 1st division in any season, so they are flagged immediately as names that have no counterpart in the team stats.

In [7]:
# Rewrite the stats-table squad names to the spelling used in the scores data
renamed_squads = main_df['squad'].replace(to_replace=name_mapping)
main_df['squad'] = renamed_squads

In [8]:
# Teams present in the scores data that have no entry in the team stats
extra_teams = ['Grenoble', 'Hamburger SV', 'Holstein Kiel', 'Paris FC', 'Sochaux']

# Keep only the matches in which neither side is one of those teams
involves_extra_team = (
    raw_scores_df['home_team'].isin(extra_teams)
    | raw_scores_df['away_team'].isin(extra_teams)
)
raw_scores_df = raw_scores_df[~involves_extra_team]

In [9]:
# Columns with at least one missing value, shown for the rows that have a gap
null_cols = main_df.columns[main_df.isna().any(axis=0)]
rows_with_gaps = main_df.isna().any(axis=1)
main_df.loc[rows_with_gaps, ['squad', 'season_start_year', *null_cols]]

Unnamed: 0,squad,season_start_year,gk_pens_save_pct,gk_passes_length_avg,gk_avg_distance_def_actions
8,Liverpool,2018,,28.8,16.6
19,Sheffield Utd,2018,,45.2,10.1
25,Crystal Palace,2019,,48.9,13.2
40,Arsenal,2020,33.3,28.2,
69,Liverpool,2021,,22.7,17.1
102,Atl. Madrid,2018,0.0,33.3,
122,Atl. Madrid,2019,0.0,30.2,
135,Valladolid,2019,12.5,49.5,
200,Atalanta,2018,0.0,50.0,
224,Genoa,2019,0.0,,


In [10]:
# Report the size first, then remove every column that has a missing value
print(f"Initial shape: {main_df.shape}")
main_df = main_df.drop(columns=null_cols)
main_df

Initial shape: (488, 184)


Unnamed: 0,squad,players_used,avg_age,possession,games,games_starts,minutes,minutes_90s,goals,assists,...,offsides,pens_won,pens_conceded,own_goals,ball_recoveries,aerials_won,aerials_lost,aerials_won_pct,season_start_year,season_end_year
0,Arsenal,28,26.7,58.1,38,418,3420,38.0,69,52,...,89,4,7,1,2000,555,637,46.6,2018,2019
1,Bournemouth,28,26.6,46.9,38,418,3420,38.0,55,43,...,57,8,5,3,1910,701,736,48.8,2018,2019
2,Brighton,21,27.4,42.4,38,418,3420,38.0,35,24,...,90,5,10,0,1979,824,780,51.4,2018,2019
3,Burnley,23,28.0,41.2,38,418,3420,38.0,43,32,...,106,1,3,4,1955,1016,1025,49.8,2018,2019
4,Chelsea,24,27.3,62.9,38,418,3420,38.0,61,52,...,83,5,2,0,2171,523,504,50.9,2018,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,Rennes,32,25.2,55.4,38,418,3420,38.0,66,47,...,54,3,5,1,2035,538,520,50.9,2022,2023
484,Strasbourg,29,27.3,45.9,38,418,3420,38.0,48,28,...,43,6,11,0,2132,677,573,54.2,2022,2023
485,Toulouse,29,25.1,51.3,38,418,3420,38.0,50,33,...,74,2,7,3,2168,483,460,51.2,2022,2023
486,Le Havre,35,25.2,55.7,38,418,3420,38.0,43,32,...,79,8,1,2,2251,602,542,52.6,2022,2023


Some other teams did play in the 1st division in other seasons, but also appear in playoff matches for a season in which they have no team stats. Their names were not flagged by the name check above, but these playoff results must be removed as well.

In [11]:
# Every (season, team) pair that appears in the scores data, home or away
home_df = raw_scores_df[['season_start_year', 'home_team']].rename(
    columns={'home_team': 'squad', 'season_start_year': 'matches_start_year'}
)
away_df = raw_scores_df[['season_start_year', 'away_team']].rename(
    columns={'away_team': 'squad', 'season_start_year': 'matches_start_year'}
)

all_teams_df = pd.concat([home_df, away_df], ignore_index=True)
all_teams_df = all_teams_df.drop_duplicates(subset=['matches_start_year', 'squad'])

# A season's matches are paired with the stats row whose `season_end_year`
# equals the year the matches start. The outer join keeps pairs that have no
# partner on the other side, so they show up below as rows with nulls.
merged_df = pd.merge(
    all_teams_df,
    main_df,
    how='outer',
    left_on=['matches_start_year', 'squad'],
    right_on=['season_end_year', 'squad'],
)

# `null_cols` is deliberately not reassigned here: the value was never used,
# and overwriting it breaks a re-run of the earlier column-dropping cell.
merged_df.loc[merged_df.isnull().any(axis=1)]

Unnamed: 0,matches_start_year,squad,players_used,avg_age,possession,games,games_starts,minutes,minutes_90s,goals,...,offsides,pens_won,pens_conceded,own_goals,ball_recoveries,aerials_won,aerials_lost,aerials_won_pct,season_start_year,season_end_year
318,2019,Heidenheim,,,,,,,,,...,,,,,,,,,,
431,2020,Toulouse,,,,,,,,,...,,,,,,,,,,
452,2021,Auxerre,,,,,,,,,...,,,,,,,,,,


- Heidenheim vs Werder Bremen was the playoff in 2019/20 on 03/07 and 07/07
- Auxerre vs St Etienne was the playoff in 2021/22 on 27/05 and 30/05
- Toulouse vs Nantes was the playoff in 2020/21 on 28/05 and 31/05

Hence, all matches of these teams in the listed seasons can be dropped.

In [12]:
# (team, season_start_year) pairs whose matches in that season are removed
playoff_entries = [('Heidenheim', 2019), ('Toulouse', 2020), ('Auxerre', 2021)]

# Start by keeping every match, then clear the flag for each listed pair
keep_match = pd.Series(True, index=raw_scores_df.index)
for playoff_team, playoff_year in playoff_entries:
    team_involved = (
        (raw_scores_df['home_team'] == playoff_team)
        | (raw_scores_df['away_team'] == playoff_team)
    )
    in_season = raw_scores_df['season_start_year'] == playoff_year
    keep_match &= ~(team_involved & in_season)

# Shape before and after, to show how many matches were removed
print(raw_scores_df.shape)
raw_scores_df = raw_scores_df[keep_match]
print(raw_scores_df.shape)
display(raw_scores_df)

(8539, 8)
(8533, 8)


Unnamed: 0,time,home_team,home_score,away_team,away_score,season_start_year,season_end_year,correct_year
0,2019-06-18 01:00:00,Aston Villa,0,Sheffield Utd,0,2019,2020,2019
1,2019-06-18 03:15:00,Manchester City,3,Arsenal,0,2019,2020,2019
2,2019-06-20 01:00:00,Norwich,0,Southampton,3,2019,2020,2019
3,2019-06-20 03:15:00,Tottenham,1,Manchester Utd,1,2019,2020,2019
4,2019-06-20 19:30:00,Watford,1,Leicester,1,2019,2020,2019
...,...,...,...,...,...,...,...,...
8544,2024-03-17 22:00:00,Reims,2,Metz,1,2023,2024,2024
8545,2024-03-17 22:00:00,Monaco,2,Lorient,2,2023,2024,2024
8546,2024-03-17 22:00:00,Clermont,2,Le Havre,1,2023,2024,2024
8547,2024-03-18 00:05:00,Rennes,2,Marseille,0,2023,2024,2024


In [15]:
# Size and preview of the cleaned team-stats table
n_rows, n_cols = main_df.shape
print((n_rows, n_cols))
display(main_df)

(488, 181)


Unnamed: 0,squad,players_used,avg_age,possession,games,games_starts,minutes,minutes_90s,goals,assists,...,offsides,pens_won,pens_conceded,own_goals,ball_recoveries,aerials_won,aerials_lost,aerials_won_pct,season_start_year,season_end_year
0,Arsenal,28,26.7,58.1,38,418,3420,38.0,69,52,...,89,4,7,1,2000,555,637,46.6,2018,2019
1,Bournemouth,28,26.6,46.9,38,418,3420,38.0,55,43,...,57,8,5,3,1910,701,736,48.8,2018,2019
2,Brighton,21,27.4,42.4,38,418,3420,38.0,35,24,...,90,5,10,0,1979,824,780,51.4,2018,2019
3,Burnley,23,28.0,41.2,38,418,3420,38.0,43,32,...,106,1,3,4,1955,1016,1025,49.8,2018,2019
4,Chelsea,24,27.3,62.9,38,418,3420,38.0,61,52,...,83,5,2,0,2171,523,504,50.9,2018,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,Rennes,32,25.2,55.4,38,418,3420,38.0,66,47,...,54,3,5,1,2035,538,520,50.9,2022,2023
484,Strasbourg,29,27.3,45.9,38,418,3420,38.0,48,28,...,43,6,11,0,2132,677,573,54.2,2022,2023
485,Toulouse,29,25.1,51.3,38,418,3420,38.0,50,33,...,74,2,7,3,2168,483,460,51.2,2022,2023
486,Le Havre,35,25.2,55.7,38,418,3420,38.0,43,32,...,79,8,1,2,2251,602,542,52.6,2022,2023


In [2]:
HOME = "home"
AWAY = "away"
SIDES = (HOME, AWAY)

# Aggregate columns: overall totals first, then a home-only and an away-only
# copy of each total.
base_stat_cols = ["points", "goals_for", "goals_against"]
results_df_cols = base_stat_cols + [f"{col}_{side}" for side in SIDES for col in base_stat_cols]


def _team_match_rows(scores, side):
    """Return one row per match, described from the `side` team's point of view.

    Parameters
    ----------
    scores : pd.DataFrame
        Match table with `home_team`, `away_team`, `home_score`, `away_score`,
        `season_start_year` and `season_end_year` columns.
    side : str
        HOME or AWAY: which team of each match the rows describe.

    Returns
    -------
    pd.DataFrame
        Indexed like `scores`, holding the team name, the two season columns
        and every column of `results_df_cols`. The columns belonging to the
        other side are 0.
    """
    opp_side = HOME if side == AWAY else AWAY
    goals_for = scores[f"{side}_score"]
    goals_against = scores[f"{opp_side}_score"]

    rows = pd.DataFrame({
        "team_name": scores[f"{side}_team"],
        "season_start_year": scores["season_start_year"],
        "season_end_year": scores["season_end_year"],
        # 3 points for a win, 1 for a draw, 0 for a loss
        "points": (goals_for > goals_against) * 3 + (goals_for == goals_against) * 1,
        "goals_for": goals_for,
        "goals_against": goals_against,
    })
    for stat_side in SIDES:
        for col in base_stat_cols:
            rows[f"{col}_{stat_side}"] = rows[col] if stat_side == side else 0
    return rows


# Aggregate all matches at once instead of updating a table row by row.
# NOTE(review): assumes scores, team names and season years have no missing
# values, as in the data shown in this notebook — confirm for new data.
match_scores = raw_scores_df.reset_index(drop=True)  # index = match order

# Interleave both views so that each match contributes its home row and then
# its away row. The stable sort keeps home ahead of away for equal labels.
team_match_rows = pd.concat(
    [_team_match_rows(match_scores, side) for side in SIDES]
).sort_index(kind="mergesort")

# sort=False keeps the (team, season) groups in order of first appearance
results_df = (
    team_match_rows
    .groupby(["team_name", "season_start_year", "season_end_year"], sort=False)[results_df_cols]
    .sum()
    .reset_index()
)

In [3]:
# Write the cleaned tables to disk without their row index
for frame, csv_path in (
    (raw_scores_df, "./data/scores/scores.csv"),
    (results_df, "./data/scores/results.csv"),
    (main_df, "./data/teams/teams.csv"),
):
    frame.to_csv(csv_path, index=False)