In [2]:
import pandas as pd

In [3]:
# Load the match-level dataset; the cells below read its Date, FTR, team and match-stat columns.
df = pd.read_csv('final_data.csv')

In [8]:
# Season labels mapped to (first day, last day), both as ISO 'YYYY-MM-DD' strings.
# Every season runs from 20 July of its starting year to 15 June of the following
# year; entries go from the most recent (2024/2025) back to the oldest (2019/2020).
seasons = {
    f'S{start_year}/{start_year + 1}': (f'{start_year}-07-20', f'{start_year + 1}-06-15')
    for start_year in range(2024, 2018, -1)
}

# Convert date column to datetime so each value exposes .strftime, which the season lookup relies on.
# NOTE(review): no explicit format/dayfirst is passed, so parsing relies on pandas' inference —
# confirm it matches the date layout stored in the CSV.
df['Date'] = pd.to_datetime(df['Date'])

# Function to determine season based on date
def determine_season(date, season_dict):
    """Return the label of the season whose date range contains ``date``.

    Parameters
    ----------
    date : datetime-like
        Any object providing ``strftime`` (e.g. ``pd.Timestamp``). Missing
        values (``NaT``, ``None``, ``NaN``) are accepted and treated as unmatched.
    season_dict : dict
        Maps a season label to a ``(start, end)`` pair of 'YYYY-MM-DD' strings.
        Both ends are inclusive.

    Returns
    -------
    str
        The first label (in dict iteration order) whose range contains the
        date, or 'Unknown Season' when no range matches or the date is missing.
    """
    # NaT has no usable strftime (it raises ValueError) and None has none at all;
    # report them as unmatched instead of aborting the whole .apply over the column.
    if pd.isna(date):
        return 'Unknown Season'

    # ISO 'YYYY-MM-DD' strings sort in chronological order, so plain string
    # comparison is enough. Format once rather than once per season.
    date_key = date.strftime('%Y-%m-%d')
    for season, (start_date, end_date) in season_dict.items():
        if start_date <= date_key <= end_date:
            return season
    return 'Unknown Season'  # Return if date doesn't fit any range

# Label every match with its season: determine_season is run over the Date column,
# with the seasons mapping forwarded as its second positional argument.
df['season'] = df['Date'].apply(determine_season, args=(seasons,))


In [9]:

# Define a function to calculate points
def calculate_points(ftr):
    """Translate a full-time result code into ``(home_points, away_points)``.

    'H' (home win) gives (3, 0), 'D' (draw) gives (1, 1) and 'A' (away win)
    gives (0, 3). Any other value yields ``None``.
    """
    outcome_table = (
        ('H', (3, 0)),  # Home wins
        ('D', (1, 1)),  # Draw
        ('A', (0, 3)),  # Away wins
    )
    for result_code, points in outcome_table:
        if ftr == result_code:
            return points
    return None

# Derive both points columns from the full-time result (FTR).
# The three result codes are resolved through calculate_points once each and then
# broadcast with Series.map. This avoids constructing a pd.Series for every row
# inside .apply, and avoids wrapping a None whenever the result is unrecognised.
# Any FTR value other than 'H'/'D'/'A' (including missing ones) becomes NaN in both columns.
_points_by_result = {code: calculate_points(code) for code in ('H', 'D', 'A')}
df['HomeTeamPoints'] = df['FTR'].map({code: pts[0] for code, pts in _points_by_result.items()})
df['AwayTeamPoints'] = df['FTR'].map({code: pts[1] for code, pts in _points_by_result.items()})



In [10]:
import pandas as pd

def _rank_season_points(matches, side):
    """Total one side's points per (season, team) and rank the teams within each season.

    ``side`` is 'Home' or 'Away' and selects the '<side>Team' and
    '<side>TeamPoints' columns of ``matches``. The returned frame has one row
    per (season, team) with 'Total<side>Points' and '<side>TeamStrength'.
    Strength 1 is the highest total of the season; tied teams share the lowest
    rank of their group (method='min').
    """
    team_col = f'{side}Team'
    points_col = f'{side}TeamPoints'
    total_col = f'Total{side}Points'
    totals = (
        matches.groupby(['season', team_col])[points_col]
        .sum()
        .reset_index()
        .rename(columns={points_col: total_col})
    )
    totals[f'{side}TeamStrength'] = totals.groupby('season')[total_col].rank(ascending=False, method='min')
    return totals


# Per-season point totals and ranks, computed separately for home and away matches.
# NOTE(review): totals cover every match of the season, including matches played
# after the row they are merged onto — confirm this is acceptable for the model.
# NOTE(review): ranking is grouped by season only; if the data spans several
# divisions (a 'Div' column is present), teams from different divisions are ranked
# together — confirm this is intended.
home_team_points = _rank_season_points(df, 'Home')
away_team_points = _rank_season_points(df, 'Away')

# Make the cell safe to re-run: without this, a second run would merge onto the
# existing columns and produce HomeTeamStrength_x / HomeTeamStrength_y duplicates.
df = df.drop(columns=['HomeTeamStrength', 'AwayTeamStrength'], errors='ignore')

# Merge the rankings back into the original DataFrame for HomeTeam and AwayTeam separately
df = df.merge(home_team_points[['season', 'HomeTeam', 'HomeTeamStrength']], on=['season', 'HomeTeam'], how='left')
df = df.merge(away_team_points[['season', 'AwayTeam', 'AwayTeamStrength']], on=['season', 'AwayTeam'], how='left')



In [12]:
import pandas as pd
# Columns read by this cell:
# 'season', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HST', 'AST', 'HR', 'AR', 'AvgH', 'AvgD', 'AvgA'

# Source column -> name of its per-season mean, for matches a team played at home.
_HOME_STAT_NAMES = {
    'FTHG': 'avgHG',   # Average Full Time Home Goals
    'HTHG': 'avgHHG',  # Average Half Time Home Goals
    'HST': 'avgHST',   # Average Home Shots on Target
    'HR': 'avgHR',     # Average Home Red Cards
    'AvgH': 'avgHomeWinOdds',
    'AvgD': 'avgHomeDrawOdds',
}

# Source column -> name of its per-season mean, for matches a team played away.
_AWAY_STAT_NAMES = {
    'FTAG': 'avgAG',   # Average Full Time Away Goals
    'HTAG': 'avgHAG',  # Average Half Time Away Goals
    'AST': 'avgAST',   # Average Away Shots on Target
    'AR': 'avgAR',     # Average Away Red Cards
    'AvgA': 'avgAwayWinOdds',
    'AvgD': 'avgAwayDrawOdds',
}


def _season_team_means(matches, team_col, stat_names):
    """Average the ``stat_names`` keys per (season, ``team_col``) and rename them to the dict values.

    Returns one row per (season, team) with 'season', ``team_col`` and the
    renamed mean columns, in the order given by ``stat_names``.
    """
    means = (
        matches.groupby(['season', team_col])
        .agg({column: 'mean' for column in stat_names})
        .reset_index()
    )
    return means.rename(columns=stat_names)


# Calculate averages for HomeTeam / AwayTeam statistics per season
home_team_avg = _season_team_means(df, 'HomeTeam', _HOME_STAT_NAMES)
away_team_avg = _season_team_means(df, 'AwayTeam', _AWAY_STAT_NAMES)

# Make the cell safe to re-run: without this, a second run would merge onto the
# existing average columns and produce avgHG_x / avgHG_y style duplicates.
df = df.drop(columns=[*_HOME_STAT_NAMES.values(), *_AWAY_STAT_NAMES.values()], errors='ignore')

# Merge these averages back into the original DataFrame
# Merge home team averages
df = df.merge(home_team_avg, on=['season', 'HomeTeam'], how='left')

# Merge away team averages
df = df.merge(away_team_avg, on=['season', 'AwayTeam'], how='left')



In [13]:
# Write the enriched match table to disk; index=False keeps the row index out of the file.
df.to_csv('ML_data_final.csv', index=False)

In [14]:
# Preview the first 50 rows of the enriched frame.
df.head(50)

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,avgHST,avgHR,avgHomeWinOdds,avgHomeDrawOdds,avgAG,avgHAG,avgAST,avgAR,avgAwayWinOdds,avgAwayDrawOdds
0,B1,2024-09-01,18:15,Kortrijk,St Truiden,1.0,1.0,D,0.0,1.0,...,5.666667,0.0,3.24,3.643333,0.666667,0.333333,3.333333,0.0,5.553333,4.216667
1,B1,2024-09-01,17:30,St. Gilloise,Anderlecht,0.0,0.0,D,0.0,0.0,...,4.666667,0.0,1.73,4.2,1.666667,1.0,5.666667,0.0,2.62,3.576667
2,B1,2024-09-01,15:00,Gent,Antwerp,1.0,1.0,D,1.0,1.0,...,6.333333,0.0,1.903333,4.263333,0.666667,0.666667,5.666667,0.333333,2.986667,3.693333
3,B1,2024-09-01,12:30,Club Brugge,Cercle Brugge,3.0,0.0,H,2.0,0.0,...,7.333333,0.0,1.556667,4.436667,0.333333,0.333333,4.0,0.0,3.41,3.806667
4,B1,2024-08-31,19:45,Oud-Heverlee Leuven,Standard,2.0,0.0,H,1.0,0.0,...,5.0,0.333333,2.686667,3.456667,0.0,0.0,0.333333,0.333333,4.283333,3.73
5,B1,2024-08-31,17:15,Beerschot VA,Dender,1.0,2.0,A,0.0,2.0,...,3.666667,0.0,3.44,3.606667,2.333333,1.333333,5.333333,0.0,4.47,4.013333
6,B1,2024-08-31,15:00,Mechelen,Charleroi,5.0,2.0,H,3.0,2.0,...,6.666667,0.0,2.62,3.663333,2.0,2.0,4.666667,0.0,3.18,3.543333
7,B1,2024-08-30,19:45,Genk,Westerlo,1.0,0.0,H,0.0,0.0,...,5.0,0.0,2.17,4.13,1.666667,1.0,5.666667,0.0,3.89,4.063333
8,B1,2024-08-25,18:15,Charleroi,Kortrijk,1.0,0.0,H,1.0,0.0,...,4.333333,0.0,2.573333,3.623333,1.0,0.666667,3.0,0.333333,4.206667,3.773333
9,B1,2024-08-25,17:30,Standard,Beerschot VA,1.0,0.0,H,0.0,0.0,...,3.0,0.0,3.223333,3.87,0.666667,0.333333,2.333333,0.333333,5.913333,4.443333
