In [18]:
import time
from datetime import datetime
import re
import glob
import numpy as np
import pandas as pd

import function_library as fl

In [19]:
features = [
    # General
    'round',
    'home_team_place_total',
    'home_team_place_home',
    'away_team_place_total',
    'away_team_place_away',
    'home_odds',
    'draw_odds',
    'away_odds',
    'over_25_odds',
    'under_25_odds',
    'elo_home',
    'elo_away',
    'form_home',
    'form_away',

    # Home team overall features
    'home_Overall_RollingGoalsScored_Mean',
    'home_Overall_RollingGoalsConceded_Mean',
    'home_Overall_RollingGoalsScored_Std',
    'home_Overall_RollingGoalsConceded_Std',
    'home_Overall_RollingGoalsScored_Mean_Short',
    'home_Overall_Momentum_GoalsScored',
    'home_Overall_Trend_Slope_GoalsScored',
    'home_Overall_RollingFirstHalfGoalsScored_Mean',
    'home_Overall_RollingFirstHalfGoalsConceded_Mean',
    'home_Overall_RollingFirstHalfGoalsScored_Std',
    'home_Overall_RollingFirstHalfGoalsConceded_Std',
    'home_Overall_RollingFirstHalfGoalsScored_Mean_Short',
    'home_Overall_Momentum_FirstHalfGoalsScored',
    'home_Overall_Trend_Slope_FirstHalfGoalsScored',

    # Home team specific features (home-only)
    'home_Home_RollingGoalsScored_Mean',
    'home_Home_RollingGoalsConceded_Mean',
    'home_Home_RollingGoalsScored_Std',
    'home_Home_RollingGoalsConceded_Std',
    'home_Home_RollingGoalsScored_Mean_Short',
    'home_Home_Momentum_GoalsScored',
    'home_Home_Trend_Slope_GoalsScored',
    'home_Home_RollingFirstHalfGoalsScored_Mean',
    'home_Home_RollingFirstHalfGoalsConceded_Mean',
    'home_Home_RollingFirstHalfGoalsScored_Std',
    'home_Home_RollingFirstHalfGoalsConceded_Std',
    'home_Home_RollingFirstHalfGoalsScored_Mean_Short',
    'home_Home_Momentum_FirstHalfGoalsScored',
    'home_Home_Trend_Slope_FirstHalfGoalsScored',

    # Away team overall features
    'away_Overall_RollingGoalsScored_Mean',
    'away_Overall_RollingGoalsConceded_Mean',
    'away_Overall_RollingGoalsScored_Std',
    'away_Overall_RollingGoalsConceded_Std',
    'away_Overall_RollingGoalsScored_Mean_Short',
    'away_Overall_Momentum_GoalsScored',
    'away_Overall_Trend_Slope_GoalsScored',
    'away_Overall_RollingFirstHalfGoalsScored_Mean',
    'away_Overall_RollingFirstHalfGoalsConceded_Mean',
    'away_Overall_RollingFirstHalfGoalsScored_Std',
    'away_Overall_RollingFirstHalfGoalsConceded_Std',
    'away_Overall_RollingFirstHalfGoalsScored_Mean_Short',
    'away_Overall_Momentum_FirstHalfGoalsScored',
    'away_Overall_Trend_Slope_FirstHalfGoalsScored',

    # Away team specific features (away-only)
    'away_Away_RollingGoalsScored_Mean',
    'away_Away_RollingGoalsConceded_Mean',
    'away_Away_RollingGoalsScored_Std',
    'away_Away_RollingGoalsConceded_Std',
    'away_Away_RollingGoalsScored_Mean_Short',
    'away_Away_Momentum_GoalsScored',
    'away_Away_Trend_Slope_GoalsScored',
    'away_Away_RollingFirstHalfGoalsScored_Mean',
    'away_Away_RollingFirstHalfGoalsConceded_Mean',
    'away_Away_RollingFirstHalfGoalsScored_Std',
    'away_Away_RollingFirstHalfGoalsConceded_Std',
    'away_Away_RollingFirstHalfGoalsScored_Mean_Short',
    'away_Away_Momentum_FirstHalfGoalsScored',
    'away_Away_Trend_Slope_FirstHalfGoalsScored',

    # ----- Additional Goal Threshold Percentages (Per-Match Metrics) -----
    # For thresholds: 1.5, 2.5, 3.5
    # Overall team (season cumulative and rolling last 5 matches)
    'home_Overall_Percent_Over_1.5',
    'home_Overall_Rolling5_Percent_Over_1.5',
    'home_Overall_Percent_Over_2.5',
    'home_Overall_Rolling5_Percent_Over_2.5',
    'home_Overall_Percent_Over_3.5',
    'home_Overall_Rolling5_Percent_Over_3.5',

    'away_Overall_Percent_Over_1.5',
    'away_Overall_Rolling5_Percent_Over_1.5',
    'away_Overall_Percent_Over_2.5',
    'away_Overall_Rolling5_Percent_Over_2.5',
    'away_Overall_Percent_Over_3.5',
    'away_Overall_Rolling5_Percent_Over_3.5',

    # Home matches only
    'home_Home_Percent_Over_1.5',
    'home_Home_Rolling5_Percent_Over_1.5',
    'home_Home_Percent_Over_2.5',
    'home_Home_Rolling5_Percent_Over_2.5',
    'home_Home_Percent_Over_3.5',
    'home_Home_Rolling5_Percent_Over_3.5',

    # Away matches only
    'away_Away_Percent_Over_1.5',
    'away_Away_Rolling5_Percent_Over_1.5',
    'away_Away_Percent_Over_2.5',
    'away_Away_Rolling5_Percent_Over_2.5',
    'away_Away_Percent_Over_3.5',
    'away_Away_Rolling5_Percent_Over_3.5',

    # Home matches only
    'home_Home_TeamPct_Over_0.5',
    'home_Home_TeamPct_Over_1.5',
    'home_Home_TeamPct_Over_2.5',
    'home_Home_TeamPct_Over_3.5',

    # Away matches only
    'away_Away_TeamPct_Over_0.5',
    'away_Away_TeamPct_Over_1.5',
    'away_Away_TeamPct_Over_2.5',
    'away_Away_TeamPct_Over_3.5'
]

def read_csv_with_encodings(file_path, encodings=['utf-8', 'latin1', 'iso-8859-1', 'cp1252']):
    """Attempt to read a CSV file with a list of possible encodings."""
    for encoding in encodings:
        try:
            data = pd.read_csv(file_path, encoding=encoding, low_memory=False)
            print(f"Successfully read the file with encoding: {encoding}")
            return data
        except UnicodeDecodeError:
            print(f"Failed to decode with encoding: {encoding}")
    raise ValueError("Unable to read CSV file with provided encodings.")


# Rename and filter columns
def rename_and_filter_data(data, column_dict):
    """Rename the columns of the DataFrame and filter to only keep the specified ones."""
    data = data.rename(columns=column_dict).filter(items=column_dict.values())
    return data

def convert_date_and_filter(data):
    """Convert the date column to datetime, sort the data and filter out future dates."""
    data['date'] = pd.to_datetime(data['date'], format="%Y-%m-%d", errors='coerce')
    data = data.sort_values(by='date')
    today = pd.Timestamp(datetime.today().date())
    data = data[data['date'] <= today]
    return data

def compute_match_points(data):
    """Assign points to home and away teams based on match results."""
    data["points_home"] = data.apply(
        lambda row: 3 if row["home_goals_ft"] > row["away_goals_ft"]
        else (1 if row["home_goals_ft"] == row["away_goals_ft"] else 0),
        axis=1,
    )
    data["points_away"] = data.apply(
        lambda row: 3 if row["away_goals_ft"] > row["home_goals_ft"]
        else (1 if row["away_goals_ft"] == row["home_goals_ft"] else 0),
        axis=1,
    )
    return data

def create_team_dataframes(data):
    """Create separate DataFrames for home and away matches, then combine them."""
    # Home team DataFrame
    home_df = data[['country', 'season', 'date', 'home_team', 'away_team',
                    'home_goals_ft', 'away_goals_ft', 'home_goals_ht', 'away_goals_ht']].copy()
    home_df.rename(
        columns={
            'home_team': 'Team',
            'away_team': 'Opponent',
            'home_goals_ft': 'GoalsScored',
            'away_goals_ft': 'GoalsConceded',
            'home_goals_ht': 'FirstHalfGoalsScored',
            'away_goals_ht': 'FirstHalfGoalsConceded'
        },
        inplace=True,
    )
    home_df['is_home'] = 1

    # Away team DataFrame
    away_df = data[['country', 'season', 'date', 'away_team', 'home_team',
                    'away_goals_ft', 'home_goals_ft', 'away_goals_ht', 'home_goals_ht']].copy()
    away_df.rename(
        columns={
            'away_team': 'Team',
            'home_team': 'Opponent',
            'away_goals_ft': 'GoalsScored',
            'home_goals_ft': 'GoalsConceded',
            'away_goals_ht': 'FirstHalfGoalsScored',
            'home_goals_ht': 'FirstHalfGoalsConceded'
        },
        inplace=True,
    )
    away_df['is_home'] = 0

    # Combine both
    team_df = pd.concat([home_df, away_df], ignore_index=True)
    team_df.sort_values(by=['country', 'season', 'Team', 'date'], inplace=True)
    return team_df

def compute_rolling(df_sub, prefix, window_long=5, window_short=3):
    """
    Compute rolling metrics for goals and first-half goals.
    Returns only the computed columns with the specified prefix.
    """
    df = df_sub.copy()

    # Full-Time Goals Rolling Features
    df[prefix + '_RollingGoalsScored_Mean'] = df['GoalsScored'].rolling(window=window_long, min_periods=1).mean().shift(1)
    df[prefix + '_RollingGoalsConceded_Mean'] = df['GoalsConceded'].rolling(window=window_long, min_periods=1).mean().shift(1)
    df[prefix + '_RollingGoalsScored_Std'] = df['GoalsScored'].rolling(window=window_long, min_periods=1).std().shift(1)
    df[prefix + '_RollingGoalsConceded_Std'] = df['GoalsConceded'].rolling(window=window_long, min_periods=1).std().shift(1)
    df[prefix + '_RollingGoalsScored_Mean_Short'] = df['GoalsScored'].rolling(window=window_short, min_periods=1).mean().shift(1)
    df[prefix + '_Momentum_GoalsScored'] = df[prefix + '_RollingGoalsScored_Mean_Short'] - df[prefix + '_RollingGoalsScored_Mean']

    # First-Half Goals Rolling Features
    df[prefix + '_RollingFirstHalfGoalsScored_Mean'] = df['FirstHalfGoalsScored'].rolling(window=window_long, min_periods=1).mean().shift(1)
    df[prefix + '_RollingFirstHalfGoalsConceded_Mean'] = df['FirstHalfGoalsConceded'].rolling(window=window_long, min_periods=1).mean().shift(1)
    df[prefix + '_RollingFirstHalfGoalsScored_Std'] = df['FirstHalfGoalsScored'].rolling(window=window_long, min_periods=1).std().shift(1)
    df[prefix + '_RollingFirstHalfGoalsConceded_Std'] = df['FirstHalfGoalsConceded'].rolling(window=window_long, min_periods=1).std().shift(1)
    df[prefix + '_RollingFirstHalfGoalsScored_Mean_Short'] = df['FirstHalfGoalsScored'].rolling(window=window_short, min_periods=1).mean().shift(1)
    df[prefix + '_Momentum_FirstHalfGoalsScored'] = df[prefix + '_RollingFirstHalfGoalsScored_Mean_Short'] - df[prefix + '_RollingFirstHalfGoalsScored_Mean']

    # Compute trend slopes using a simple linear regression
    def compute_slope(x):
        if len(x) < 2:
            return np.nan
        xs = np.arange(len(x))
        return np.polyfit(xs, x, 1)[0]

    df[prefix + '_Trend_Slope_GoalsScored'] = df['GoalsScored'].rolling(window=window_long, min_periods=2).apply(compute_slope, raw=True).shift(1)
    df[prefix + '_Trend_Slope_FirstHalfGoalsScored'] = df['FirstHalfGoalsScored'].rolling(window=window_long, min_periods=2).apply(compute_slope, raw=True).shift(1)

    computed_cols = [
        prefix + '_RollingGoalsScored_Mean',
        prefix + '_RollingGoalsConceded_Mean',
        prefix + '_RollingGoalsScored_Std',
        prefix + '_RollingGoalsConceded_Std',
        prefix + '_RollingGoalsScored_Mean_Short',
        prefix + '_Momentum_GoalsScored',
        prefix + '_Trend_Slope_GoalsScored',
        prefix + '_RollingFirstHalfGoalsScored_Mean',
        prefix + '_RollingFirstHalfGoalsConceded_Mean',
        prefix + '_RollingFirstHalfGoalsScored_Std',
        prefix + '_RollingFirstHalfGoalsConceded_Std',
        prefix + '_RollingFirstHalfGoalsScored_Mean_Short',
        prefix + '_Momentum_FirstHalfGoalsScored',
        prefix + '_Trend_Slope_FirstHalfGoalsScored'
    ]
    return df[computed_cols]

def add_rolling_features_split(group, window_long=5, window_short=3):
    """
    Compute rolling features for a team group.
    Applies overall, home-only, and away-only calculations.
    """
    group = group.sort_values(by='date').reset_index(drop=True)
    overall_features = compute_rolling(group, 'Overall', window_long, window_short)
    group = pd.concat([group, overall_features], axis=1)

    home_mask = group['is_home'] == 1
    if home_mask.sum() > 0:
        group_home = group.loc[home_mask].copy()
        home_features = compute_rolling(group_home, 'Home', window_long, window_short)
        for col in home_features.columns:
            group.loc[home_mask, col] = home_features[col].values

    away_mask = group['is_home'] == 0
    if away_mask.sum() > 0:
        group_away = group.loc[away_mask].copy()
        away_features = compute_rolling(group_away, 'Away', window_long, window_short)
        for col in away_features.columns:
            group.loc[away_mask, col] = away_features[col].values

    # Additional Goal Threshold Percentages
    for threshold in [1.5, 2.5, 3.5]:
        overall_season_col = f'Overall_Percent_Over_{threshold}'
        overall_rolling_col = f'Overall_Rolling5_Percent_Over_{threshold}'
        indicator_overall = group['GoalsScored'].gt(threshold)
        group[overall_season_col] = indicator_overall.shift(1).expanding(min_periods=1).mean()
        group[overall_rolling_col] = indicator_overall.shift(1).rolling(window=5, min_periods=1).mean()

        season_col_home = f'Home_Percent_Over_{threshold}'
        rolling_col_home = f'Home_Rolling5_Percent_Over_{threshold}'
        if home_mask.sum() > 0:
            indicator_home = group.loc[home_mask, 'GoalsScored'].gt(threshold)
            group.loc[home_mask, season_col_home] = indicator_home.shift(1).expanding(min_periods=1).mean()
            group.loc[home_mask, rolling_col_home] = indicator_home.shift(1).rolling(window=5, min_periods=1).mean()

        season_col_away = f'Away_Percent_Over_{threshold}'
        rolling_col_away = f'Away_Rolling5_Percent_Over_{threshold}'
        if away_mask.sum() > 0:
            indicator_away = group.loc[away_mask, 'GoalsScored'].gt(threshold)
            group.loc[away_mask, season_col_away] = indicator_away.shift(1).expanding(min_periods=1).mean()
            group.loc[away_mask, rolling_col_away] = indicator_away.shift(1).rolling(window=5, min_periods=1).mean()

    for threshold in [0.5, 1.5, 2.5, 3.5]:
        overall_col = f'TeamPct_Over_{threshold}'
        indicator_overall = group['GoalsScored'].gt(threshold)
        group[overall_col] = indicator_overall.shift(1).expanding(min_periods=1).mean()

        home_col = f'Home_TeamPct_Over_{threshold}'
        if home_mask.sum() > 0:
            indicator_home = group.loc[home_mask, 'GoalsScored'].gt(threshold)
            group.loc[home_mask, home_col] = indicator_home.shift(1).expanding(min_periods=1).mean()

        away_col = f'Away_TeamPct_Over_{threshold}'
        if away_mask.sum() > 0:
            indicator_away = group.loc[away_mask, 'GoalsScored'].gt(threshold)
            group.loc[away_mask, away_col] = indicator_away.shift(1).expanding(min_periods=1).mean()
    return group

def compute_team_features(team_df):
    """Apply rolling feature engineering on a team-level DataFrame group-wise."""
    team_df = team_df.groupby(['country', 'season', 'Team'], group_keys=False)\
                     .apply(lambda group: add_rolling_features_split(group))\
                     .reset_index(drop=True)
    return team_df

def process_team_side_features(team_df, side='home'):
    """
    Process side-specific features (home or away) from the team-level DataFrame.
    Returns a DataFrame of features keyed by team and match.
    """
    if side == 'home':
        subset = team_df[team_df['is_home'] == 1].copy()
        subset = subset.drop(columns=['Opponent'])
        subset.rename(columns={'Team': 'home_team'}, inplace=True)
        key_cols = ['country', 'season', 'date', 'home_team', 'is_home']
        feature_cols = [col for col in subset.columns
                        if col not in key_cols and (col.startswith("Overall_") or col.startswith("Home_"))]
        features = subset[key_cols + feature_cols].copy()
        features.rename(columns={col: "home_" + col for col in feature_cols}, inplace=True)
    else:
        subset = team_df[team_df['is_home'] == 0].copy()
        subset = subset.drop(columns=['Opponent'])
        subset.rename(columns={'Team': 'away_team'}, inplace=True)
        key_cols = ['country', 'season', 'date', 'away_team', 'is_home']
        feature_cols = [col for col in subset.columns
                        if col not in key_cols and (col.startswith("Overall_") or col.startswith("Away_"))]
        features = subset[key_cols + feature_cols].copy()
        features.rename(columns={col: "away_" + col for col in feature_cols}, inplace=True)
    return features

def merge_match_features(match_df, home_features, away_features):
    """Merge home and away features back into the match-level DataFrame and finalise it."""
    match_df = match_df.merge(home_features, on=['country', 'season', 'date', 'home_team'], how='left')
    match_df = match_df.merge(away_features, on=['country', 'season', 'date', 'away_team'], how='left')
    match_df.dropna(inplace=True)
    match_df['total_goals'] = match_df['home_goals_ft'] + match_df['away_goals_ft']
    match_df['target'] = match_df['total_goals'].apply(lambda x: 1 if x > 2.5 else 0)
    return match_df

# def prepare_data(file_path):
#     """Main function to prepare data by chaining the helper functions."""
#     column_dict = {
#         "country": "country",
#         "league": "league",
#         "sezonul": "season",
#         "datameci": "date",
#         "orameci": "ko_time",
#         "etapa": "round",
#         "txtechipa1": "home_team",
#         "txtechipa2": "away_team",
#         "scor1": "home_goals_ft",
#         "scor2": "away_goals_ft",
#         "scorp1": "home_goals_ht",
#         "scorp2": "away_goals_ht",
#         "place1": "home_team_place_total",
#         "place1a": "home_team_place_home",
#         "place2": "away_team_place_total",
#         "place2d": "away_team_place_away",
#         "cotaa": "home_odds",
#         "cotae": "draw_odds",
#         "cotad": "away_odds",
#         "cotao": "over_25_odds",
#         "cotau": "under_25_odds",
#         "elohomeo": "elo_home",
#         "eloawayo": "elo_away",
#         "formah": "form_home",
#         "formaa": "form_away",
#         "suth": "shots_home",
#         "suth1": "shots_home_1h",
#         "suth2": "shots_home_2h",
#         "suta": "shots_away",
#         "suta1": "shots_away_1h",
#         "suta2": "shots_away_2h",
#         "sutht": "shots_on_target_home",
#         "sutht1": "shots_on_target_home_1h",
#         "sutht2": "shots_on_target_home_2h",
#         "sutat": "shots_on_target_away",
#         "sutat1": "shots_on_target_away_1h",
#         "sutat2": "shots_on_target_away_2h",
#         "corh": "corners_home",
#         "corh1": "corners_home_1h",
#         "corh2": "corners_home_2h",
#         "cora": "corners_away",
#         "cora1": "corners_away_1h",
#         "cora2": "corners_away_2h",
#         "foulsh": "fouls_home",
#         "foulsh1": "fouls_home_1h",
#         "foulsh2": "fouls_home_2h",
#         "foulsa": "fouls_away",
#         "foulsa1": "fouls_away_1h",
#         "foulsa2": "fouls_away_2h",
#         "yellowh": "yellow_cards_home",
#         "yellowh1": "yellow_cards_home_1h",
#         "yellowh2": "yellow_cards_home_2h",
#         "yellowa": "yellow_cards_away",
#         "yellowa1": "yellow_cards_away_1h",
#         "yellowa2": "yellow_cards_away_2h",
#         "ballph": "possession_home",
#         "ballph1": "possession_home_1h",
#         "ballph2": "possession_home_2h",
#         "ballpa": "possession_away",
#         "ballpa1": "possession_away_1h",
#         "ballpa2": "possession_away_2h",
#         "gsh": "goals_scored_total_home",
#         "gch": "goals_conceded_total_home",
#         "gsa": "goals_scored_total_away",
#         "gca": "goals_conceded_total_away",
#     }
#
#     # Step 1: Read, rename and filter the raw data
#     data = read_csv_with_encodings(file_path)
#     data = rename_and_filter_data(data, column_dict)
#     data = convert_date_and_filter(data)
#     data = compute_match_points(data)
#
#     # Step 2: Create team-level DataFrames
#     team_df = create_team_dataframes(data)
#     team_df = compute_team_features(team_df)
#
#     # Step 3: Process side-specific features
#     home_features = process_team_side_features(team_df, side='home')
#     away_features = process_team_side_features(team_df, side='away')
#
#     # Step 4: Merge features back into match-level data and finalise
#     match_df = merge_match_features(data, home_features, away_features)
#     return match_df

In [16]:
#data = prepare_data(r"C:\Users\leere\PycharmProjects\Football_ML3\engineered_master_data_ALL_2014.csv")

Successfully read the file with encoding: utf-8


  .apply(lambda group: add_rolling_features_split(group))\


In [22]:
file_path = r"C:\Users\leere\PycharmProjects\Football_ML3\engineered_master_data_ALL_2014.csv"
"""Main function to prepare data by chaining the helper functions."""
column_dict = {
    "country": "country",
    "league": "league",
    "sezonul": "season",
    "datameci": "date",
    "orameci": "ko_time",
    "etapa": "round",
    "txtechipa1": "home_team",
    "txtechipa2": "away_team",
    "scor1": "home_goals_ft",
    "scor2": "away_goals_ft",
    "scorp1": "home_goals_ht",
    "scorp2": "away_goals_ht",
    "place1": "home_team_place_total",
    "place1a": "home_team_place_home",
    "place2": "away_team_place_total",
    "place2d": "away_team_place_away",
    "cotaa": "home_odds",
    "cotae": "draw_odds",
    "cotad": "away_odds",
    "cotao": "over_25_odds",
    "cotau": "under_25_odds",
    "elohomeo": "elo_home",
    "eloawayo": "elo_away",
    "formah": "form_home",
    "formaa": "form_away",
    "suth": "shots_home",
    "suth1": "shots_home_1h",
    "suth2": "shots_home_2h",
    "suta": "shots_away",
    "suta1": "shots_away_1h",
    "suta2": "shots_away_2h",
    "sutht": "shots_on_target_home",
    "sutht1": "shots_on_target_home_1h",
    "sutht2": "shots_on_target_home_2h",
    "sutat": "shots_on_target_away",
    "sutat1": "shots_on_target_away_1h",
    "sutat2": "shots_on_target_away_2h",
    "corh": "corners_home",
    "corh1": "corners_home_1h",
    "corh2": "corners_home_2h",
    "cora": "corners_away",
    "cora1": "corners_away_1h",
    "cora2": "corners_away_2h",
    "foulsh": "fouls_home",
    "foulsh1": "fouls_home_1h",
    "foulsh2": "fouls_home_2h",
    "foulsa": "fouls_away",
    "foulsa1": "fouls_away_1h",
    "foulsa2": "fouls_away_2h",
    "yellowh": "yellow_cards_home",
    "yellowh1": "yellow_cards_home_1h",
    "yellowh2": "yellow_cards_home_2h",
    "yellowa": "yellow_cards_away",
    "yellowa1": "yellow_cards_away_1h",
    "yellowa2": "yellow_cards_away_2h",
    "ballph": "possession_home",
    "ballph1": "possession_home_1h",
    "ballph2": "possession_home_2h",
    "ballpa": "possession_away",
    "ballpa1": "possession_away_1h",
    "ballpa2": "possession_away_2h",
    "gsh": "goals_scored_total_home",
    "gch": "goals_conceded_total_home",
    "gsa": "goals_scored_total_away",
    "gca": "goals_conceded_total_away",
}

# Step 1: Read, rename and filter the raw data
data = read_csv_with_encodings(file_path)
data = rename_and_filter_data(data, column_dict)
data = convert_date_and_filter(data)
data = compute_match_points(data)
data

Successfully read the file with encoding: utf-8


Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,possession_home_2h,possession_away,possession_away_1h,possession_away_2h,goals_scored_total_home,goals_conceded_total_home,goals_scored_total_away,goals_conceded_total_away,points_home,points_away
0,Aus2,15,2014-08-08,1730.0,5,Lustenau,Hartberg,5,0,2,...,0,0,0,0,3,5,0,10,3,0
1,Aus2,15,2014-08-08,1930.0,5,Innsbruck,FAC Wien,2,1,0,...,0,0,0,0,1,5,5,4,3,0
2,Aus2,15,2014-08-08,1730.0,5,KSV 1919,Horn,0,0,0,...,0,0,0,0,5,6,4,5,1,1
3,Aus2,15,2014-08-08,1730.0,5,LASK,Mattersburg,1,0,0,...,0,0,0,0,4,2,10,0,3,0
4,Slk1,15,2014-08-09,1830.0,5,Zlate Moravce,Spartak Myjava,1,0,1,...,0,0,0,0,4,5,3,4,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56750,Ita1,20,2019-10-30,2000.0,10,Udinese,Roma,0,4,0,...,0,49,0,0,5,13,14,11,0,3
56751,Ita1,20,2019-10-30,2000.0,10,Sassuolo,Fiorentina,1,2,1,...,0,47,0,0,15,16,13,12,0,3
56752,Ita1,20,2019-10-30,2000.0,10,Sampdoria,Lecce,1,1,0,...,0,48,0,0,5,18,10,18,1,1
56753,Ita1,20,2019-10-30,2000.0,10,Juventus,Genoa,2,1,1,...,0,51,0,0,16,8,12,21,3,0


In [29]:
data_filtered = data[(data['country']=='Hun1')]
data_filtered

Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,possession_home_2h,possession_away,possession_away_1h,possession_away_2h,goals_scored_total_home,goals_conceded_total_home,goals_scored_total_away,goals_conceded_total_away,points_home,points_away
38,Hun1,15,2014-08-22,1800.0,5,Honved,Puskas Akademia,1,0,0,...,0,0,0,0,4,6,4,7,3,0
84,Hun1,15,2014-08-23,1930.0,5,Lombard Papa,Gyori ETO,0,0,0,...,0,0,0,0,5,6,5,7,1,1
85,Hun1,15,2014-08-23,1730.0,5,Vidi,MTK Budapest,5,0,3,...,0,0,0,0,11,2,7,3,3,0
86,Hun1,15,2014-08-23,1730.0,5,Kecskemeti,Dunaujvaros,0,0,0,...,0,0,0,0,4,9,1,7,1,1
87,Hun1,15,2014-08-23,1530.0,5,Paksi,Ujpest,0,0,0,...,0,0,0,0,9,1,4,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56473,Hun1,20,2019-10-26,1600.0,10,Honved,Kaposvari,2,0,2,...,0,46,0,0,11,15,7,17,3,0
56458,Hun1,20,2019-10-26,1330.0,10,Mezokovesdi,Debreceni,3,1,1,...,0,56,0,0,12,7,16,14,3,0
56457,Hun1,20,2019-10-26,1600.0,10,Ujpest,Zalaegerszeg,0,0,0,...,0,39,0,0,12,15,14,13,1,1
56474,Hun1,20,2019-10-26,1600.0,10,Paksi,Diosgyori,1,2,0,...,0,51,0,0,11,17,7,17,0,3


In [21]:
# Step 2: Create team-level DataFrames
team_df = create_team_dataframes(data)
team_df = compute_team_features(team_df)
#team_df


  .apply(lambda group: add_rolling_features_split(group))\


Unnamed: 0,country,season,date,Team,Opponent,GoalsScored,GoalsConceded,FirstHalfGoalsScored,FirstHalfGoalsConceded,is_home,...,Away_TeamPct_Over_0.5,TeamPct_Over_1.5,Home_TeamPct_Over_1.5,Away_TeamPct_Over_1.5,TeamPct_Over_2.5,Home_TeamPct_Over_2.5,Away_TeamPct_Over_2.5,TeamPct_Over_3.5,Home_TeamPct_Over_3.5,Away_TeamPct_Over_3.5
0,Aus1,15,2014-08-16,Admira Wacker,FC Salzburg,0,3,0,1,1,...,,,,,,,,,,
1,Aus1,15,2014-08-23,Admira Wacker,Wiener Neustadt,4,5,1,1,0,...,,0.000000,,,0.000000,,,0.000000,,
2,Aus1,15,2014-08-30,Admira Wacker,Austria Vienna,2,1,2,0,1,...,,0.500000,0.0,,0.500000,0.0,,0.500000,0.0,
3,Aus1,15,2014-09-13,Admira Wacker,Graz,2,0,1,0,0,...,1.0,0.666667,,1.0,0.333333,,1.0,0.333333,,1.0
4,Aus1,15,2014-09-20,Admira Wacker,Grodig,0,0,0,0,1,...,,0.750000,0.5,,0.250000,0.0,,0.250000,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113507,Tur1,20,2019-09-23,Trabzonspor,Sivasspor,1,2,1,1,0,...,,,,,,,,,,
113508,Tur1,20,2019-09-29,Trabzonspor,Besiktas,4,1,2,0,1,...,,0.000000,,,0.000000,,,0.000000,,
113509,Tur1,20,2019-10-06,Trabzonspor,Rizespor,2,1,0,1,0,...,1.0,0.500000,,0.0,0.500000,,0.0,0.500000,,0.0
113510,Tur1,20,2019-10-19,Trabzonspor,Gaziantep,4,1,2,0,1,...,,0.666667,1.0,,0.333333,1.0,,0.333333,1.0,


In [26]:
team_df_filters = team_df[(team_df['country']=="Hun1")]
team_df_filters

Unnamed: 0,country,season,date,Team,Opponent,GoalsScored,GoalsConceded,FirstHalfGoalsScored,FirstHalfGoalsConceded,is_home,...,Away_TeamPct_Over_0.5,TeamPct_Over_1.5,Home_TeamPct_Over_1.5,Away_TeamPct_Over_1.5,TeamPct_Over_2.5,Home_TeamPct_Over_2.5,Away_TeamPct_Over_2.5,TeamPct_Over_3.5,Home_TeamPct_Over_3.5,Away_TeamPct_Over_3.5
55906,Hun1,15,2014-08-31,Debreceni,Diosgyori,1,1,1,0,0,...,,,,,,,,,,
55907,Hun1,15,2014-09-13,Debreceni,Pecs,0,2,0,0,1,...,,0.000000,,,0.000000,,,0.0,,
55908,Hun1,15,2014-09-21,Debreceni,Kecskemeti,0,0,0,0,1,...,,0.000000,0.0,,0.000000,0.0,,0.0,0.0,
55909,Hun1,15,2014-09-27,Debreceni,Paksi,1,2,1,1,0,...,1.0,0.000000,,0.0,0.000000,,0.0,0.0,,0.0
55910,Hun1,15,2014-10-04,Debreceni,Honved,4,0,2,0,1,...,,0.000000,0.0,,0.000000,0.0,,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57857,Hun1,20,2019-09-14,Zalaegerszeg,Diosgyori,0,1,0,1,0,...,,,,,,,,,,
57858,Hun1,20,2019-09-28,Zalaegerszeg,Paksi,3,1,2,1,1,...,,0.000000,,,0.000000,,,0.0,,
57859,Hun1,20,2019-10-05,Zalaegerszeg,Kisvarda Master Good,3,3,2,0,0,...,0.0,0.500000,,0.0,0.500000,,0.0,0.0,,0.0
57860,Hun1,20,2019-10-19,Zalaegerszeg,Honved,0,1,0,0,1,...,,0.666667,1.0,,0.666667,1.0,,0.0,0.0,


In [None]:
# Step 3: Process side-specific features
home_features = process_team_side_features(team_df, side='home')
away_features = process_team_side_features(team_df, side='away')

# Step 4: Merge features back into match-level data and finalise
match_df = merge_match_features(data, home_features, away_features)

In [17]:
matches_filtered = data[(data['country'] == "Hun1")]
matches_filtered

Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,away_Overall_Percent_Over_3.5,away_Overall_Rolling5_Percent_Over_3.5,away_Away_Percent_Over_3.5,away_Away_Rolling5_Percent_Over_3.5,away_Away_TeamPct_Over_0.5,away_Away_TeamPct_Over_1.5,away_Away_TeamPct_Over_2.5,away_Away_TeamPct_Over_3.5,total_goals,target
941,Hun1,15,2014-09-27,1930.0,9,Lombard Papa,Ujpest,0,0,0,...,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0,0
1103,Hun1,15,2014-10-03,1800.0,10,Puskas Akademia,Gyori ETO,2,1,1,...,0.250000,0.25,0.000000,0.0,0.500000,0.500000,0.500000,0.000000,3,1
1217,Hun1,15,2014-10-04,1500.0,10,Debreceni,Honved,4,0,2,...,0.000000,0.00,0.000000,0.0,0.500000,0.000000,0.000000,0.000000,4,1
1218,Hun1,15,2014-10-04,1900.0,10,Ujpest,Dunaujvaros,3,0,1,...,0.000000,0.00,0.000000,0.0,0.400000,0.400000,0.200000,0.000000,3,1
1220,Hun1,15,2014-10-04,1300.0,10,Diosgyori,Ferencvarosi,2,1,1,...,0.000000,0.00,0.000000,0.0,1.000000,0.000000,0.000000,0.000000,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155446,Hun1,19,2019-05-19,1600.0,33,Diosgyori,Ujpest,3,0,1,...,0.076923,0.00,0.076923,0.0,0.615385,0.384615,0.076923,0.076923,3,1
155447,Hun1,19,2019-05-19,1600.0,33,Debreceni,Paksi,4,1,3,...,0.035714,0.00,0.000000,0.0,0.500000,0.142857,0.000000,0.000000,5,1
159122,Hun1,20,2019-10-26,1600.0,10,Kisvarda Master Good,Puskas Akademia,0,1,0,...,0.250000,0.25,0.000000,0.0,1.000000,0.500000,0.000000,0.000000,1,0
159124,Hun1,20,2019-10-26,1600.0,10,Honved,Kaposvari,2,0,2,...,0.000000,0.00,0.000000,0.0,0.666667,0.000000,0.000000,0.000000,2,0
