In [1]:
import time
from datetime import datetime
import re
import glob
import numpy as np
import pandas as pd

import function_library as fl

features = [
    # General
    'round',
    'home_team_place_total',
    'home_team_place_home',
    'away_team_place_total',
    'away_team_place_away',
    'home_odds',
    'draw_odds',
    'away_odds',
    'over_25_odds',
    'under_25_odds',
    'elo_home',
    'elo_away',
    'form_home',
    'form_away',

    # Home team overall features
    'home_Overall_RollingGoalsScored_Mean',
    'home_Overall_RollingGoalsConceded_Mean',
    'home_Overall_RollingGoalsScored_Std',
    'home_Overall_RollingGoalsConceded_Std',
    'home_Overall_RollingGoalsScored_Mean_Short',
    'home_Overall_Momentum_GoalsScored',
    'home_Overall_Trend_Slope_GoalsScored',
    'home_Overall_RollingFirstHalfGoalsScored_Mean',
    'home_Overall_RollingFirstHalfGoalsConceded_Mean',
    'home_Overall_RollingFirstHalfGoalsScored_Std',
    'home_Overall_RollingFirstHalfGoalsConceded_Std',
    'home_Overall_RollingFirstHalfGoalsScored_Mean_Short',
    'home_Overall_Momentum_FirstHalfGoalsScored',
    'home_Overall_Trend_Slope_FirstHalfGoalsScored',

    # Home team specific features (home-only)
    'home_Home_RollingGoalsScored_Mean',
    'home_Home_RollingGoalsConceded_Mean',
    'home_Home_RollingGoalsScored_Std',
    'home_Home_RollingGoalsConceded_Std',
    'home_Home_RollingGoalsScored_Mean_Short',
    'home_Home_Momentum_GoalsScored',
    'home_Home_Trend_Slope_GoalsScored',
    'home_Home_RollingFirstHalfGoalsScored_Mean',
    'home_Home_RollingFirstHalfGoalsConceded_Mean',
    'home_Home_RollingFirstHalfGoalsScored_Std',
    'home_Home_RollingFirstHalfGoalsConceded_Std',
    'home_Home_RollingFirstHalfGoalsScored_Mean_Short',
    'home_Home_Momentum_FirstHalfGoalsScored',
    'home_Home_Trend_Slope_FirstHalfGoalsScored',

    # Away team overall features
    'away_Overall_RollingGoalsScored_Mean',
    'away_Overall_RollingGoalsConceded_Mean',
    'away_Overall_RollingGoalsScored_Std',
    'away_Overall_RollingGoalsConceded_Std',
    'away_Overall_RollingGoalsScored_Mean_Short',
    'away_Overall_Momentum_GoalsScored',
    'away_Overall_Trend_Slope_GoalsScored',
    'away_Overall_RollingFirstHalfGoalsScored_Mean',
    'away_Overall_RollingFirstHalfGoalsConceded_Mean',
    'away_Overall_RollingFirstHalfGoalsScored_Std',
    'away_Overall_RollingFirstHalfGoalsConceded_Std',
    'away_Overall_RollingFirstHalfGoalsScored_Mean_Short',
    'away_Overall_Momentum_FirstHalfGoalsScored',
    'away_Overall_Trend_Slope_FirstHalfGoalsScored',

    # Away team specific features (away-only)
    'away_Away_RollingGoalsScored_Mean',
    'away_Away_RollingGoalsConceded_Mean',
    'away_Away_RollingGoalsScored_Std',
    'away_Away_RollingGoalsConceded_Std',
    'away_Away_RollingGoalsScored_Mean_Short',
    'away_Away_Momentum_GoalsScored',
    'away_Away_Trend_Slope_GoalsScored',
    'away_Away_RollingFirstHalfGoalsScored_Mean',
    'away_Away_RollingFirstHalfGoalsConceded_Mean',
    'away_Away_RollingFirstHalfGoalsScored_Std',
    'away_Away_RollingFirstHalfGoalsConceded_Std',
    'away_Away_RollingFirstHalfGoalsScored_Mean_Short',
    'away_Away_Momentum_FirstHalfGoalsScored',
    'away_Away_Trend_Slope_FirstHalfGoalsScored',

    # ----- Additional Goal Threshold Percentages (Per-Match Metrics) -----
    # For thresholds: 1.5, 2.5, 3.5
    # Overall team (season cumulative and rolling last 5 matches)
    'home_Overall_Percent_Over_1.5',
    'home_Overall_Rolling5_Percent_Over_1.5',
    'home_Overall_Percent_Over_2.5',
    'home_Overall_Rolling5_Percent_Over_2.5',
    'home_Overall_Percent_Over_3.5',
    'home_Overall_Rolling5_Percent_Over_3.5',

    'away_Overall_Percent_Over_1.5',
    'away_Overall_Rolling5_Percent_Over_1.5',
    'away_Overall_Percent_Over_2.5',
    'away_Overall_Rolling5_Percent_Over_2.5',
    'away_Overall_Percent_Over_3.5',
    'away_Overall_Rolling5_Percent_Over_3.5',

    # Home matches only
    'home_Home_Percent_Over_1.5',
    'home_Home_Rolling5_Percent_Over_1.5',
    'home_Home_Percent_Over_2.5',
    'home_Home_Rolling5_Percent_Over_2.5',
    'home_Home_Percent_Over_3.5',
    'home_Home_Rolling5_Percent_Over_3.5',

    # Away matches only
    'away_Away_Percent_Over_1.5',
    'away_Away_Rolling5_Percent_Over_1.5',
    'away_Away_Percent_Over_2.5',
    'away_Away_Rolling5_Percent_Over_2.5',
    'away_Away_Percent_Over_3.5',
    'away_Away_Rolling5_Percent_Over_3.5',

    # Home matches only
    'home_Home_TeamPct_Over_0.5',
    'home_Home_TeamPct_Over_1.5',
    'home_Home_TeamPct_Over_2.5',
    'home_Home_TeamPct_Over_3.5',

    # Away matches only
    'away_Away_TeamPct_Over_0.5',
    'away_Away_TeamPct_Over_1.5',
    'away_Away_TeamPct_Over_2.5',
    'away_Away_TeamPct_Over_3.5'
]


def prepare_data(file_path):
    # Attempt to read the CSV file using different encodings
    encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
    for encoding in encodings:
        try:
            data = pd.read_csv(file_path, encoding=encoding, low_memory=False)
            print(f"Successfully read the file with encoding: {encoding}")
            break
        except UnicodeDecodeError:
            print(f"Failed to decode with encoding: {encoding}")

    # Define the column renaming dictionary
    column_dict = {
        "country": "country",
        "league": "league",
        "sezonul": "season",
        "datameci": "date",
        "orameci": "ko_time",
        "etapa": "round",
        "txtechipa1": "home_team",
        "txtechipa2": "away_team",
        "scor1": "home_goals_ft",
        "scor2": "away_goals_ft",
        "scorp1": "home_goals_ht",
        "scorp2": "away_goals_ht",
        "place1": "home_team_place_total",
        "place1a": "home_team_place_home",
        "place2": "away_team_place_total",
        "place2d": "away_team_place_away",
        "cotaa": "home_odds",
        "cotae": "draw_odds",
        "cotad": "away_odds",
        "cotao": "over_25_odds",
        "cotau": "under_25_odds",
        "elohomeo": "elo_home",
        "eloawayo": "elo_away",
        "formah": "form_home",
        "formaa": "form_away",
        "suth": "shots_home",
        "suth1": "shots_home_1h",
        "suth2": "shots_home_2h",
        "suta": "shots_away",
        "suta1": "shots_away_1h",
        "suta2": "shots_away_2h",
        "sutht": "shots_on_target_home",
        "sutht1": "shots_on_target_home_1h",
        "sutht2": "shots_on_target_home_2h",
        "sutat": "shots_on_target_away",
        "sutat1": "shots_on_target_away_1h",
        "sutat2": "shots_on_target_away_2h",
        "corh": "corners_home",
        "corh1": "corners_home_1h",
        "corh2": "corners_home_2h",
        "cora": "corners_away",
        "cora1": "corners_away_1h",
        "cora2": "corners_away_2h",
        "foulsh": "fouls_home",
        "foulsh1": "fouls_home_1h",
        "foulsh2": "fouls_home_2h",
        "foulsa": "fouls_away",
        "foulsa1": "fouls_away_1h",
        "foulsa2": "fouls_away_2h",
        "yellowh": "yellow_cards_home",
        "yellowh1": "yellow_cards_home_1h",
        "yellowh2": "yellow_cards_home_2h",
        "yellowa": "yellow_cards_away",
        "yellowa1": "yellow_cards_away_1h",
        "yellowa2": "yellow_cards_away_2h",
        "ballph": "possession_home",
        "ballph1": "possession_home_1h",
        "ballph2": "possession_home_2h",
        "ballpa": "possession_away",
        "ballpa1": "possession_away_1h",
        "ballpa2": "possession_away_2h",
        "gsh": "goals_scored_total_home",
        "gch": "goals_conceded_total_home",
        "gsa": "goals_scored_total_away",
        "gca": "goals_conceded_total_away",
    }

    # Rename and filter columns
    data = data.rename(columns=column_dict).filter(items=column_dict.values())

    # Convert 'date' column to datetime object
    data['date'] = pd.to_datetime(data['date'], format="%Y-%m-%d", errors='coerce')
    data = data.sort_values(by='date')

    # Convert today's date to a pandas Timestamp for compatibility.
    today = pd.Timestamp(datetime.today().date())
    data = data[data['date'] <= today]

    # Assign points based on match results
    data["points_home"] = data.apply(
        lambda row: 3 if row["home_goals_ft"] > row["away_goals_ft"]
        else (1 if row["home_goals_ft"] == row["away_goals_ft"] else 0),
        axis=1,
    )
    data["points_away"] = data.apply(
        lambda row: 3 if row["away_goals_ft"] > row["home_goals_ft"]
        else (1 if row["away_goals_ft"] == row["home_goals_ft"] else 0),
        axis=1,
    )

    # -----------------------------
    # Prepare team-level DataFrame
    # -----------------------------
    # Create a home team DataFrame
    home_df = data[
        ['country', 'season', 'date', 'home_team', 'away_team',
         'home_goals_ft', 'away_goals_ft', 'home_goals_ht', 'away_goals_ht']
    ].copy()
    home_df.rename(
        columns={
            'home_team': 'Team',
            'away_team': 'Opponent',
            'home_goals_ft': 'GoalsScored',
            'away_goals_ft': 'GoalsConceded',
            'home_goals_ht': 'FirstHalfGoalsScored',
            'away_goals_ht': 'FirstHalfGoalsConceded'
        },
        inplace=True,
    )
    home_df['is_home'] = 1

    # Create an away team DataFrame
    away_df = data[
        ['country', 'season', 'date', 'away_team', 'home_team',
         'away_goals_ft', 'home_goals_ft', 'away_goals_ht', 'home_goals_ht']
    ].copy()
    away_df.rename(
        columns={
            'away_team': 'Team',
            'home_team': 'Opponent',
            'away_goals_ft': 'GoalsScored',
            'home_goals_ft': 'GoalsConceded',
            'away_goals_ht': 'FirstHalfGoalsScored',
            'home_goals_ht': 'FirstHalfGoalsConceded'
        },
        inplace=True,
    )
    away_df['is_home'] = 0

    # Combine the home and away DataFrames into one team-level DataFrame
    team_df = pd.concat([home_df, away_df], ignore_index=True)
    team_df.sort_values(by=['country', 'season', 'Team', 'date'], inplace=True)

    # Define rolling window sizes
    window_long = 5  # e.g. last 5 matches for long-term trends
    window_short = 3  # e.g. last 3 matches for short-term momentum

    # -----------------------------
    # Rolling Feature Computation Functions
    # -----------------------------
    def compute_rolling(df_sub, prefix):
        """
        Compute rolling features on a given (sorted) DataFrame subset.
        The prefix will be used to name the computed columns.
        """
        df = df_sub.copy()

        # Full-Time Goals Rolling Features
        df[prefix + '_RollingGoalsScored_Mean'] = df['GoalsScored'].rolling(window=window_long,
                                                                            min_periods=1).mean().shift(1)
        df[prefix + '_RollingGoalsConceded_Mean'] = df['GoalsConceded'].rolling(window=window_long,
                                                                                min_periods=1).mean().shift(1)
        df[prefix + '_RollingGoalsScored_Std'] = df['GoalsScored'].rolling(window=window_long,
                                                                           min_periods=1).std().shift(1)
        df[prefix + '_RollingGoalsConceded_Std'] = df['GoalsConceded'].rolling(window=window_long,
                                                                               min_periods=1).std().shift(1)
        df[prefix + '_RollingGoalsScored_Mean_Short'] = df['GoalsScored'].rolling(window=window_short,
                                                                                  min_periods=1).mean().shift(1)
        df[prefix + '_Momentum_GoalsScored'] = df[prefix + '_RollingGoalsScored_Mean_Short'] - df[
            prefix + '_RollingGoalsScored_Mean']

        # First-Half Goals Rolling Features
        df[prefix + '_RollingFirstHalfGoalsScored_Mean'] = df['FirstHalfGoalsScored'].rolling(window=window_long,
                                                                                              min_periods=1).mean().shift(
            1)
        df[prefix + '_RollingFirstHalfGoalsConceded_Mean'] = df['FirstHalfGoalsConceded'].rolling(window=window_long,
                                                                                                  min_periods=1).mean().shift(
            1)
        df[prefix + '_RollingFirstHalfGoalsScored_Std'] = df['FirstHalfGoalsScored'].rolling(window=window_long,
                                                                                             min_periods=1).std().shift(
            1)
        df[prefix + '_RollingFirstHalfGoalsConceded_Std'] = df['FirstHalfGoalsConceded'].rolling(window=window_long,
                                                                                                 min_periods=1).std().shift(
            1)
        df[prefix + '_RollingFirstHalfGoalsScored_Mean_Short'] = df['FirstHalfGoalsScored'].rolling(window=window_short,
                                                                                                    min_periods=1).mean().shift(
            1)
        df[prefix + '_Momentum_FirstHalfGoalsScored'] = df[prefix + '_RollingFirstHalfGoalsScored_Mean_Short'] - df[
            prefix + '_RollingFirstHalfGoalsScored_Mean']

        # Function to compute trend slope using simple linear regression
        def compute_slope(x):
            if len(x) < 2:
                return np.nan
            xs = np.arange(len(x))
            return np.polyfit(xs, x, 1)[0]

        df[prefix + '_Trend_Slope_GoalsScored'] = df['GoalsScored'].rolling(window=window_long, min_periods=2).apply(
            compute_slope, raw=True).shift(1)
        df[prefix + '_Trend_Slope_FirstHalfGoalsScored'] = df['FirstHalfGoalsScored'].rolling(window=window_long,
                                                                                              min_periods=2).apply(
            compute_slope, raw=True).shift(1)

        computed_cols = [
            prefix + '_RollingGoalsScored_Mean',
            prefix + '_RollingGoalsConceded_Mean',
            prefix + '_RollingGoalsScored_Std',
            prefix + '_RollingGoalsConceded_Std',
            prefix + '_RollingGoalsScored_Mean_Short',
            prefix + '_Momentum_GoalsScored',
            prefix + '_Trend_Slope_GoalsScored',
            prefix + '_RollingFirstHalfGoalsScored_Mean',
            prefix + '_RollingFirstHalfGoalsConceded_Mean',
            prefix + '_RollingFirstHalfGoalsScored_Std',
            prefix + '_RollingFirstHalfGoalsConceded_Std',
            prefix + '_RollingFirstHalfGoalsScored_Mean_Short',
            prefix + '_Momentum_FirstHalfGoalsScored',
            prefix + '_Trend_Slope_FirstHalfGoalsScored'
        ]
        return df[computed_cols]

    def add_rolling_features_split(group):
        """
        For each team (grouped by country, season, and team), compute:
          - Overall rolling features (all matches)
          - Home-only rolling features (for matches where is_home == 1)
          - Away-only rolling features (for matches where is_home == 0)
        """
        group = group.sort_values(by='date').reset_index(drop=True)
        overall_features = compute_rolling(group, 'Overall')
        group = pd.concat([group, overall_features], axis=1)

        home_mask = group['is_home'] == 1
        if home_mask.sum() > 0:
            group_home = group.loc[home_mask].copy()
            home_features = compute_rolling(group_home, 'Home')
            for col in home_features.columns:
                group.loc[home_mask, col] = home_features[col].values

        away_mask = group['is_home'] == 0
        if away_mask.sum() > 0:
            group_away = group.loc[away_mask].copy()
            away_features = compute_rolling(group_away, 'Away')
            for col in away_features.columns:
                group.loc[away_mask, col] = away_features[col].values

            # ----- Additional Goal Threshold Percentages -----
            # Compute cumulative (season) and rolling (last 5 games) percentages for goals scored over thresholds.
            # ----- Additional Goal Threshold Percentages -----
            # For each threshold, compute season cumulative and rolling percentages.
        for threshold in [1.5, 2.5, 3.5]:
            # ----- Overall (Team as a Whole) -----
            overall_season_col = f'Overall_Percent_Over_{threshold}'
            overall_rolling_col = f'Overall_Rolling5_Percent_Over_{threshold}'
            indicator_overall = group['GoalsScored'].gt(threshold)
            # Using shift(1) to exclude the current match:
            group[overall_season_col] = indicator_overall.shift(1).expanding(min_periods=1).mean()
            group[overall_rolling_col] = indicator_overall.shift(1).rolling(window=5, min_periods=1).mean()

            # ----- Home Matches Only -----
            season_col_home = f'Home_Percent_Over_{threshold}'
            rolling_col_home = f'Home_Rolling5_Percent_Over_{threshold}'
            if home_mask.sum() > 0:
                indicator_home = group.loc[home_mask, 'GoalsScored'].gt(threshold)
                # Compute on the home subset and assign back to the group
                group.loc[home_mask, season_col_home] = indicator_home.shift(1).expanding(min_periods=1).mean()
                group.loc[home_mask, rolling_col_home] = indicator_home.shift(1).rolling(window=5, min_periods=1).mean()

            # ----- Away Matches Only -----
            season_col_away = f'Away_Percent_Over_{threshold}'
            rolling_col_away = f'Away_Rolling5_Percent_Over_{threshold}'
            if away_mask.sum() > 0:
                indicator_away = group.loc[away_mask, 'GoalsScored'].gt(threshold)
                group.loc[away_mask, season_col_away] = indicator_away.shift(1).expanding(min_periods=1).mean()
                group.loc[away_mask, rolling_col_away] = indicator_away.shift(1).rolling(window=5, min_periods=1).mean()

            # ----- Team-specific Match Outcome Percentages -----
            # These features capture the percentage of matches in which the team has scored over a given goal threshold.
        for threshold in [0.5, 1.5, 2.5, 3.5]:
            # Overall (all matches)
            overall_col = f'TeamPct_Over_{threshold}'
            indicator_overall = group['GoalsScored'].gt(threshold)
            # Use shift(1) to exclude the current match from its own calculation.
            group[overall_col] = indicator_overall.shift(1).expanding(min_periods=1).mean()

            # Home matches only
            home_col = f'Home_TeamPct_Over_{threshold}'
            if home_mask.sum() > 0:
                indicator_home = group.loc[home_mask, 'GoalsScored'].gt(threshold)
                group.loc[home_mask, home_col] = indicator_home.shift(1).expanding(min_periods=1).mean()

            # Away matches only
            away_col = f'Away_TeamPct_Over_{threshold}'
            if away_mask.sum() > 0:
                indicator_away = group.loc[away_mask, 'GoalsScored'].gt(threshold)
                group.loc[away_mask, away_col] = indicator_away.shift(1).expanding(min_periods=1).mean()

        return group

    # Apply rolling feature engineering group-wise
    team_df = team_df.groupby(['country', 'season', 'Team'], group_keys=False) \
        .apply(add_rolling_features_split) \
        .reset_index(drop=True)

    # -----------------------------
    # Process Home-Team Features
    # -----------------------------
    home_subset = team_df[team_df['is_home'] == 1].copy()
    home_subset = home_subset.drop(columns=['Opponent'])
    home_subset.rename(columns={'Team': 'home_team'}, inplace=True)
    home_key_cols = ['country', 'season', 'date', 'home_team', 'is_home']
    home_feature_cols = [col for col in home_subset.columns
                         if col not in home_key_cols and (col.startswith("Overall_") or col.startswith("Home_"))]
    home_features = home_subset[home_key_cols + home_feature_cols].copy()
    home_features.rename(columns={col: "home_" + col for col in home_feature_cols}, inplace=True)

    # -----------------------------
    # Process Away-Team Features
    # -----------------------------
    away_subset = team_df[team_df['is_home'] == 0].copy()
    away_subset = away_subset.drop(columns=['Opponent'])
    away_subset.rename(columns={'Team': 'away_team'}, inplace=True)
    away_key_cols = ['country', 'season', 'date', 'away_team', 'is_home']
    away_feature_cols = [col for col in away_subset.columns
                         if col not in away_key_cols and (col.startswith("Overall_") or col.startswith("Away_"))]
    away_features = away_subset[away_key_cols + away_feature_cols].copy()
    away_features.rename(columns={col: "away_" + col for col in away_feature_cols}, inplace=True)

    # -----------------------------
    # Merge Back into the Match-Level DataFrame
    # -----------------------------
    match_df = data.copy()
    match_df = match_df.merge(home_features, on=['country', 'season', 'date', 'home_team'], how='left')
    match_df = match_df.merge(away_features, on=['country', 'season', 'date', 'away_team'], how='left')

    # Clean up and finalise the match-level DataFrame
    match_df.dropna(inplace=True)
    match_df['total_goals'] = match_df['home_goals_ft'] + match_df['away_goals_ft']
    match_df['target'] = match_df['total_goals'].apply(lambda x: 1 if x > 2.5 else 0)

    return match_df





def extract_identifiers(directory: str) -> tuple:
    """
    Extracts identifiers from filenames in the specified directory.

    The filenames should follow the pattern:
    model_metrics_('Identifier',)_YYYYMMDD_HHMMSS.csv

    Args:
        directory (str): The directory to search for matching files.

    Returns:
        tuple: A tuple containing the extracted identifiers.
    """
    # Compile a regular expression to capture the text inside the quotes.
    pattern = re.compile(r"model_metrics_\('([^']+)',\)_\d{8}_\d{6}\.csv")

    # Use glob to find all files starting with 'model_metrics_' and ending with '.csv'
    files = glob.glob(directory + r"\model_metrics_*.csv")

    # Extract the identifier from each file if it matches the pattern
    identifiers = tuple(match.group(1) for file in files if (match := pattern.search(file)) is not None)

    return identifiers

#if __name__ == "__main__":



In [2]:
start = time.time()

#matches = prepare_data(r"C:\Users\leere\PycharmProjects\Football_ML3\Goals\cgmbetdatabase_top_5_2020+.csv")
matches = prepare_data(r"C:\Users\leere\PycharmProjects\Football_ML3\engineered_master_data_ALL_2014.csv")

# Process each league separately
leagues = matches[['country']].drop_duplicates().apply(tuple, axis=1)

directory = r"C:\Users\leere\PycharmProjects\Football_ML3\Goals\Goals_v3"
league_tuple = extract_identifiers(directory)



Successfully read the file with encoding: utf-8


  .apply(add_rolling_features_split) \


In [21]:
matches.drop_duplicates(inplace=True)
matches

Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,away_Overall_Percent_Over_3.5,away_Overall_Rolling5_Percent_Over_3.5,away_Away_Percent_Over_3.5,away_Away_Rolling5_Percent_Over_3.5,away_Away_TeamPct_Over_0.5,away_Away_TeamPct_Over_1.5,away_Away_TeamPct_Over_2.5,away_Away_TeamPct_Over_3.5,total_goals,target
295,Aus2,15,2014-09-12,1930.0,9,LASK,FAC Wien,5,1,1,...,0.00,0.00,0.0,0.0,0.5,0.000000,0.000000,0.0,6,1
300,Ger3,15,2014-09-12,1800.0,9,Wiesbaden,Erfurt,3,1,1,...,0.00,0.00,0.0,0.0,1.0,0.500000,0.000000,0.0,4,1
313,Aus2,15,2014-09-12,1730.0,9,Mattersburg,Liefering,3,6,3,...,0.25,0.25,0.0,0.0,0.5,0.500000,0.500000,0.0,9,1
322,Aus2,15,2014-09-12,1730.0,9,Lustenau,KSV 1919,1,3,0,...,0.00,0.00,0.0,0.0,1.0,1.000000,0.500000,0.0,4,1
374,Ger3,15,2014-09-13,1300.0,9,Osnabruck,Kiel,2,1,1,...,0.00,0.00,0.0,0.0,1.0,1.000000,0.000000,0.0,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159345,Ita1,20,2019-10-30,2000.0,10,Udinese,Roma,0,4,0,...,0.00,0.00,0.0,0.0,0.5,0.000000,0.000000,0.0,4,1
159346,Ita1,20,2019-10-30,2000.0,10,Sassuolo,Fiorentina,1,2,1,...,0.00,0.00,0.0,0.0,0.5,0.500000,0.500000,0.0,3,1
159347,Ita1,20,2019-10-30,2000.0,10,Sampdoria,Lecce,1,1,0,...,0.00,0.00,0.0,0.0,1.0,0.666667,0.333333,0.0,2,0
159348,Ita1,20,2019-10-30,2000.0,10,Juventus,Genoa,2,1,1,...,0.00,0.00,0.0,0.0,0.5,0.000000,0.000000,0.0,3,1


In [23]:
matches_filtered = matches[(matches['country'] == "Hun1")]
matches_filtered.drop_duplicates()

Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,away_Overall_Percent_Over_3.5,away_Overall_Rolling5_Percent_Over_3.5,away_Away_Percent_Over_3.5,away_Away_Rolling5_Percent_Over_3.5,away_Away_TeamPct_Over_0.5,away_Away_TeamPct_Over_1.5,away_Away_TeamPct_Over_2.5,away_Away_TeamPct_Over_3.5,total_goals,target
941,Hun1,15,2014-09-27,1930.0,9,Lombard Papa,Ujpest,0,0,0,...,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0,0
1103,Hun1,15,2014-10-03,1800.0,10,Puskas Akademia,Gyori ETO,2,1,1,...,0.250000,0.25,0.000000,0.0,0.500000,0.500000,0.500000,0.000000,3,1
1217,Hun1,15,2014-10-04,1500.0,10,Debreceni,Honved,4,0,2,...,0.000000,0.00,0.000000,0.0,0.500000,0.000000,0.000000,0.000000,4,1
1218,Hun1,15,2014-10-04,1900.0,10,Ujpest,Dunaujvaros,3,0,1,...,0.000000,0.00,0.000000,0.0,0.400000,0.400000,0.200000,0.000000,3,1
1220,Hun1,15,2014-10-04,1300.0,10,Diosgyori,Ferencvarosi,2,1,1,...,0.000000,0.00,0.000000,0.0,1.000000,0.000000,0.000000,0.000000,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155446,Hun1,19,2019-05-19,1600.0,33,Diosgyori,Ujpest,3,0,1,...,0.076923,0.00,0.076923,0.0,0.615385,0.384615,0.076923,0.076923,3,1
155447,Hun1,19,2019-05-19,1600.0,33,Debreceni,Paksi,4,1,3,...,0.035714,0.00,0.000000,0.0,0.500000,0.142857,0.000000,0.000000,5,1
159122,Hun1,20,2019-10-26,1600.0,10,Kisvarda Master Good,Puskas Akademia,0,1,0,...,0.250000,0.25,0.000000,0.0,1.000000,0.500000,0.000000,0.000000,1,0
159124,Hun1,20,2019-10-26,1600.0,10,Honved,Kaposvari,2,0,2,...,0.000000,0.00,0.000000,0.0,0.666667,0.000000,0.000000,0.000000,2,0


In [25]:
matches_filtered['target'].sum()

np.int64(358)

In [None]:
for league in leagues:
    print(league)
    matches_filtered = matches[(matches['country'] == league[0])]
    if league[0] not in league_tuple:
        fl.run_models(matches_filtered, features, league, min_samples=100)

end = time.time()

elapsed_time = end - start  # Calculate elapsed time in seconds

# Print the elapsed time in seconds, minutes, and hours:
print("Elapsed time in seconds: {:.2f}".format(elapsed_time))
print("Elapsed time in minutes: {:.2f}".format(elapsed_time / 60))
print("Elapsed time in hours:   {:.2f}".format(elapsed_time / 3600))