In [1]:
import pandas as pd
from datetime import datetime
import numpy as np

In [2]:
# Set end_period as a date object.
end_period = pd.to_datetime("2025-05-25").date()

In [3]:
# Path to the .xls file
file_path = r"C:\Users\leere\OneDrive\Desktop\RAW DATA\ml_goals.xls"

# Load the Excel file into a DataFrame
df = pd.read_excel(file_path)

# Display the first few rows of the DataFrame
df.head()


*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'


Unnamed: 0,country,sezonul,datameci,orameci,etapa,txtechipa1,txtechipa2,scor1,scor2,scorp1,...,yellowa2,ballph,ballph1,ballph2,ballpa,ballpa1,ballpa2,stare,codechipa1,codechipa2
0,Ita1,25,2024-02-06,1945,14,Fiorentina,Inter,3,0,0,...,0,28,35,21,72,65,79,J,2008,2002
1,Mex1,25,2024-07-05,2345,1,Puebla,Santos Laguna,1,0,0,...,2,40,39,41,60,61,59,J,41017,41008
2,Mex1,25,2024-07-06,200,1,Queretaro,Tijuana de Caliente,1,2,0,...,0,37,33,41,63,67,59,J,41011,41006
3,Mex1,25,2024-07-06,410,1,Juarez,Atlas,2,2,1,...,0,55,66,44,45,34,56,J,41020,41002
4,Mex1,25,2024-07-07,0,1,San Luis,Club America,2,1,1,...,1,37,41,33,63,59,67,J,41019,41012


In [4]:
column_dict = {
    "country": "country",
    "league": "league",
    "sezonul": "season",
    "datameci": "date",
    "orameci": "ko_time",
    "etapa": "round",
    "txtechipa1": "home_team",
    "txtechipa2": "away_team",
    "scor1": "home_goals_ft",
    "scor2": "away_goals_ft",
    "scorp1": "home_goals_ht",
    "scorp2": "away_goals_ht",
    "place1": "home_team_place_total",
    "place1a": "home_team_place_home",
    "place2": "away_team_place_total",
    "place2d": "away_team_place_away",
    "cotaa": "home_odds",
    "cotae": "draw_odds",
    "cotad": "away_odds",
    # "cotao0": "",
    # "cotao1": "",
    "cotao": "over_25_odds",
    # "cotao3": "",
    # "cotao4": "",
    # "cotau0": "",
    # "cotau1": "",
    "cotau": "under_25_odds",
    # "cotau3": "",
    # "cotau4": "",
    # "gg": "",
    # "ng": "",
    "elohomeo": "elo_home",
    "eloawayo": "elo_away",
    "formah": "form_home",
    "formaa": "form_away",
    "suth": "shots_home",
    "suth1": "shots_home_1h",
    "suth2": "shots_home_2h",
    "suta": "shots_away",
    "suta1": "shots_away_1h",
    "suta2": "shots_away_2h",
    "sutht": "shots_on_target_home",
    "sutht1": "shots_on_target_home_1h",
    "sutht2": "shots_on_target_home_2h",
    "sutat": "shots_on_target_away",
    "sutat1": "shots_on_target_away_1h",
    "sutat2": "shots_on_target_away_2h",
    "corh": "corners_home",
    "corh1": "corners_home_1h",
    "corh2": "corners_home_2h",
    "cora": "corners_away",
    "cora1": "corners_away_1h",
    "cora2": "corners_away_2h",
    "foulsh": "fouls_home",
    "foulsh1": "fouls_home_1h",
    "foulsh2": "fouls_home_2h",
    "foulsa": "fouls_away",
    "foulsa1": "fouls_away_1h",
    "foulsa2": "fouls_away_2h",
    "yellowh": "yellow_cards_home",
    "yellowh1": "yellow_cards_home_1h",
    "yellowh2": "yellow_cards_home_2h",
    "yellowa": "yellow_cards_away",
    "yellowa1": "yellow_cards_away_1h",
    "yellowa2": "yellow_cards_away_2h",
    "ballph": "possession_home",
    "ballph1": "possession_home_1h",
    "ballph2": "possession_home_2h",
    "ballpa": "possession_away",
    "ballpa1": "possession_away_1h",
    "ballpa2": "possession_away_2h",
    "gsh": "goals_scored_total_home",
    "gch": "goals_conceded_total_home",
    "gsa": "goals_scored_total_away",
    "gca": "goals_conceded_total_away",
    # "stare": "",
    # "codechipa1": "",
    # "codechipa2": ""
}

df = df.rename(columns=column_dict).filter(items=column_dict.values())
data = df.copy()
#data

In [5]:
# Convert 'date' column to datetime object
data['date'] = pd.to_datetime(data['date'], format="%d/%m/%Y", errors='coerce')

# Order by date
data = data.sort_values(by='date')

# Filter out future dates (ensure data does not go beyond today)
today = datetime.today().date()
data = data[data['date'].dt.date <= end_period]

# Create a mask for matches that have been played (i.e. date is less than today)
played_mask = data['date'].dt.date < today

# Calculate home points for played matches only.
data.loc[played_mask, 'points_home'] = np.where(
    data.loc[played_mask, 'home_goals_ft'] > data.loc[played_mask, 'away_goals_ft'], 3,
    np.where(data.loc[played_mask, 'home_goals_ft'] == data.loc[played_mask, 'away_goals_ft'], 1, 0)
)

# Calculate away points for played matches only.
data.loc[played_mask, 'points_away'] = np.where(
    data.loc[played_mask, 'away_goals_ft'] > data.loc[played_mask, 'home_goals_ft'], 3,
    np.where(data.loc[played_mask, 'away_goals_ft'] == data.loc[played_mask, 'home_goals_ft'], 1, 0)
)

In [6]:
# # Assign points based on match results
# data["points_home"] = data.apply(lambda row: 3 if row["home_goals_ft"] > row["away_goals_ft"]
# else (1 if row["home_goals_ft"] == row["away_goals_ft"] else 0), axis=1)
#
# data["points_away"] = data.apply(lambda row: 3 if row["away_goals_ft"] > row["home_goals_ft"]
# else (1 if row["away_goals_ft"] == row["home_goals_ft"] else 0), axis=1)

# =============================================================================
# 1. Data Preparation: Build Team-Level Data (Home & Away)
# =============================================================================
# Prepare home records.
home_df = data[['country', 'season', 'date', 'home_team', 'away_team',
                'home_goals_ft', 'away_goals_ft', 'home_goals_ht', 'away_goals_ht',
                'shots_home', 'shots_home_1h', 'shots_home_2h',
                'shots_on_target_home', 'shots_on_target_home_1h', 'shots_on_target_home_2h',
                'corners_home', 'corners_home_1h', 'corners_home_2h']].copy()
home_df.rename(columns={
    'home_team': 'Team',
    'away_team': 'Opponent',
    'home_goals_ft': 'GoalsScored',
    'away_goals_ft': 'GoalsConceded',
    'home_goals_ht': 'FirstHalfGoalsScored',
    'away_goals_ht': 'FirstHalfGoalsConceded',
    'shots_home': 'Shots',
    'shots_home_1h': 'Shots_1h',
    'shots_home_2h': 'Shots_2h',
    'shots_on_target_home': 'ShotsOnTarget',
    'shots_on_target_home_1h': 'ShotsOnTarget_1h',
    'shots_on_target_home_2h': 'ShotsOnTarget_2h',
    'corners_home': 'Corners',
    'corners_home_1h': 'Corners_1h',
    'corners_home_2h': 'Corners_2h'
}, inplace=True)
home_df['is_home'] = 1

# Prepare away records.
away_df = data[['country', 'season', 'date', 'away_team', 'home_team',
                'away_goals_ft', 'home_goals_ft', 'away_goals_ht', 'home_goals_ht',
                'shots_away', 'shots_away_1h', 'shots_away_2h',
                'shots_on_target_away', 'shots_on_target_away_1h', 'shots_on_target_away_2h',
                'corners_away', 'corners_away_1h', 'corners_away_2h']].copy()
away_df.rename(columns={
    'away_team': 'Team',
    'home_team': 'Opponent',
    'away_goals_ft': 'GoalsScored',
    'home_goals_ft': 'GoalsConceded',
    'away_goals_ht': 'FirstHalfGoalsScored',
    'home_goals_ht': 'FirstHalfGoalsConceded',
    'shots_away': 'Shots',
    'shots_away_1h': 'Shots_1h',
    'shots_away_2h': 'Shots_2h',
    'shots_on_target_away': 'ShotsOnTarget',
    'shots_on_target_away_1h': 'ShotsOnTarget_1h',
    'shots_on_target_away_2h': 'ShotsOnTarget_2h',
    'corners_away': 'Corners',
    'corners_away_1h': 'Corners_1h',
    'corners_away_2h': 'Corners_2h'
}, inplace=True)
away_df['is_home'] = 0

# Combine both.
team_df = pd.concat([home_df, away_df], ignore_index=True)
team_df.sort_values(by=['country', 'season', 'Team', 'date'], inplace=True)

# Define rolling window sizes.
window_long = 5   # for long-term trends
window_short = 3  # for short-term momentum

# =============================================================================
# 2. Rolling Feature Computation Functions
# =============================================================================
def compute_slope(x):
    """Compute slope using simple linear regression."""
    if len(x) < 2:
        return np.nan
    xs = np.arange(len(x))
    return np.polyfit(xs, x, 1)[0]

def compute_rolling_features_metric(df_sub, full_col, first_half_col, prefix):
    """
    Compute rolling features for a given metric.
    Returns a DataFrame of new columns.
    """
    new_cols = {}
    # Full-match features.
    new_cols[f'{prefix}_Rolling_{full_col}_Mean'] = df_sub[full_col].rolling(window=window_long, min_periods=1).mean().shift(1)
    new_cols[f'{prefix}_Rolling_{full_col}_Std']  = df_sub[full_col].rolling(window=window_long, min_periods=1).std().shift(1)
    new_cols[f'{prefix}_Rolling_{full_col}_Mean_Short'] = df_sub[full_col].rolling(window=window_short, min_periods=1).mean().shift(1)
    new_cols[f'{prefix}_Momentum_{full_col}'] = new_cols[f'{prefix}_Rolling_{full_col}_Mean_Short'] - new_cols[f'{prefix}_Rolling_{full_col}_Mean']
    new_cols[f'{prefix}_Trend_Slope_{full_col}'] = df_sub[full_col].rolling(window=window_long, min_periods=2).apply(compute_slope, raw=True).shift(1)
    # First-half features.
    new_cols[f'{prefix}_Rolling_{first_half_col}_Mean'] = df_sub[first_half_col].rolling(window=window_long, min_periods=1).mean().shift(1)
    new_cols[f'{prefix}_Rolling_{first_half_col}_Std']  = df_sub[first_half_col].rolling(window=window_long, min_periods=1).std().shift(1)
    new_cols[f'{prefix}_Rolling_{first_half_col}_Mean_Short'] = df_sub[first_half_col].rolling(window=window_short, min_periods=1).mean().shift(1)
    new_cols[f'{prefix}_Momentum_{first_half_col}'] = new_cols[f'{prefix}_Rolling_{first_half_col}_Mean_Short'] - new_cols[f'{prefix}_Rolling_{first_half_col}_Mean']
    new_cols[f'{prefix}_Trend_Slope_{first_half_col}'] = df_sub[first_half_col].rolling(window=window_long, min_periods=2).apply(compute_slope, raw=True).shift(1)
    return pd.DataFrame(new_cols, index=df_sub.index)

def add_rolling_features_split(group):
    """Compute overall, home-, and away-specific rolling features plus outcome percentages."""
    group = group.sort_values(by='date').reset_index(drop=True)

    # Overall features.
    overall_features = pd.concat([
        compute_rolling_features_metric(group, 'GoalsScored', 'FirstHalfGoalsScored', 'Overall'),
        compute_rolling_features_metric(group, 'Shots', 'Shots_1h', 'Overall'),
        compute_rolling_features_metric(group, 'Corners', 'Corners_1h', 'Overall'),
        compute_rolling_features_metric(group, 'ShotsOnTarget', 'ShotsOnTarget_1h', 'Overall')
    ], axis=1)
    group = pd.concat([group, overall_features], axis=1)

    home_mask = group['is_home'] == 1
    away_mask = group['is_home'] == 0

    # Home-specific.
    if home_mask.any():
        home_feats = pd.concat([
            compute_rolling_features_metric(group.loc[home_mask], 'GoalsScored', 'FirstHalfGoalsScored', 'Home'),
            compute_rolling_features_metric(group.loc[home_mask], 'Shots', 'Shots_1h', 'Home'),
            compute_rolling_features_metric(group.loc[home_mask], 'Corners', 'Corners_1h', 'Home'),
            compute_rolling_features_metric(group.loc[home_mask], 'ShotsOnTarget', 'ShotsOnTarget_1h', 'Home')
        ], axis=1)
        group.loc[home_mask, home_feats.columns] = home_feats

    # Away-specific.
    if away_mask.any():
        away_feats = pd.concat([
            compute_rolling_features_metric(group.loc[away_mask], 'GoalsScored', 'FirstHalfGoalsScored', 'Away'),
            compute_rolling_features_metric(group.loc[away_mask], 'Shots', 'Shots_1h', 'Away'),
            compute_rolling_features_metric(group.loc[away_mask], 'Corners', 'Corners_1h', 'Away'),
            compute_rolling_features_metric(group.loc[away_mask], 'ShotsOnTarget', 'ShotsOnTarget_1h', 'Away')
        ], axis=1)
        group.loc[away_mask, away_feats.columns] = away_feats

    # Additional outcome percentages for goals.
    thresh_dict = {}
    for thresh in [1.5, 2.5, 3.5]:
        thresh_dict[f'Overall_Percent_Over_{thresh}'] = group['GoalsScored'].gt(thresh).shift(1).expanding(min_periods=1).mean()
        thresh_dict[f'Overall_Rolling5_Percent_Over_{thresh}'] = group['GoalsScored'].gt(thresh).shift(1).rolling(window=5, min_periods=1).mean()
        if home_mask.any():
            thresh_dict[f'Home_Percent_Over_{thresh}'] = group.loc[home_mask, 'GoalsScored'].gt(thresh).shift(1).expanding(min_periods=1).mean()
            thresh_dict[f'Home_Rolling5_Percent_Over_{thresh}'] = group.loc[home_mask, 'GoalsScored'].gt(thresh).shift(1).rolling(window=5, min_periods=1).mean()
        if away_mask.any():
            thresh_dict[f'Away_Percent_Over_{thresh}'] = group.loc[away_mask, 'GoalsScored'].gt(thresh).shift(1).expanding(min_periods=1).mean()
            thresh_dict[f'Away_Rolling5_Percent_Over_{thresh}'] = group.loc[away_mask, 'GoalsScored'].gt(thresh).shift(1).rolling(window=5, min_periods=1).mean()
    group = pd.concat([group, pd.DataFrame(thresh_dict, index=group.index)], axis=1)

    # Outcome percentages for goals.
    outcome_dict = {}
    for thresh in [0.5, 1.5, 2.5, 3.5]:
        outcome_dict[f'TeamPct_Over_{thresh}'] = group['GoalsScored'].gt(thresh).shift(1).expanding(min_periods=1).mean()
        if home_mask.any():
            outcome_dict[f'Home_TeamPct_Over_{thresh}'] = group.loc[home_mask, 'GoalsScored'].gt(thresh).shift(1).expanding(min_periods=1).mean()
        if away_mask.any():
            outcome_dict[f'Away_TeamPct_Over_{thresh}'] = group.loc[away_mask, 'GoalsScored'].gt(thresh).shift(1).expanding(min_periods=1).mean()
    group = pd.concat([group, pd.DataFrame(outcome_dict, index=group.index)], axis=1)

    # Outcome percentages for corners.
    corners_thresh = [3.5, 4.5, 5.5, 6.5]
    corners_dict = {}
    for thresh in corners_thresh:
        corners_dict[f'CornersPct_Over_{thresh}'] = group['Corners'].gt(thresh).shift(1).expanding(min_periods=1).mean()
        corners_dict[f'CornersRolling5Pct_Over_{thresh}'] = group['Corners'].gt(thresh).shift(1).rolling(window=5, min_periods=1).mean()
        if home_mask.any():
            corners_dict[f'Home_CornersPct_Over_{thresh}'] = group.loc[home_mask, 'Corners'].gt(thresh).shift(1).expanding(min_periods=1).mean()
            corners_dict[f'Home_CornersRolling5Pct_Over_{thresh}'] = group.loc[home_mask, 'Corners'].gt(thresh).shift(1).rolling(window=5, min_periods=1).mean()
        if away_mask.any():
            corners_dict[f'Away_CornersPct_Over_{thresh}'] = group.loc[away_mask, 'Corners'].gt(thresh).shift(1).expanding(min_periods=1).mean()
            corners_dict[f'Away_CornersRolling5Pct_Over_{thresh}'] = group.loc[away_mask, 'Corners'].gt(thresh).shift(1).rolling(window=5, min_periods=1).mean()
    group = pd.concat([group, pd.DataFrame(corners_dict, index=group.index)], axis=1)

    return group

# Apply group-wise computations.
team_df = team_df.groupby(['country', 'season', 'Team'], group_keys=False).apply(add_rolling_features_split).reset_index(drop=True)
team_df = team_df.copy()  # ensure defragmentation

# =============================================================================
# 3. Compute Team-Level Corners Outcome Features (from Match Data)
# =============================================================================
# Build a match-level DataFrame for corners outcomes.
match_df = data.copy()
match_df['Total_Corners'] = match_df['corners_home'] + match_df['corners_away']
match_df.sort_values(by=['country', 'season', 'date'], inplace=True)

# Create a team perspective by combining home and away records.
home_matches = match_df[['country', 'season', 'date', 'home_team', 'Total_Corners']].copy()
home_matches.rename(columns={'home_team': 'Team'}, inplace=True)
away_matches = match_df[['country', 'season', 'date', 'away_team', 'Total_Corners']].copy()
away_matches.rename(columns={'away_team': 'Team'}, inplace=True)
team_corners_matches = pd.concat([home_matches, away_matches], ignore_index=True)
team_corners_matches.sort_values(by=['country', 'season', 'Team', 'date'], inplace=True)

# For thresholds 9.5, 10.5, and 11.5, compute season-level and rolling percentages.
for thr in [9.5, 10.5, 11.5]:
    indicator = f'Over_{thr}'
    team_corners_matches[indicator] = (team_corners_matches['Total_Corners'] > thr).astype(int)
    team_corners_matches[f'SeasonPct_{indicator}'] = team_corners_matches.groupby(
        ['country', 'season', 'Team']
    )[indicator].transform(lambda x: x.shift(1).expanding(min_periods=1).mean())
    team_corners_matches[f'Rolling5Pct_{indicator}'] = team_corners_matches.groupby(
        ['country', 'season', 'Team']
    )[indicator].transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())

# Select only the keys and outcome columns for merging.
cols_to_merge = ['country', 'season', 'date', 'Team',
                 'SeasonPct_Over_9.5', 'Rolling5Pct_Over_9.5',
                 'SeasonPct_Over_10.5', 'Rolling5Pct_Over_10.5',
                 'SeasonPct_Over_11.5', 'Rolling5Pct_Over_11.5']

# Merge the corners outcome features into team_df.
team_df = team_df.merge(team_corners_matches[cols_to_merge],
                        on=['country', 'season', 'date', 'Team'],
                        how='left')

# =============================================================================
# 4. Process Home and Away Features for Match-Level Merging
# =============================================================================
# -- Home-Team Process --
home_subset = team_df[team_df['is_home'] == 1].copy()
home_subset.drop(columns=['Opponent'], inplace=True)
home_subset.rename(columns={'Team': 'home_team'}, inplace=True)
home_key = ['country', 'season', 'date', 'home_team', 'is_home']
# Include features starting with Overall_, Home_, SeasonPct_Over_, or Rolling5Pct_Over_
home_feats = [col for col in home_subset.columns if col not in home_key and
              (col.startswith("Overall_") or col.startswith("Home_") or
               col.startswith("SeasonPct_Over_") or col.startswith("Rolling5Pct_Over_"))]
home_features = home_subset[home_key + home_feats].copy()
def clean_home_name(col):
    return "home_" + (col[len("Home_"):] if col.startswith("Home_") else col)
home_features.rename(columns={col: clean_home_name(col) for col in home_feats}, inplace=True)

# -- Away-Team Process --
away_subset = team_df[team_df['is_home'] == 0].copy()
away_subset.drop(columns=['Opponent'], inplace=True)
away_subset.rename(columns={'Team': 'away_team'}, inplace=True)
away_key = ['country', 'season', 'date', 'away_team', 'is_home']
away_feats = [col for col in away_subset.columns if col not in away_key and
              (col.startswith("Overall_") or col.startswith("Away_") or
               col.startswith("SeasonPct_Over_") or col.startswith("Rolling5Pct_Over_"))]
away_features = away_subset[away_key + away_feats].copy()
def clean_away_name(col):
    return "away_" + (col[len("Away_"):] if col.startswith("Away_") else col)
away_features.rename(columns={col: clean_away_name(col) for col in away_feats}, inplace=True)

# =============================================================================
# 5. Merge Home and Away Features into the Match-Level DataFrame
# =============================================================================
# Start with the original match data.
match_merge_df = data.copy()
# Merge home features.
match_merge_df = match_merge_df.merge(home_features, on=['country', 'season', 'date', 'home_team'], how='left')
# Merge away features.
match_merge_df = match_merge_df.merge(away_features, on=['country', 'season', 'date', 'away_team'], how='left')

# (Optional) Display a sample.
#print(match_merge_df.head())


  team_df = team_df.groupby(['country', 'season', 'Team'], group_keys=False).apply(add_rolling_features_split).reset_index(drop=True)


In [7]:
# match_merge_df_filter = match_merge_df[match_merge_df['away_team'] == "Chelsea"]
# match_merge_df_filter

In [8]:
## -----------------------------
# 1. Process Home-Team Features (with clean naming)
# -----------------------------
home_subset = team_df[team_df['is_home'] == 1].copy()
home_subset = home_subset.drop(columns=['Opponent'])
home_subset.rename(columns={'Team': 'home_team'}, inplace=True)

# Key columns that remain unchanged
home_key_cols = ['country', 'season', 'date', 'home_team', 'is_home']

# Update the feature column selection to include the merged outcome columns.
home_feature_cols = [col for col in home_subset.columns
                     if col not in home_key_cols and
                     (col.startswith("Overall_") or
                      col.startswith("Home_") or
                      col.startswith("SeasonPct_Over_") or
                      col.startswith("Rolling5Pct_Over_"))]

# Create a DataFrame with key columns and desired features
home_features = home_subset[home_key_cols + home_feature_cols].copy()

# Function to clean column names by removing any existing "Home_" prefix
def clean_home_name(col):
    if col.startswith("Home_"):
        col = col[len("Home_"):]
    return "home_" + col

# Build a renaming dictionary for home features
rename_mapping_home = {col: clean_home_name(col) for col in home_feature_cols}
home_features.rename(columns=rename_mapping_home, inplace=True)


# -----------------------------
# 2. Process Away-Team Features (with clean naming)
# -----------------------------
away_subset = team_df[team_df['is_home'] == 0].copy()
away_subset = away_subset.drop(columns=['Opponent'])
away_subset.rename(columns={'Team': 'away_team'}, inplace=True)

# Key columns that remain unchanged
away_key_cols = ['country', 'season', 'date', 'away_team', 'is_home']

# Update the feature column selection to include the merged outcome columns.
away_feature_cols = [col for col in away_subset.columns
                     if col not in away_key_cols and
                     (col.startswith("Overall_") or
                      col.startswith("Away_") or
                      col.startswith("SeasonPct_Over_") or
                      col.startswith("Rolling5Pct_Over_"))]

# Create a DataFrame with key columns and desired features
away_features = away_subset[away_key_cols + away_feature_cols].copy()

# Function to clean column names by removing any existing "Away_" prefix
def clean_away_name(col):
    if col.startswith("Away_"):
        col = col[len("Away_"):]
    return "away_" + col

# Build a renaming dictionary for away features
rename_mapping_away = {col: clean_away_name(col) for col in away_feature_cols}
away_features.rename(columns=rename_mapping_away, inplace=True)


# -----------------------------
# 3. Merge Processed Home- and Away-Team Features Back into the Match-Level DataFrame
# -----------------------------
# Start with your original match-level data
match_df = data.copy()

# Merge home features on the common keys: country, season, date, and home_team.
match_df = match_df.merge(home_features, on=['country', 'season', 'date', 'home_team'], how='left')

#Merge away features on the common keys: country, season, date, and away_team.
match_df = match_df.merge(away_features, on=['country', 'season', 'date', 'away_team'], how='left')

# # match_df now contains cleanly named columns such as "home_Rolling_GoalsScored_Mean" along with your corners outcome features.
# print(match_df.head())


In [9]:
# team_filter2 = match_df[(match_df["home_team"]=="Chelsea") | (match_df["away_team"]=="Chelsea")]
# team_filter2

In [10]:
#match_df

In [11]:
features = [
    # 'Unnamed: 0',
    # 'country',
    # 'season',
    # 'date',
    # 'ko_time',
    'round',
    # 'home_team',
    # 'away_team',
    # 'home_goals_ft',
    # 'away_goals_ft',
    # 'home_goals_ht',
    # 'away_goals_ht',
    'home_team_place_total',
    'home_team_place_home',
    'away_team_place_total',
    'away_team_place_away',
    'home_odds',
    'draw_odds',
    'away_odds',
    'over_25_odds',
    'under_25_odds',
    'elo_home',
    'elo_away',
    'form_home',
    'form_away',
    # 'shots_home',
    # 'shots_home_1h',
    # 'shots_home_2h',
    # 'shots_away',
    # 'shots_away_1h',
    # 'shots_away_2h',
    # 'shots_on_target_home',
    # 'shots_on_target_home_1h',
    # 'shots_on_target_home_2h',
    # 'shots_on_target_away',
    # 'shots_on_target_away_1h',
    # 'shots_on_target_away_2h',
    # 'corners_home',
    # 'corners_home_1h',
    # 'corners_home_2h',
    # 'corners_away',
    # 'corners_away_1h',
    # 'corners_away_2h',
    # 'fouls_home',
    # 'fouls_home_1h',
    # 'fouls_home_2h',
    # 'fouls_away',
    # 'fouls_away_1h',
    # 'fouls_away_2h',
    # 'yellow_cards_home',
    # 'yellow_cards_home_1h',
    # 'yellow_cards_home_2h',
    # 'yellow_cards_away',
    # 'yellow_cards_away_1h',
    # 'yellow_cards_away_2h',
    # 'possession_home',
    # 'possession_home_1h',
    # 'possession_home_2h',
    # 'possession_away',
    # 'possession_away_1h',
    # 'possession_away_2h',
    # 'goals_scored_total_home',
    # 'goals_conceded_total_home',
    # 'goals_scored_total_away',
    # 'goals_conceded_total_away',
    # 'points_home',
    # 'points_away',
    # 'is_home_x',
    'home_Overall_Rolling_GoalsScored_Mean',
    'home_Overall_Rolling_GoalsScored_Std',
    'home_Overall_Rolling_GoalsScored_Mean_Short',
    'home_Overall_Momentum_GoalsScored',
    'home_Overall_Trend_Slope_GoalsScored',
    'home_Overall_Rolling_FirstHalfGoalsScored_Mean',
    'home_Overall_Rolling_FirstHalfGoalsScored_Std',
    'home_Overall_Rolling_FirstHalfGoalsScored_Mean_Short',
    'home_Overall_Momentum_FirstHalfGoalsScored',
    'home_Overall_Trend_Slope_FirstHalfGoalsScored',
    'home_Overall_Rolling_Shots_Mean',
    'home_Overall_Rolling_Shots_Std',
    'home_Overall_Rolling_Shots_Mean_Short',
    'home_Overall_Momentum_Shots',
    'home_Overall_Trend_Slope_Shots',
    'home_Overall_Rolling_Shots_1h_Mean',
    'home_Overall_Rolling_Shots_1h_Std',
    'home_Overall_Rolling_Shots_1h_Mean_Short',
    'home_Overall_Momentum_Shots_1h',
    'home_Overall_Trend_Slope_Shots_1h',
    'home_Overall_Rolling_Corners_Mean',
    'home_Overall_Rolling_Corners_Std',
    'home_Overall_Rolling_Corners_Mean_Short',
    'home_Overall_Momentum_Corners',
    'home_Overall_Trend_Slope_Corners',
    'home_Overall_Rolling_Corners_1h_Mean',
    'home_Overall_Rolling_Corners_1h_Std',
    'home_Overall_Rolling_Corners_1h_Mean_Short',
    'home_Overall_Momentum_Corners_1h',
    'home_Overall_Trend_Slope_Corners_1h',
    'home_Overall_Rolling_ShotsOnTarget_Mean',
    'home_Overall_Rolling_ShotsOnTarget_Std',
    'home_Overall_Rolling_ShotsOnTarget_Mean_Short',
    'home_Overall_Momentum_ShotsOnTarget',
    'home_Overall_Trend_Slope_ShotsOnTarget',
    'home_Overall_Rolling_ShotsOnTarget_1h_Mean',
    'home_Overall_Rolling_ShotsOnTarget_1h_Std',
    'home_Overall_Rolling_ShotsOnTarget_1h_Mean_Short',
    'home_Overall_Momentum_ShotsOnTarget_1h',
    'home_Overall_Trend_Slope_ShotsOnTarget_1h',
    'home_Rolling_GoalsScored_Mean',
    'home_Rolling_GoalsScored_Std',
    'home_Rolling_GoalsScored_Mean_Short',
    'home_Momentum_GoalsScored',
    'home_Trend_Slope_GoalsScored',
    'home_Rolling_FirstHalfGoalsScored_Mean',
    'home_Rolling_FirstHalfGoalsScored_Std',
    'home_Rolling_FirstHalfGoalsScored_Mean_Short',
    'home_Momentum_FirstHalfGoalsScored',
    'home_Trend_Slope_FirstHalfGoalsScored',
    'home_Rolling_Shots_Mean',
    'home_Rolling_Shots_Std',
    'home_Rolling_Shots_Mean_Short',
    'home_Momentum_Shots',
    'home_Trend_Slope_Shots',
    'home_Rolling_Shots_1h_Mean',
    'home_Rolling_Shots_1h_Std',
    'home_Rolling_Shots_1h_Mean_Short',
    'home_Momentum_Shots_1h',
    'home_Trend_Slope_Shots_1h',
    'home_Rolling_Corners_Mean',
    'home_Rolling_Corners_Std',
    'home_Rolling_Corners_Mean_Short',
    'home_Momentum_Corners',
    'home_Trend_Slope_Corners',
    'home_Rolling_Corners_1h_Mean',
    'home_Rolling_Corners_1h_Std',
    'home_Rolling_Corners_1h_Mean_Short',
    'home_Momentum_Corners_1h',
    'home_Trend_Slope_Corners_1h',
    'home_Rolling_ShotsOnTarget_Mean',
    'home_Rolling_ShotsOnTarget_Std',
    'home_Rolling_ShotsOnTarget_Mean_Short',
    'home_Momentum_ShotsOnTarget',
    'home_Trend_Slope_ShotsOnTarget',
    'home_Rolling_ShotsOnTarget_1h_Mean',
    'home_Rolling_ShotsOnTarget_1h_Std',
    'home_Rolling_ShotsOnTarget_1h_Mean_Short',
    'home_Momentum_ShotsOnTarget_1h',
    'home_Trend_Slope_ShotsOnTarget_1h',
    'home_Overall_Percent_Over_1.5',
    'home_Overall_Rolling5_Percent_Over_1.5',
    'home_Percent_Over_1.5',
    'home_Rolling5_Percent_Over_1.5',
    'home_Overall_Percent_Over_2.5',
    'home_Overall_Rolling5_Percent_Over_2.5',
    'home_Percent_Over_2.5',
    'home_Rolling5_Percent_Over_2.5',
    'home_Overall_Percent_Over_3.5',
    'home_Overall_Rolling5_Percent_Over_3.5',
    'home_Percent_Over_3.5',
    'home_Rolling5_Percent_Over_3.5',
    'home_TeamPct_Over_0.5',
    'home_TeamPct_Over_1.5',
    'home_TeamPct_Over_2.5',
    'home_TeamPct_Over_3.5',
    'home_CornersPct_Over_3.5',
    'home_CornersRolling5Pct_Over_3.5',
    'home_CornersPct_Over_4.5',
    'home_CornersRolling5Pct_Over_4.5',
    'home_CornersPct_Over_5.5',
    'home_CornersRolling5Pct_Over_5.5',
    'home_CornersPct_Over_6.5',
    'home_CornersRolling5Pct_Over_6.5',
    'home_SeasonPct_Over_9.5',
    'home_Rolling5Pct_Over_9.5',
    'home_SeasonPct_Over_10.5',
    'home_Rolling5Pct_Over_10.5',
    'home_SeasonPct_Over_11.5',
    'home_Rolling5Pct_Over_11.5',
    # 'is_home_y',
    'away_Overall_Rolling_GoalsScored_Mean',
    'away_Overall_Rolling_GoalsScored_Std',
    'away_Overall_Rolling_GoalsScored_Mean_Short',
    'away_Overall_Momentum_GoalsScored',
    'away_Overall_Trend_Slope_GoalsScored',
    'away_Overall_Rolling_FirstHalfGoalsScored_Mean',
    'away_Overall_Rolling_FirstHalfGoalsScored_Std',
    'away_Overall_Rolling_FirstHalfGoalsScored_Mean_Short',
    'away_Overall_Momentum_FirstHalfGoalsScored',
    'away_Overall_Trend_Slope_FirstHalfGoalsScored',
    'away_Overall_Rolling_Shots_Mean',
    'away_Overall_Rolling_Shots_Std',
    'away_Overall_Rolling_Shots_Mean_Short',
    'away_Overall_Momentum_Shots',
    'away_Overall_Trend_Slope_Shots',
    'away_Overall_Rolling_Shots_1h_Mean',
    'away_Overall_Rolling_Shots_1h_Std',
    'away_Overall_Rolling_Shots_1h_Mean_Short',
    'away_Overall_Momentum_Shots_1h',
    'away_Overall_Trend_Slope_Shots_1h',
    'away_Overall_Rolling_Corners_Mean',
    'away_Overall_Rolling_Corners_Std',
    'away_Overall_Rolling_Corners_Mean_Short',
    'away_Overall_Momentum_Corners',
    'away_Overall_Trend_Slope_Corners',
    'away_Overall_Rolling_Corners_1h_Mean',
    'away_Overall_Rolling_Corners_1h_Std',
    'away_Overall_Rolling_Corners_1h_Mean_Short',
    'away_Overall_Momentum_Corners_1h',
    'away_Overall_Trend_Slope_Corners_1h',
    'away_Overall_Rolling_ShotsOnTarget_Mean',
    'away_Overall_Rolling_ShotsOnTarget_Std',
    'away_Overall_Rolling_ShotsOnTarget_Mean_Short',
    'away_Overall_Momentum_ShotsOnTarget',
    'away_Overall_Trend_Slope_ShotsOnTarget',
    'away_Overall_Rolling_ShotsOnTarget_1h_Mean',
    'away_Overall_Rolling_ShotsOnTarget_1h_Std',
    'away_Overall_Rolling_ShotsOnTarget_1h_Mean_Short',
    'away_Overall_Momentum_ShotsOnTarget_1h',
    'away_Overall_Trend_Slope_ShotsOnTarget_1h',
    'away_Rolling_GoalsScored_Mean',
    'away_Rolling_GoalsScored_Std',
    'away_Rolling_GoalsScored_Mean_Short',
    'away_Momentum_GoalsScored',
    'away_Trend_Slope_GoalsScored',
    'away_Rolling_FirstHalfGoalsScored_Mean',
    'away_Rolling_FirstHalfGoalsScored_Std',
    'away_Rolling_FirstHalfGoalsScored_Mean_Short',
    'away_Momentum_FirstHalfGoalsScored',
    'away_Trend_Slope_FirstHalfGoalsScored',
    'away_Rolling_Shots_Mean',
    'away_Rolling_Shots_Std',
    'away_Rolling_Shots_Mean_Short',
    'away_Momentum_Shots',
    'away_Trend_Slope_Shots',
    'away_Rolling_Shots_1h_Mean',
    'away_Rolling_Shots_1h_Std',
    'away_Rolling_Shots_1h_Mean_Short',
    'away_Momentum_Shots_1h',
    'away_Trend_Slope_Shots_1h',
    'away_Rolling_Corners_Mean',
    'away_Rolling_Corners_Std',
    'away_Rolling_Corners_Mean_Short',
    'away_Momentum_Corners',
    'away_Trend_Slope_Corners',
    'away_Rolling_Corners_1h_Mean',
    'away_Rolling_Corners_1h_Std',
    'away_Rolling_Corners_1h_Mean_Short',
    'away_Momentum_Corners_1h',
    'away_Trend_Slope_Corners_1h',
    'away_Rolling_ShotsOnTarget_Mean',
    'away_Rolling_ShotsOnTarget_Std',
    'away_Rolling_ShotsOnTarget_Mean_Short',
    'away_Momentum_ShotsOnTarget',
    'away_Trend_Slope_ShotsOnTarget',
    'away_Rolling_ShotsOnTarget_1h_Mean',
    'away_Rolling_ShotsOnTarget_1h_Std',
    'away_Rolling_ShotsOnTarget_1h_Mean_Short',
    'away_Momentum_ShotsOnTarget_1h',
    'away_Trend_Slope_ShotsOnTarget_1h',
    'away_Overall_Percent_Over_1.5',
    'away_Overall_Rolling5_Percent_Over_1.5',
    'away_Percent_Over_1.5',
    'away_Rolling5_Percent_Over_1.5',
    'away_Overall_Percent_Over_2.5',
    'away_Overall_Rolling5_Percent_Over_2.5',
    'away_Percent_Over_2.5',
    'away_Rolling5_Percent_Over_2.5',
    'away_Overall_Percent_Over_3.5',
    'away_Overall_Rolling5_Percent_Over_3.5',
    'away_Percent_Over_3.5',
    'away_Rolling5_Percent_Over_3.5',
    'away_TeamPct_Over_0.5',
    'away_TeamPct_Over_1.5',
    'away_TeamPct_Over_2.5',
    'away_TeamPct_Over_3.5',
    'away_CornersPct_Over_3.5',
    'away_CornersRolling5Pct_Over_3.5',
    'away_CornersPct_Over_4.5',
    'away_CornersRolling5Pct_Over_4.5',
    'away_CornersPct_Over_5.5',
    'away_CornersRolling5Pct_Over_5.5',
    'away_CornersPct_Over_6.5',
    'away_CornersRolling5Pct_Over_6.5',
    'away_SeasonPct_Over_9.5',
    'away_Rolling5Pct_Over_9.5',
    'away_SeasonPct_Over_10.5',
    'away_Rolling5Pct_Over_10.5',
    'away_SeasonPct_Over_11.5',
    'away_Rolling5Pct_Over_11.5'
]

In [12]:
filtered_data = match_df[(match_df['date'].dt.date >= today) & (match_df['date'].dt.date <= end_period)]
filtered_data

Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,away_CornersPct_Over_5.5,away_CornersRolling5Pct_Over_5.5,away_CornersPct_Over_6.5,away_CornersRolling5Pct_Over_6.5,away_SeasonPct_Over_9.5,away_Rolling5Pct_Over_9.5,away_SeasonPct_Over_10.5,away_Rolling5Pct_Over_10.5,away_SeasonPct_Over_11.5,away_Rolling5Pct_Over_11.5
12225,Kor1,25,2025-05-23,1130,15,Anyang,Pohang Steelers,0,0,0,...,0.666667,0.80,0.666667,0.80,0.357143,0.6,0.357143,0.6,0.357143,0.6
12226,Ice1,25,2025-05-23,2030,8,KR Reykjavik,Fram,0,0,0,...,0.000000,0.00,0.000000,0.00,0.000000,0.0,0.000000,0.0,0.000000,0.0
12227,Chl1,25,2025-05-23,2300,12,Limache,U de Chile,0,0,0,...,0.250000,0.25,0.250000,0.25,0.300000,0.0,0.300000,0.0,0.300000,0.0
12228,Kor1,25,2025-05-23,1130,15,Jeju Utd,Jeonbuk,0,0,0,...,0.000000,0.00,0.000000,0.00,0.428571,0.6,0.428571,0.6,0.285714,0.4
12229,Ire1,25,2025-05-23,1945,17,Shelbourne,Sligo Rovers,0,0,0,...,0.250000,0.20,0.000000,0.00,0.500000,0.8,0.437500,0.6,0.312500,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12379,Spa1,25,2025-05-25,1700,38,Vallecano,Mallorca,0,0,0,...,0.055556,0.00,0.000000,0.00,0.432432,0.6,0.351351,0.6,0.270270,0.6
12380,Spa1,25,2025-05-25,1700,38,Real Madrid,Real Sociedad,0,0,0,...,0.333333,0.20,0.222222,0.00,0.621622,0.8,0.459459,0.6,0.378378,0.4
12381,Spa1,25,2025-05-25,1700,38,Villarreal,FC Sevilla,0,0,0,...,0.333333,0.20,0.111111,0.00,0.459459,0.2,0.351351,0.0,0.189189,0.0
12382,Spa2,25,2025-05-25,1700,41,Albacete,Ferrol,0,0,0,...,0.200000,0.20,0.050000,0.00,0.475000,0.4,0.225000,0.2,0.150000,0.2


In [13]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from joblib import load

# Assume that:
# 1. 'filtered_data' is your DataFrame filtered by dates.
# 2. 'features' is a list of your feature column names.
feature_columns = features  # e.g. features = ['feat1', 'feat2', 'feat3']

# Directory where the models are saved.
model_dir = r"/Goals/ARCHIVE/league/path_league/to/save/models"

# List to store prediction DataFrames per country.
predictions_list = []

# Iterate over each unique country in filtered_data.
for country in filtered_data['country'].unique():
    # Filter the dataframe for the current country.
    country_df = filtered_data[filtered_data['country'] == country].copy()

    # Construct the model filename.
    # Expected filename pattern: trained_model_('CountryName',).pkl
    model_filename = os.path.join(model_dir, f"trained_model_({repr(country)},).pkl")

    # If the model file does not exist, skip this country.
    if not os.path.exists(model_filename):
        print(f"Model file not found for country: {country}. Skipping.")
        continue

    # Load the model file using joblib.
    try:
        model_data = load(model_filename)
    except Exception as e:
        print(f"Error loading model file for {country}: {e}")
        continue

    # Debug: print the type of the loaded object.
    print(f"Country: {country} | Loaded model_data type: {type(model_data)}")

    # In your case the pickle files contain a dictionary.
    if isinstance(model_data, dict):
        # Extract the pipeline (i.e. your model) and threshold.
        pipeline = model_data.get('pipeline')
        threshold = model_data.get('threshold', 0.5)
        # Optionally you can also see the 'league' or 'smote_level' if needed:
        # league_info = model_data.get('league')
        # smote_level = model_data.get('smote_level')
        if pipeline is None:
            print(f"No pipeline found in model file for country: {country}. Skipping.")
            continue
    else:
        print(f"Model data for country {country} is not in the expected dictionary format. Skipping.")
        continue

    # Prepare the features for prediction.
    X = country_df[feature_columns]

    # Try to generate predictions.
    try:
        # If the model has predict_proba, use it to create probability predictions.
        if hasattr(pipeline, "predict_proba"):
            proba = pipeline.predict_proba(X)[:, 1]  # assuming binary classification; take the positive class probability.
            preds = (proba >= threshold).astype(int)
            country_df['prediction_proba'] = proba
        else:
            preds = pipeline.predict(X)
    except Exception as e:
        print(f"Error making predictions for country {country}: {e}")
        continue

    # Add the prediction column to the DataFrame.
    country_df['prediction'] = preds
    predictions_list.append(country_df)

# Merge predictions from all countries into a single DataFrame.
if predictions_list:
    all_predictions = pd.concat(predictions_list, ignore_index=True)
    print(all_predictions.head())
else:
    print("No predictions to merge.")


Country: Kor1 | Loaded model_data type: <class 'dict'>
Country: Ice1 | Loaded model_data type: <class 'dict'>
Model file not found for country: Chl1. Skipping.
Model file not found for country: Ire1. Skipping.
Country: Slo1 | Loaded model_data type: <class 'dict'>
Country: Jap1 | Loaded model_data type: <class 'dict'>
Model file not found for country: USA1. Skipping.
Country: Nor1 | Loaded model_data type: <class 'dict'>
Country: Swe1 | Loaded model_data type: <class 'dict'>
Country: Bra1 | Loaded model_data type: <class 'dict'>
Country: Swe2 | Loaded model_data type: <class 'dict'>
Country: Aus2 | Loaded model_data type: <class 'dict'>
Country: Cro1 | Loaded model_data type: <class 'dict'>
Country: Hun1 | Loaded model_data type: <class 'dict'>
Country: Pol1 | Loaded model_data type: <class 'dict'>
Country: Jap2 | Loaded model_data type: <class 'dict'>
Country: Tur1 | Loaded model_data type: <class 'dict'>
Country: Ita1 | Loaded model_data type: <class 'dict'>
Country: Eng1 | Loaded mo

In [14]:
all_predictions

Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,away_CornersPct_Over_6.5,away_CornersRolling5Pct_Over_6.5,away_SeasonPct_Over_9.5,away_Rolling5Pct_Over_9.5,away_SeasonPct_Over_10.5,away_Rolling5Pct_Over_10.5,away_SeasonPct_Over_11.5,away_Rolling5Pct_Over_11.5,prediction_proba,prediction
0,Kor1,25,2025-05-23,1130,15,Anyang,Pohang Steelers,0,0,0,...,0.666667,0.8,0.357143,0.6,0.357143,0.6,0.357143,0.6,0.962706,1
1,Kor1,25,2025-05-23,1130,15,Jeju Utd,Jeonbuk,0,0,0,...,0.000000,0.0,0.428571,0.6,0.428571,0.6,0.285714,0.4,0.845934,1
2,Kor1,25,2025-05-24,830,15,Seoul,Suwon City,0,0,0,...,0.142857,0.2,0.428571,0.4,0.357143,0.4,0.214286,0.4,0.954205,1
3,Kor1,25,2025-05-24,1100,15,Daejeon Citizen,Daegu,0,0,0,...,0.000000,0.0,0.500000,0.6,0.500000,0.6,0.500000,0.6,0.869373,1
4,Kor1,25,2025-05-24,1100,15,Ulsan,Gimcheon Sangmu,0,0,0,...,0.375000,0.6,0.428571,0.6,0.285714,0.4,0.071429,0.2,0.951107,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,Spa2,25,2025-05-25,1700,41,Eibar,Cordoba,0,0,0,...,0.250000,0.2,0.500000,0.6,0.325000,0.6,0.250000,0.4,0.562622,0
130,Spa2,25,2025-05-25,1700,41,Eldense,Santander,0,0,0,...,0.200000,0.6,0.525000,1.0,0.375000,0.4,0.325000,0.2,0.722292,0
131,Spa2,25,2025-05-25,1700,41,Burgos CF,Levante,0,0,0,...,0.150000,0.0,0.650000,0.4,0.300000,0.4,0.275000,0.4,0.779850,1
132,Spa2,25,2025-05-25,1700,41,Cadiz,Huesca,0,0,0,...,0.100000,0.2,0.525000,0.6,0.300000,0.4,0.225000,0.4,0.556503,0


In [15]:
positive_pred = all_predictions[all_predictions['prediction']==1]
#positive_pred

In [16]:
import function_library as fl
import importlib
importlib.reload(fl)

positive_pred = fl.team_name_map(positive_pred)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[home_col] = dataframe[home_col].map(mapping_dict).fillna(dataframe[home_col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[away_col] = dataframe[away_col].map(mapping_dict).fillna(dataframe[away_col])


In [17]:
positive_pred

Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,away_CornersPct_Over_6.5,away_CornersRolling5Pct_Over_6.5,away_SeasonPct_Over_9.5,away_Rolling5Pct_Over_9.5,away_SeasonPct_Over_10.5,away_Rolling5Pct_Over_10.5,away_SeasonPct_Over_11.5,away_Rolling5Pct_Over_11.5,prediction_proba,prediction
0,Kor1,25,2025-05-23,1130,15,FC Anyang,Pohang Steelers,0,0,0,...,0.666667,0.8,0.357143,0.6,0.357143,0.6,0.357143,0.6,0.962706,1
1,Kor1,25,2025-05-23,1130,15,Jeju Utd,Jeonbuk Motors,0,0,0,...,0.0,0.0,0.428571,0.6,0.428571,0.6,0.285714,0.4,0.845934,1
2,Kor1,25,2025-05-24,830,15,FC Seoul,Suwon FC,0,0,0,...,0.142857,0.2,0.428571,0.4,0.357143,0.4,0.214286,0.4,0.954205,1
3,Kor1,25,2025-05-24,1100,15,Daejeon Citizen,Daegu FC,0,0,0,...,0.0,0.0,0.5,0.6,0.5,0.6,0.5,0.6,0.869373,1
4,Kor1,25,2025-05-24,1100,15,Ulsan Hyundai Horang-i,Gimcheon Sangmu,0,0,0,...,0.375,0.6,0.428571,0.6,0.285714,0.4,0.071429,0.2,0.951107,1
6,Ice1,25,2025-05-23,2030,8,KR Reykjavik,Fram,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.899238,1
8,Ice1,25,2025-05-24,1800,8,Valur,IBV,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.716833,1
9,Ice1,25,2025-05-24,2015,8,IF Vestri,Stjarnan,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.705333,1
10,Ice1,25,2025-05-24,2015,8,Vikingur Reykjavik,IA Akranes,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.827333,1
11,Ice1,25,2025-05-25,2015,8,Hafnarfjordur,Breidablik,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.740667,1


In [18]:
fl.create_import_file(positive_pred, r"C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_05.csv", provider = "second_half_goal", market_name="Over/Under 0.5 Goals", selection_name="Under 0.5 Goals")

fl.create_import_file(positive_pred, r"C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_15.csv", provider = "second_half_goal", market_name="Over/Under 1.5 Goals", selection_name="Under 1.5 Goals")

fl.create_import_file(positive_pred, r"C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_25.csv", provider = "second_half_goal", market_name="Over/Under 2.5 Goals", selection_name="Under 2.5 Goals")

fl.create_import_file(positive_pred, r"C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_35.csv", provider = "second_half_goal", market_name="Over/Under 3.5 Goals", selection_name="Under 3.5 Goals")

fl.create_import_file(positive_pred, r"C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_45.csv", provider = "second_half_goal", market_name="Over/Under 4.5 Goals", selection_name="Under 4.5 Goals")

fl.create_import_file(positive_pred, r"C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_55.csv", provider = "second_half_goal", market_name="Over/Under 5.5 Goals", selection_name="Under 5.5 Goals")

fl.create_import_file(positive_pred, r"C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_65.csv", provider = "second_half_goal", market_name="Over/Under 6.5 Goals", selection_name="Under 6.5 Goals")

File created and saved successfully at: C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_05.csv
File created and saved successfully at: C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_15.csv
File created and saved successfully at: C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_25.csv
File created and saved successfully at: C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_35.csv
File created and saved successfully at: C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_45.csv
File created and saved successfully at: C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_55.csv
File created and saved successfully at: C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_65.csv


In [19]:
# import pandas as pd
# import glob
# import os
#
# # Directory where your import CSV files are saved.
# input_dir = r"C:\Users\leere\OneDrive\Desktop\IMPORTS"
#
# # Define the output file path_league.
# output_file = os.path.join(input_dir, "2H_GOAL_LEAGUE_ALL.csv")
#
# # Define a pattern to match your CSV files that start with '2H_GOAL_'.
# csv_files = glob.glob(os.path.join(input_dir, "2H_GOAL_LEAGUE*.csv"))
#
# # Exclude the merged file if it exists.
# csv_files = [f for f in csv_files if os.path.basename(f) != "2H_GOAL_LEAGUE_ALL.csv"]
#
# print("Found files to merge:", csv_files)
#
# # List to hold dataframes.
# dfs = []
#
# # Iterate through the file paths and read each CSV.
# for file in csv_files:
#     df = pd.read_csv(file)
#     dfs.append(df)
#
# # Concatenate all data frames into one.
# merged_df = pd.concat(dfs, ignore_index=True)
#
# # Save (and thus overwrite) the merged DataFrame to a new CSV file.
# merged_df.to_csv(output_file, index=False)
#
# print(f"Merged file saved to {output_file}")

import pandas as pd
import glob
import os

# Directory where your import CSV files are saved.
input_dir = r"C:\Users\leere\OneDrive\Desktop\IMPORTS"

# Define the output file path for the merged league file.
output_file = os.path.join(input_dir, "2H_GOAL_LEAGUE_ALL.csv")

# Find all CSVs starting with '2H_GOAL_LEAGUE'
csv_files = glob.glob(os.path.join(input_dir, "2H_GOAL_LEAGUE*.csv"))

# Exclude the merged file if it already exists
csv_files = [f for f in csv_files if os.path.basename(f) != "2H_GOAL_LEAGUE_ALL.csv"]

print("Found files to merge:", csv_files)

# Read and accumulate
dfs = [pd.read_csv(f) for f in csv_files]

# Concatenate into one DataFrame
merged_df = pd.concat(dfs, ignore_index=True)

# Save (and thus overwrite if it exists) the merged DataFrame
merged_df.to_csv(output_file, index=False)
print(f"Merged file saved to {output_file}")

# Now remove the original files
for f in csv_files:
    try:
        os.remove(f)
        print(f"Deleted source file: {f}")
    except OSError as e:
        print(f"Could not delete {f}: {e}")


Found files to merge: ['C:\\Users\\leere\\OneDrive\\Desktop\\IMPORTS\\2H_GOAL_LEAGUE_05.csv', 'C:\\Users\\leere\\OneDrive\\Desktop\\IMPORTS\\2H_GOAL_LEAGUE_15.csv', 'C:\\Users\\leere\\OneDrive\\Desktop\\IMPORTS\\2H_GOAL_LEAGUE_25.csv', 'C:\\Users\\leere\\OneDrive\\Desktop\\IMPORTS\\2H_GOAL_LEAGUE_35.csv', 'C:\\Users\\leere\\OneDrive\\Desktop\\IMPORTS\\2H_GOAL_LEAGUE_45.csv', 'C:\\Users\\leere\\OneDrive\\Desktop\\IMPORTS\\2H_GOAL_LEAGUE_55.csv', 'C:\\Users\\leere\\OneDrive\\Desktop\\IMPORTS\\2H_GOAL_LEAGUE_65.csv']
Merged file saved to C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_ALL.csv
Deleted source file: C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_05.csv
Deleted source file: C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_15.csv
Deleted source file: C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_25.csv
Deleted source file: C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_GOAL_LEAGUE_35.csv
Deleted source file: C:\Users\leere\OneDrive\Desktop\IMPORTS\2H_