In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load data
match_df = pd.read_csv('../data/cleaned/match_df_cleaned.csv')
team_df = pd.read_csv('../data/raw/Team.csv')

# Convert date to datetime
match_df['date'] = pd.to_datetime(match_df['date'])

# Filter matches before 2013 to avoid data leakage
train_match_df = match_df[match_df['date'] < '2013-01-01'].copy()

# Compute average player overall ratings
home_player_cols = [f'home_player_{i}_overall' for i in range(1, 12)]
away_player_cols = [f'away_player_{i}_overall' for i in range(1, 12)]
match_df['avg_home_player_overall'] = match_df[home_player_cols].mean(axis=1)
match_df['avg_away_player_overall'] = match_df[away_player_cols].mean(axis=1)
train_match_df['avg_home_player_overall'] = train_match_df[home_player_cols].mean(axis=1)
train_match_df['avg_away_player_overall'] = train_match_df[away_player_cols].mean(axis=1)

# Add is_home feature (1 for home team, 0 for away team)
# For training, we define it from the home team's perspective
train_match_df['is_home'] = 1  # All rows are home team perspective
# For away team perspective, create a mirrored dataset
train_match_df_away = train_match_df.copy()
train_match_df_away['is_home'] = 0
# Swap home and away features for away perspective
train_match_df_away = train_match_df_away.rename(columns={
    'home_team_api_id': 'away_team_api_id',
    'away_team_api_id': 'home_team_api_id',
    'home_team_goal': 'away_team_goal',
    'away_team_goal': 'home_team_goal',
    'avg_home_player_overall': 'avg_away_player_overall',
    'avg_away_player_overall': 'avg_home_player_overall',
    'prob_home': 'prob_away',
    'prob_away': 'prob_home',
    'betting_median_home': 'betting_median_away',
    'betting_median_away': 'betting_median_home'
})
# Combine home and away perspectives
train_match_df = pd.concat([train_match_df, train_match_df_away], ignore_index=True)

# Add recent form features
def calculate_form(df, team_id, date, n_matches=5):
    past_matches = df[
        ((df['home_team_api_id'] == team_id) | (df['away_team_api_id'] == team_id)) &
        (df['date'] < date)
    ].tail(n_matches)
    if len(past_matches) == 0:
        return 0, 0
    goals_scored = 0
    goals_conceded = 0
    for _, row in past_matches.iterrows():
        if row['home_team_api_id'] == team_id and row['is_home'] == 1:
            goals_scored += row['home_team_goal']
            goals_conceded += row['away_team_goal']
        elif row['away_team_api_id'] == team_id and row['is_home'] == 0:
            goals_scored += row['away_team_goal']
            goals_conceded += row['home_team_goal']
    return goals_scored / len(past_matches), goals_conceded / len(past_matches)

# Add form features to full dataset
match_df['Home_Form_Scored'] = 0.0
match_df['Home_Form_Conceded'] = 0.0
match_df['Away_Form_Scored'] = 0.0
match_df['Away_Form_Conceded'] = 0.0
match_df['is_home'] = match_df.apply(
    lambda row: 1 if row['home_team_api_id'] == row['home_team_api_id'] else 0, axis=1
)

for idx, row in match_df.iterrows():
    home_scored, home_conceded = calculate_form(match_df, row['home_team_api_id'], row['date'])
    away_scored, away_conceded = calculate_form(match_df, row['away_team_api_id'], row['date'])
    match_df.at[idx, 'Home_Form_Scored'] = home_scored
    match_df.at[idx, 'Home_Form_Conceded'] = home_conceded
    match_df.at[idx, 'Away_Form_Scored'] = away_scored
    match_df.at[idx, 'Away_Form_Conceded'] = away_conceded

# Update training dataset
train_match_df = pd.concat([
    match_df[match_df['date'] < '2014-31-12'].copy(),
    train_match_df_away
], ignore_index=True)

# Get betting odds columns
home_cols = [col for col in train_match_df.columns if col.endswith('H')]
draw_cols = [col for col in train_match_df.columns if col.endswith('D')]
away_cols = [col for col in train_match_df.columns if col.endswith('A')]

# Drop rows with missing odds or player overalls
train_match_df.dropna(subset=home_cols + draw_cols + away_cols + home_player_cols + away_player_cols, how='any', inplace=True)

# Create team name mapping
team_dict = team_df.set_index('team_api_id')['team_long_name'].to_dict()

# Prepare features and targets
features = [
    'prob_home', 'prob_draw', 'prob_away',
    'betting_median_home', 'betting_median_draw', 'betting_median_away',
    'Home_Form_Scored', 'Home_Form_Conceded', 'Away_Form_Scored', 'Away_Form_Conceded',
    'avg_home_player_overall', 'avg_away_player_overall', 'is_home'
]
target_team_goals = 'home_team_goal'  # For is_home=1, predicts home goals; for is_home=0, predicts away goals

# Split the pre-2013 dataset
train_df, val_df = train_test_split(train_match_df, test_size=2/3, random_state=42)

# Train a single model for team goals (home or away based on is_home)
model_team_goals = RandomForestRegressor(n_estimators=100, random_state=42)
model_team_goals.fit(train_df[features], train_df[target_team_goals])

# Evaluate on validation set
team_pred_val = np.round(model_team_goals.predict(val_df[features])).astype(int)
mae_team = mean_absolute_error(val_df[target_team_goals], team_pred_val)

print(f"Model trained on one-third of matches before 2013-01-01 ({len(train_df)} matches).")
print(f"Validation MAE (Team Goals): {mae_team:.2f}")

Model trained on one-third of matches before 2013-01-01 (1728 matches).
Validation MAE (Team Goals): 0.93


In [30]:
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Assuming 'match_df', 'team_df', 'model_team_goals', 'team_dict', and 'features' from Cell Block 1

# Prompt user for team and season
team_name = input("Enter the team name (e.g., Real Madrid CF): ")
start_date = input("Enter the start date of the season (YYYY-MM-DD): ")
end_date = input("Enter the end date of the season (YYYY-MM-DD): ")

# Validate team name
filtered_team = team_df[team_df['team_long_name'] == team_name]
if filtered_team.empty:
    raise ValueError(f"Team '{team_name}' not found. Check available names with team_df['team_long_name'].unique()")

# Get team ID
team_id = filtered_team['team_api_id'].values[0]

# Filter matches for the selected team and season
team_matches = match_df[
    ((match_df['home_team_api_id'] == team_id) | (match_df['away_team_api_id'] == team_id)) &
    (match_df['date'] >= start_date) & (match_df['date'] <= end_date)
].copy()

# Add Venue and is_home from the selected team's perspective
team_matches['Venue'] = team_matches.apply(
    lambda row: 'Home' if row['home_team_api_id'] == team_id else 'Away', axis=1
)
team_matches['is_home'] = team_matches['Venue'].map({'Home': 1, 'Away': 0})

# Add Opponent column
team_matches['Opponent'] = team_matches.apply(
    lambda row: team_dict.get(row['away_team_api_id']) if row['Venue'] == 'Home'
    else team_dict.get(row['home_team_api_id']), axis=1
)

# Adjust features for team perspective
team_matches['team_goals'] = team_matches.apply(
    lambda row: row['home_team_goal'] if row['Venue'] == 'Home' else row['away_team_goal'], axis=1
)
team_matches['opponent_goals'] = team_matches.apply(
    lambda row: row['away_team_goal'] if row['Venue'] == 'Home' else row['home_team_goal'], axis=1
)
team_matches['prob_team'] = team_matches.apply(
    lambda row: row['prob_home'] if row['Venue'] == 'Home' else row['prob_away'], axis=1
)
team_matches['prob_opponent'] = team_matches.apply(
    lambda row: row['prob_away'] if row['Venue'] == 'Home' else row['prob_home'], axis=1
)
team_matches['betting_median_team'] = team_matches.apply(
    lambda row: row['betting_median_home'] if row['Venue'] == 'Home' else row['betting_median_away'], axis=1
)
team_matches['betting_median_opponent'] = team_matches.apply(
    lambda row: row['betting_median_away'] if row['Venue'] == 'Home' else row['betting_median_home'], axis=1
)
team_matches['avg_team_player_overall'] = team_matches.apply(
    lambda row: row['avg_home_player_overall'] if row['Venue'] == 'Home' else row['avg_away_player_overall'], axis=1
)
team_matches['avg_opponent_player_overall'] = team_matches.apply(
    lambda row: row['avg_away_player_overall'] if row['Venue'] == 'Home' else row['avg_home_player_overall'], axis=1
)
team_matches['Team_Form_Scored'] = team_matches.apply(
    lambda row: row['Home_Form_Scored'] if row['Venue'] == 'Home' else row['Away_Form_Scored'], axis=1
)
team_matches['Team_Form_Conceded'] = team_matches.apply(
    lambda row: row['Home_Form_Conceded'] if row['Venue'] == 'Home' else row['Away_Form_Conceded'], axis=1
)
team_matches['Opponent_Form_Scored'] = team_matches.apply(
    lambda row: row['Away_Form_Scored'] if row['Venue'] == 'Home' else row['Home_Form_Scored'], axis=1
)
team_matches['Opponent_Form_Conceded'] = team_matches.apply(
    lambda row: row['Away_Form_Conceded'] if row['Venue'] == 'Home' else row['Home_Form_Conceded'], axis=1
)

# Prepare features for prediction
X_team = team_matches[[
    'prob_team', 'prob_draw', 'prob_opponent',
    'betting_median_team', 'betting_median_draw', 'betting_median_opponent',
    'Team_Form_Scored', 'Team_Form_Conceded', 'Opponent_Form_Scored', 'Opponent_Form_Conceded',
    'avg_team_player_overall', 'avg_opponent_player_overall', 'is_home'
]].rename(columns={
    'prob_team': 'prob_home',
    'prob_opponent': 'prob_away',
    'betting_median_team': 'betting_median_home',
    'betting_median_opponent': 'betting_median_away',
    'avg_team_player_overall': 'avg_home_player_overall',
    'avg_opponent_player_overall': 'avg_away_player_overall',
    'Team_Form_Scored': 'Home_Form_Scored',
    'Team_Form_Conceded': 'Home_Form_Conceded',
    'Opponent_Form_Scored': 'Away_Form_Scored',
    'Opponent_Form_Conceded': 'Away_Form_Conceded'
})

# Predict team goals
team_pred = np.round(model_team_goals.predict(X_team)).astype(int)

# Since we only predict team goals, assume opponent goals are mirrored (simplified approach)
# For better accuracy, consider training a separate model for opponent goals
team_matches['Predicted_Team_Goals'] = team_pred
team_matches['Predicted_Opponent_Goals'] = team_matches.apply(
    lambda row: np.round(model_team_goals.predict([[
        row['prob_opponent'], row['prob_draw'], row['prob_team'],
        row['betting_median_opponent'], row['betting_median_draw'], row['betting_median_team'],
        row['Opponent_Form_Scored'], row['Opponent_Form_Conceded'], row['Team_Form_Scored'], row['Team_Form_Conceded'],
        row['avg_opponent_player_overall'], row['avg_team_player_overall'], 1 if row['Venue'] == 'Away' else 0
    ]])[0]).astype(int), axis=1
)

# Actual goals
team_matches['Actual_Team_Goals'] = team_matches['team_goals']
team_matches['Actual_Opponent_Goals'] = team_matches['opponent_goals']

# Determine outcomes
def get_outcome(team_goals, opp_goals):
    if team_goals > opp_goals:
        return 'Win'
    elif team_goals == opp_goals:
        return 'Draw'
    else:
        return 'Loss'

team_matches['Predicted_Outcome'] = team_matches.apply(
    lambda row: get_outcome(row['Predicted_Team_Goals'], row['Predicted_Opponent_Goals']), axis=1
)
team_matches['Actual_Outcome'] = team_matches.apply(
    lambda row: get_outcome(row['Actual_Team_Goals'], row['Actual_Opponent_Goals']), axis=1
)

# Calculate points
def calculate_points(outcome):
    if outcome == 'Win':
        return 3
    elif outcome == 'Draw':
        return 1
    else:
        return 0

team_matches['Actual_Points'] = team_matches['Actual_Outcome'].apply(calculate_points)
team_matches['Predicted_Points'] = team_matches['Predicted_Outcome'].apply(calculate_points)

# Create tables
predictions_table = team_matches[['date', 'Opponent', 'Venue', 'Predicted_Team_Goals', 'Predicted_Opponent_Goals', 'Predicted_Outcome']]
predictions_table['Predicted_Score'] = predictions_table['Predicted_Team_Goals'].astype(str) + ' - ' + predictions_table['Predicted_Opponent_Goals'].astype(str)

actual_table = team_matches[['date', 'Opponent', 'Venue', 'Actual_Team_Goals', 'Actual_Opponent_Goals', 'Actual_Outcome']]
actual_table['Actual_Score'] = actual_table['Actual_Team_Goals'].astype(str) + ' - ' + actual_table['Actual_Opponent_Goals'].astype(str)

# Sort by date
predictions_table = predictions_table.sort_values(by='date').reset_index(drop=True)
actual_table = actual_table.sort_values(by='date').reset_index(drop=True)

# Format dates
predictions_table['date'] = pd.to_datetime(predictions_table['date']).dt.strftime('%Y-%m-%d')
actual_table['date'] = pd.to_datetime(actual_table['date']).dt.strftime('%Y-%m-%d')

# Display settings
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Display tables
print(f"\n=== Predictions for {team_name} from {start_date} to {end_date} ===")
display(predictions_table[['date', 'Opponent', 'Venue', 'Predicted_Score', 'Predicted_Outcome']])

print(f"\n=== Actual Results for {team_name} from {start_date} to {end_date} ===")
display(actual_table[['date', 'Opponent', 'Venue', 'Actual_Score', 'Actual_Outcome']])




=== Predictions for Real Madrid CF from 2015-08-01 to 2016-05-30 ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predictions_table['Predicted_Score'] = predictions_table['Predicted_Team_Goals'].astype(str) + ' - ' + predictions_table['Predicted_Opponent_Goals'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actual_table['Actual_Score'] = actual_table['Actual_Team_Goals'].astype(str) + ' - ' + actual_table['Actual_Opponent_Goals'].astype(str)


Unnamed: 0,date,Opponent,Venue,Predicted_Score,Predicted_Outcome
0,2015-08-23,Real Sporting de Gijón,Away,3 - 1,Win
1,2015-08-29,Real Betis Balompié,Home,4 - 1,Win
2,2015-09-12,RCD Espanyol,Away,3 - 1,Win
3,2015-09-19,Granada CF,Home,4 - 1,Win
4,2015-09-23,Athletic Club de Bilbao,Away,2 - 1,Win
5,2015-09-26,Málaga CF,Home,4 - 1,Win
6,2015-10-04,Atlético Madrid,Away,2 - 1,Win
7,2015-10-17,Levante UD,Home,4 - 1,Win
8,2015-10-24,RC Celta de Vigo,Away,2 - 2,Draw
9,2015-10-31,UD Las Palmas,Home,4 - 1,Win



=== Actual Results for Real Madrid CF from 2015-08-01 to 2016-05-30 ===


Unnamed: 0,date,Opponent,Venue,Actual_Score,Actual_Outcome
0,2015-08-23,Real Sporting de Gijón,Away,0 - 0,Draw
1,2015-08-29,Real Betis Balompié,Home,5 - 0,Win
2,2015-09-12,RCD Espanyol,Away,6 - 0,Win
3,2015-09-19,Granada CF,Home,1 - 0,Win
4,2015-09-23,Athletic Club de Bilbao,Away,2 - 1,Win
5,2015-09-26,Málaga CF,Home,0 - 0,Draw
6,2015-10-04,Atlético Madrid,Away,1 - 1,Draw
7,2015-10-17,Levante UD,Home,3 - 0,Win
8,2015-10-24,RC Celta de Vigo,Away,3 - 1,Win
9,2015-10-31,UD Las Palmas,Home,3 - 1,Win


In [31]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Assuming team_matches is a DataFrame with the required columns
# Calculate statistics
num_matches = len(team_matches)
actual_avg_team_goals = team_matches['Actual_Team_Goals'].mean()
actual_avg_opp_goals = team_matches['Actual_Opponent_Goals'].mean()
actual_record = team_matches['Actual_Outcome'].value_counts()
predicted_avg_team_goals = team_matches['Predicted_Team_Goals'].mean()
predicted_avg_opp_goals = team_matches['Predicted_Opponent_Goals'].mean()
predicted_record = team_matches['Predicted_Outcome'].value_counts()

# Calculate season scores
actual_points = team_matches['Actual_Outcome'].map({'Win': 3, 'Draw': 1, 'Loss': 0}).sum()
predicted_points = team_matches['Predicted_Outcome'].map({'Win': 3, 'Draw': 1, 'Loss': 0}).sum()

# Calculate metrics
mae_team = mean_absolute_error(team_matches['Actual_Team_Goals'], team_matches['Predicted_Team_Goals'])
mae_opp = mean_absolute_error(team_matches['Actual_Opponent_Goals'], team_matches['Predicted_Opponent_Goals'])
mse_team = mean_squared_error(team_matches['Actual_Team_Goals'], team_matches['Predicted_Team_Goals'])
mse_opp = mean_squared_error(team_matches['Actual_Opponent_Goals'], team_matches['Predicted_Opponent_Goals'])
rmse_team = np.sqrt(mse_team)
rmse_opp = np.sqrt(mse_opp)
r2_team = r2_score(team_matches['Actual_Team_Goals'], team_matches['Predicted_Team_Goals'])
r2_opp = r2_score(team_matches['Actual_Opponent_Goals'], team_matches['Predicted_Opponent_Goals'])

# Calculate outcome accuracy
correct_outcomes = (team_matches['Actual_Outcome'] == team_matches['Predicted_Outcome']).sum()
outcome_accuracy = (correct_outcomes / num_matches) * 100 if num_matches > 0 else 0

# Print statistics and metrics
print(f"\n=== Statistics for {team_name} from {start_date} to {end_date} ({num_matches} matches) ===")
print("\n-- Actual --")
print(f"Average Goals Scored by Team: {actual_avg_team_goals:.2f}")
print(f"Average Goals Conceded: {actual_avg_opp_goals:.2f}")
print(f"Record (W-D-L): {actual_record.get('Win', 0)}-{actual_record.get('Draw', 0)}-{actual_record.get('Loss', 0)}")
print(f"Season Score: {actual_points} points")
print("\n-- Predicted --")
print(f"Average Goals Scored by Team: {predicted_avg_team_goals:.2f}")
print(f"Average Goals Conceded: {predicted_avg_opp_goals:.2f}")
print(f"Record (W-D-L): {predicted_record.get('Win', 0)}-{predicted_record.get('Draw', 0)}-{predicted_record.get('Loss', 0)}")
print(f"Season Score: {predicted_points} points")
'''print("\n-- Model Performance Metrics --")
print(f"MAE (Team Goals): {mae_team:.2f}")
print(f"MAE (Opponent Goals): {mae_opp:.2f}")
print(f"MSE (Team Goals): {mse_team:.2f}")
print(f"MSE (Opponent Goals): {mse_opp:.2f}")
print(f"RMSE (Team Goals): {rmse_team:.2f}")
print(f"RMSE (Opponent Goals): {rmse_opp:.2f}")
print(f"R² (Team Goals): {r2_team:.2f}")
print(f"R² (Opponent Goals): {r2_opp:.2f}")'''
print("\n-- Model Performance Metrics --")
print(f"Outcome Accuracy: {outcome_accuracy:.2f}%")


=== Statistics for Real Madrid CF from 2015-08-01 to 2016-05-30 (38 matches) ===

-- Actual --
Average Goals Scored by Team: 2.89
Average Goals Conceded: 0.89
Record (W-D-L): 28-6-4
Season Score: 90 points

-- Predicted --
Average Goals Scored by Team: 2.84
Average Goals Conceded: 1.29
Record (W-D-L): 29-6-3
Season Score: 93 points

-- Model Performance Metrics --
Outcome Accuracy: 65.79%
