In [None]:
import pandas as pd
import numpy as np

# Set team and season (change these values as needed)
team_name = 'Córdoba CF'  # Replace with any team name from the dataset
start_date = '2014-08-01'   # Start date of the season
end_date = '2015-06-30'     # End date of the season

# Load data
match_df = pd.read_csv('../data/raw/Match.csv')
team_df = pd.read_csv('../data/raw/Team.csv')

# Convert date to datetime
match_df['date'] = pd.to_datetime(match_df['date'])

# Get the team_api_id for the selected team
team_id = team_df[team_df['team_long_name'] == team_name]['team_api_id'].values[0]

# Filter matches for the selected team within the date range
team_matches = match_df[
    ((match_df['home_team_api_id'] == team_id) | (match_df['away_team_api_id'] == team_id)) &
    (match_df['date'] >= start_date) & (match_df['date'] <= end_date)
].copy()

print(f"Total matches for {team_name} from {start_date} to {end_date}: {len(team_matches)}")

# Get all columns for home/draw/away odds
home_cols = [col for col in match_df.columns if col.endswith('H')]
draw_cols = [col for col in match_df.columns if col.endswith('D')]
away_cols = [col for col in match_df.columns if col.endswith('A')]

# Drop rows where all odds are missing
team_matches.dropna(subset=home_cols + draw_cols + away_cols, how='all', inplace=True)
print(f"Matches with at least some betting odds: {len(team_matches)}")

# Define match outcome from the team's perspective
def get_outcome(row):
    if row['home_team_api_id'] == team_id:
        if row['home_team_goal'] > row['away_team_goal']:
            return 'Win'
        elif row['home_team_goal'] == row['away_team_goal']:
            return 'Draw'
        else:
            return 'Loss'
    else:
        if row['away_team_goal'] > row['home_team_goal']:
            return 'Win'
        elif row['away_team_goal'] == row['home_team_goal']:
            return 'Draw'
        else:
            return 'Loss'

team_matches['outcome'] = team_matches.apply(get_outcome, axis=1)

# Calculate average odds across all sources
team_matches['Avg_Home_Odds'] = team_matches[home_cols].mean(axis=1)
team_matches['Avg_Draw_Odds'] = team_matches[draw_cols].mean(axis=1)
team_matches['Avg_Away_Odds'] = team_matches[away_cols].mean(axis=1)

# Set odds from the team's perspective
def get_team_avg_odds(row):
    if row['home_team_api_id'] == team_id:
        return pd.Series({
            'Team_Win_Odds': row['Avg_Home_Odds'],
            'Draw_Odds': row['Avg_Draw_Odds'],
            'Team_Loss_Odds': row['Avg_Away_Odds']
        })
    else:
        return pd.Series({
            'Team_Win_Odds': row['Avg_Away_Odds'],
            'Draw_Odds': row['Avg_Draw_Odds'],
            'Team_Loss_Odds': row['Avg_Home_Odds']
        })

team_matches = team_matches.join(team_matches.apply(get_team_avg_odds, axis=1))

# Add score column (format: "Home Goals - Away Goals")
team_matches['Score'] = team_matches['home_team_goal'].astype(str) + ' - ' + team_matches['away_team_goal'].astype(str)

# Add Venue column
team_matches['Venue'] = team_matches.apply(
    lambda row: 'Home' if row['home_team_api_id'] == team_id else 'Away', axis=1
)

# Add Opponent column
team_dict = team_df.set_index('team_api_id')['team_long_name'].to_dict()
team_matches['Opponent'] = team_matches.apply(
    lambda row: team_dict.get(row['away_team_api_id']) if row['home_team_api_id'] == team_id 
    else team_dict.get(row['home_team_api_id']), axis=1
)

# Create final summary table
summary = team_matches[['date', 'Opponent', 'Venue', 'Score', 'Team_Win_Odds', 'Draw_Odds', 'Team_Loss_Odds', 'outcome']]
summary = summary.sort_values(by='date').reset_index(drop=True)

# Set pandas display options to show all rows and columns without truncation
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Display the summary
print(f"{team_name} matches from {start_date} to {end_date}: {len(summary)}")

Total matches for Real Madrid CF from 2014-08-01 to 2015-06-30: 38
Matches with at least some betting odds: 38
Real Madrid CF matches from 2014-08-01 to 2015-06-30: 38


In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np
from IPython.display import display  # For Jupyter; falls back to print if unavailable

# Assuming 'summary' is the DataFrame from Cell Block 1 with columns: date, Opponent, Venue, Score, Team_Win_Odds, Draw_Odds, Team_Loss_Odds, outcome
season_df = summary.copy()

# Assign goals based on Venue
def get_goals(row):
    home_goals, away_goals = row['Score'].split(' - ')
    home_goals = int(home_goals.strip())
    away_goals = int(away_goals.strip())
    if row['Venue'] == 'Home':
        return pd.Series({'Team_Goals': home_goals, 'Opponent_Goals': away_goals})
    else:
        return pd.Series({'Team_Goals': away_goals, 'Opponent_Goals': home_goals})

season_df[['Team_Goals', 'Opponent_Goals']] = season_df.apply(get_goals, axis=1)

# Verify assignments (optional debugging)
print("Sample goal assignments:")
print(season_df[['date', 'Opponent', 'Venue', 'Score', 'Team_Goals', 'Opponent_Goals']].head())

# Features and targets
features = ['Team_Win_Odds', 'Draw_Odds', 'Team_Loss_Odds']
target_team = 'Team_Goals'
target_opp = 'Opponent_Goals'

# Train-test split
train_df, test_df = train_test_split(season_df, test_size=0.5, random_state=None)
X_train = train_df[features]
X_test = test_df[features]
y_train_team = train_df[target_team]
y_train_opp = train_df[target_opp]
y_test_team = test_df[target_team]
y_test_opp = test_df[target_opp]

# Train models
model_team = RandomForestRegressor(n_estimators=100, random_state=None)
model_opp = RandomForestRegressor(n_estimators=100, random_state=None)
model_team.fit(X_train, y_train_team)
model_opp.fit(X_train, y_train_opp)

# Predict and round
team_pred = np.round(model_team.predict(X_test)).astype(int)
opp_pred = np.round(model_opp.predict(X_test)).astype(int)

# Prepare results
test_results = test_df[['date', 'Opponent', 'Venue', 'Score']].copy()
test_results['Predicted_Score'] = team_pred.astype(str) + ' - ' + opp_pred.astype(str)
test_results['Actual_Team'] = y_test_team.values
test_results['Actual_Opponent'] = y_test_opp.values
test_results['Actual_Score'] = test_results['Actual_Team'].astype(str) + ' - ' + test_results['Actual_Opponent'].astype(str)

# Outcome function
def get_outcome(team_goals, opp_goals):
    if team_goals > opp_goals:
        return 'Win'
    elif team_goals == opp_goals:
        return 'Draw'
    else:
        return 'Loss'

# Calculate outcomes
test_results['Actual_Outcome'] = test_results.apply(
    lambda row: get_outcome(row['Actual_Team'], row['Actual_Opponent']), axis=1
)
test_results['Predicted_Outcome'] = test_results.apply(
    lambda row: get_outcome(
        team_pred[test_results.index.get_loc(row.name)],
        opp_pred[test_results.index.get_loc(row.name)]
    ), axis=1
)

# Calculate points
def calculate_points(outcome):
    if outcome == 'Win':
        return 3
    elif outcome == 'Draw':
        return 1
    else:
        return 0

test_results['Actual_Points'] = test_results['Actual_Outcome'].apply(calculate_points)
test_results['Predicted_Points'] = test_results['Predicted_Outcome'].apply(calculate_points)

# Sort and format
test_results = test_results.sort_values(by='date').reset_index(drop=True)
display_table = test_results[['date', 'Opponent', 'Venue', 'Predicted_Score', 'Actual_Score', 
                             'Predicted_Outcome', 'Actual_Outcome', 'Predicted_Points', 'Actual_Points']]
display_table['date'] = pd.to_datetime(display_table['date']).dt.strftime('%Y-%m-%d')

# Display settings
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Display results
print(f"\n=== Predicted vs Actual Scores for {team_name} from {start_date} to {end_date} ===")
try:
    display(display_table)
except:
    print(display_table.to_string(index=False))

# Metrics
mae_team = mean_absolute_error(y_test_team, team_pred)
mae_opp = mean_absolute_error(y_test_opp, opp_pred)
print(f"\nMean Absolute Error ({team_name} goals): {mae_team:.2f}")
print(f"Mean Absolute Error (Opponent goals): {mae_opp:.2f}")
print(f"Total Actual Points: {test_results['Actual_Points'].sum()}")
print(f"Total Predicted Points: {test_results['Predicted_Points'].sum()}")

Sample goal assignments:
        date                   Opponent Venue  Score  Team_Goals  \
0 2014-08-25                 Córdoba CF  Home  2 - 0           2   
1 2014-08-31              Real Sociedad  Away  4 - 2           2   
2 2014-09-13            Atlético Madrid  Home  1 - 2           1   
3 2014-09-20  RC Deportivo de La Coruña  Away  2 - 8           8   
4 2014-09-23                   Elche CF  Home  5 - 1           5   

   Opponent_Goals  
0               0  
1               4  
2               2  
3               2  
4               1  

=== Predicted vs Actual Scores for Real Madrid CF from 2014-08-01 to 2015-06-30 ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  display_table['date'] = pd.to_datetime(display_table['date']).dt.strftime('%Y-%m-%d')


Unnamed: 0,date,Opponent,Venue,Predicted_Score,Actual_Score,Predicted_Outcome,Actual_Outcome,Predicted_Points,Actual_Points
0,2014-08-25,Córdoba CF,Home,4 - 1,2 - 0,Win,Win,3,3
1,2014-09-20,RC Deportivo de La Coruña,Away,2 - 1,8 - 2,Win,Win,3,3
2,2014-11-01,Granada CF,Away,4 - 0,4 - 0,Win,Win,3,3
3,2014-11-29,Málaga CF,Away,2 - 2,2 - 1,Draw,Win,1,3
4,2014-12-06,RC Celta de Vigo,Home,4 - 1,3 - 0,Win,Win,3,3
5,2014-12-12,UD Almería,Away,3 - 1,4 - 1,Win,Win,3,3
6,2015-01-10,RCD Espanyol,Home,5 - 1,3 - 0,Win,Win,3,3
7,2015-01-18,Getafe CF,Away,3 - 0,3 - 0,Win,Win,3,3
8,2015-01-24,Córdoba CF,Away,4 - 1,2 - 1,Win,Win,3,3
9,2015-02-14,RC Deportivo de La Coruña,Home,5 - 1,2 - 0,Win,Win,3,3



Mean Absolute Error (Real Madrid CF goals): 1.74
Mean Absolute Error (Opponent goals): 0.63
Total Actual Points: 50
Total Predicted Points: 50
