In [26]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load data
match_df = pd.read_csv('../data/raw/Match.csv')
team_df = pd.read_csv('../data/raw/Team.csv')

# Convert date to datetime
match_df['date'] = pd.to_datetime(match_df['date'])

# Get all columns for home/draw/away odds
home_cols = [col for col in match_df.columns if col.endswith('H')]
draw_cols = [col for col in match_df.columns if col.endswith('D')]
away_cols = [col for col in match_df.columns if col.endswith('A')]

# Drop rows where all odds are missing
match_df.dropna(subset=home_cols + draw_cols + away_cols, how='all', inplace=True)

# Calculate average odds across all sources
match_df['Avg_Home_Odds'] = match_df[home_cols].mean(axis=1)
match_df['Avg_Draw_Odds'] = match_df[draw_cols].mean(axis=1)
match_df['Avg_Away_Odds'] = match_df[away_cols].mean(axis=1)

# Add score column (format: "Home Goals - Away Goals")
match_df['Score'] = match_df['home_team_goal'].astype(str) + ' - ' + match_df['away_team_goal'].astype(str)

# Add Venue column (from the perspective of the home team)
match_df['Venue'] = 'Home'

# Create a dictionary for team_api_id to team_long_name mapping
team_dict = team_df.set_index('team_api_id')['team_long_name'].to_dict()

# Add Opponent column
match_df['Opponent'] = match_df['away_team_api_id'].map(team_dict)

# Prepare features and targets
# Using average odds as features for simplicity
features = ['Avg_Home_Odds', 'Avg_Draw_Odds', 'Avg_Away_Odds']
target_home = 'home_team_goal'
target_away = 'away_team_goal'

# Split the entire dataset into training and testing sets (1/3 for training)
train_df, test_df = train_test_split(match_df, test_size=1/3, random_state=42)

# Train models for home and away goals
model_home = RandomForestRegressor(n_estimators=100, random_state=42)
model_away = RandomForestRegressor(n_estimators=100, random_state=42)

model_home.fit(train_df[features], train_df[target_home])
model_away.fit(train_df[features], train_df[target_away])

print("Model trained on one-third of the entire dataset.")

Model trained on one-third of the entire dataset.


In [27]:
import pandas as pd
import numpy as np
from IPython.display import display

# Assuming 'match_df', 'team_df', 'model_home', 'model_away', and 'team_dict' are available from Cell Block 1

# Prompt user to choose a team and season
team_name = input("Enter the team name: ")
start_date = input("Enter the start date of the season (YYYY-MM-DD): ")
end_date = input("Enter the end date of the season (YYYY-MM-DD): ")

# Get the team_api_id for the selected team
team_id = team_df[team_df['team_long_name'] == team_name]['team_api_id'].values[0]

# Filter matches for the selected team within the date range
team_matches = match_df[
    ((match_df['home_team_api_id'] == team_id) | (match_df['away_team_api_id'] == team_id)) &
    (match_df['date'] >= start_date) & (match_df['date'] <= end_date)
].copy()

# Add Venue column from the team's perspective
team_matches['Venue'] = team_matches.apply(
    lambda row: 'Home' if row['home_team_api_id'] == team_id else 'Away', axis=1
)

# Add Opponent column
team_matches['Opponent'] = team_matches.apply(
    lambda row: team_dict.get(row['away_team_api_id']) if row['home_team_api_id'] == team_id 
    else team_dict.get(row['home_team_api_id']), axis=1
)

# Prepare features for prediction
X_team = team_matches[features]

# Predict home and away goals
home_pred = np.round(model_home.predict(X_team)).astype(int)
away_pred = np.round(model_away.predict(X_team)).astype(int)

# Adjust predictions based on the team's venue
team_matches['Predicted_Team_Goals'] = team_matches.apply(
    lambda row: home_pred[team_matches.index.get_loc(row.name)] if row['Venue'] == 'Home' 
    else away_pred[team_matches.index.get_loc(row.name)], axis=1
)
team_matches['Predicted_Opponent_Goals'] = team_matches.apply(
    lambda row: away_pred[team_matches.index.get_loc(row.name)] if row['Venue'] == 'Home' 
    else home_pred[team_matches.index.get_loc(row.name)], axis=1
)

# Actual goals
team_matches['Actual_Team_Goals'] = team_matches.apply(
    lambda row: row['home_team_goal'] if row['Venue'] == 'Home' else row['away_team_goal'], axis=1
)
team_matches['Actual_Opponent_Goals'] = team_matches.apply(
    lambda row: row['away_team_goal'] if row['Venue'] == 'Home' else row['home_team_goal'], axis=1
)

# Function to determine outcome
def get_outcome(team_goals, opp_goals):
    if team_goals > opp_goals:
        return 'Win'
    elif team_goals == opp_goals:
        return 'Draw'
    else:
        return 'Loss'

# Predicted and actual outcomes
team_matches['Predicted_Outcome'] = team_matches.apply(
    lambda row: get_outcome(row['Predicted_Team_Goals'], row['Predicted_Opponent_Goals']), axis=1
)
team_matches['Actual_Outcome'] = team_matches.apply(
    lambda row: get_outcome(row['Actual_Team_Goals'], row['Actual_Opponent_Goals']), axis=1
)

# Create predictions table
predictions_table = team_matches[['date', 'Opponent', 'Venue', 'Predicted_Team_Goals', 'Predicted_Opponent_Goals', 'Predicted_Outcome']]
predictions_table['Predicted_Score'] = predictions_table['Predicted_Team_Goals'].astype(str) + ' - ' + predictions_table['Predicted_Opponent_Goals'].astype(str)

# Create actual results table
actual_table = team_matches[['date', 'Opponent', 'Venue', 'Actual_Team_Goals', 'Actual_Opponent_Goals', 'Actual_Outcome']]
actual_table['Actual_Score'] = actual_table['Actual_Team_Goals'].astype(str) + ' - ' + actual_table['Actual_Opponent_Goals'].astype(str)

# Sort both tables by date
predictions_table = predictions_table.sort_values(by='date').reset_index(drop=True)
actual_table = actual_table.sort_values(by='date').reset_index(drop=True)

# Display the tables
print(f"\n=== Predictions for {team_name} from {start_date} to {end_date} ===")
display(predictions_table[['date', 'Opponent', 'Venue', 'Predicted_Score', 'Predicted_Outcome']])

print(f"\n=== Actual Results for {team_name} from {start_date} to {end_date} ===")
display(actual_table[['date', 'Opponent', 'Venue', 'Actual_Score', 'Actual_Outcome']])


=== Predictions for Leicester City from 2015-08-08 to 2016-05-09 ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predictions_table['Predicted_Score'] = predictions_table['Predicted_Team_Goals'].astype(str) + ' - ' + predictions_table['Predicted_Opponent_Goals'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actual_table['Actual_Score'] = actual_table['Actual_Team_Goals'].astype(str) + ' - ' + actual_table['Actual_Opponent_Goals'].astype(str)


Unnamed: 0,date,Opponent,Venue,Predicted_Score,Predicted_Outcome
0,2015-08-08,Sunderland,Home,3 - 2,Win
1,2015-08-15,West Ham United,Away,2 - 1,Win
2,2015-08-22,Tottenham Hotspur,Home,1 - 1,Draw
3,2015-08-29,Bournemouth,Away,1 - 1,Draw
4,2015-09-13,Aston Villa,Home,1 - 1,Draw
5,2015-09-19,Stoke City,Away,2 - 1,Win
6,2015-09-26,Arsenal,Home,2 - 4,Loss
7,2015-10-03,Norwich City,Away,2 - 1,Win
8,2015-10-17,Southampton,Away,1 - 1,Draw
9,2015-10-24,Crystal Palace,Home,1 - 1,Draw



=== Actual Results for Leicester City from 2015-08-08 to 2016-05-09 ===


Unnamed: 0,date,Opponent,Venue,Actual_Score,Actual_Outcome
0,2015-08-08,Sunderland,Home,4 - 2,Win
1,2015-08-15,West Ham United,Away,2 - 1,Win
2,2015-08-22,Tottenham Hotspur,Home,1 - 1,Draw
3,2015-08-29,Bournemouth,Away,1 - 1,Draw
4,2015-09-13,Aston Villa,Home,3 - 2,Win
5,2015-09-19,Stoke City,Away,2 - 2,Draw
6,2015-09-26,Arsenal,Home,2 - 5,Loss
7,2015-10-03,Norwich City,Away,2 - 1,Win
8,2015-10-17,Southampton,Away,2 - 2,Draw
9,2015-10-24,Crystal Palace,Home,1 - 0,Win


In [29]:
# Calculate statistics
num_matches = len(team_matches)
actual_avg_team_goals = team_matches['Actual_Team_Goals'].mean()
actual_avg_opp_goals = team_matches['Actual_Opponent_Goals'].mean()
actual_record = team_matches['Actual_Outcome'].value_counts()
predicted_avg_team_goals = team_matches['Predicted_Team_Goals'].mean()
predicted_avg_opp_goals = team_matches['Predicted_Opponent_Goals'].mean()
predicted_record = team_matches['Predicted_Outcome'].value_counts()

# Print statistics
print(f"\n=== Statistics for {team_name} from {start_date} to {end_date} ({num_matches} matches) ===")
print("\n-- Actual --")
print(f"Average Goals Scored by Team: {actual_avg_team_goals:.2f}")
print(f"Average Goals Conceded: {actual_avg_opp_goals:.2f}")
print(f"Record (W-D-L): {actual_record.get('Win', 0)}-{actual_record.get('Draw', 0)}-{actual_record.get('Loss', 0)}")
print("\n-- Predicted --")
print(f"Average Goals Scored by Team: {predicted_avg_team_goals:.2f}")
print(f"Average Goals Conceded: {predicted_avg_opp_goals:.2f}")
print(f"Record (W-D-L): {predicted_record.get('Win', 0)}-{predicted_record.get('Draw', 0)}-{predicted_record.get('Loss', 0)}")


=== Statistics for Leicester City from 2015-08-08 to 2016-05-09 (37 matches) ===

-- Actual --
Average Goals Scored by Team: 1.81
Average Goals Conceded: 0.95
Record (W-D-L): 23-11-3

-- Predicted --
Average Goals Scored by Team: 1.51
Average Goals Conceded: 1.16
Record (W-D-L): 13-20-4
