In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
from google.colab import auth
auth.authenticate_user()
import gspread
from google.auth import default



In [2]:
# 1. Load datasets
matches = pd.read_excel('Big_Five_Leagues_2023_2025.xlsx')
standings = pd.read_excel('Leagues_Standings_Analysis (1).xlsx')

# 2. Merge data to get both teams' standings in each match
# Merge home team with its league position and points
df = pd.merge(
    matches,
    standings[['Team_Name', 'Position', 'Points']],
    left_on='Home_Team',
    right_on='Team_Name',
    how='left'
)

# Rename columns to clearly distinguish home team data
df.rename(columns={'Position': 'Home_Pos', 'Points': 'Home_Pts'}, inplace=True)

# Merge away team with its league position and points
df = pd.merge(
    df,
    standings[['Team_Name', 'Position', 'Points']],
    left_on='Away_Team',
    right_on='Team_Name',
    how='left'
)

# Rename columns to clearly distinguish away team data
df.rename(columns={'Position': 'Away_Pos', 'Points': 'Away_Pts'}, inplace=True)

# 3. Calculate position difference
# This feature represents the ranking gap between home and away teams
df['Pos_Diff'] = df['Home_Pos'] - df['Away_Pos']

print("Data preparation and table merging completed successfully.")
display(df[['Home_Team', 'Away_Team', 'Home_Pos', 'Away_Pos', 'Pos_Diff']].head())


Data preparation and table merging completed successfully.


Unnamed: 0,Home_Team,Away_Team,Home_Pos,Away_Pos,Pos_Diff
0,Burnley FC,Manchester City FC,19.0,1.0,18.0
1,Burnley FC,Manchester City FC,19.0,3.0,16.0
2,Burnley FC,Manchester City FC,19.0,2.0,17.0
3,Burnley FC,Manchester City FC,19.0,1.0,18.0
4,Burnley FC,Manchester City FC,19.0,3.0,16.0


In [3]:
# Assuming 'get_match_result' and 'encode_result' functions are not already defined
# Define functions to determine match outcome and encode it numerically

def get_match_result(row):
    if row['Home_Goals'] > row['Away_Goals']:
        return 'HOME_TEAM'
    elif row['Home_Goals'] < row['Away_Goals']:
        return 'AWAY_TEAM'
    else:
        return 'DRAW'

def encode_result(winner):
    if winner == 'HOME_TEAM':
        return 2
    if winner == 'DRAW':
        return 1
    if winner == 'AWAY_TEAM':
        return 0
    return None

# Add match result (categorical) and encoded result (numeric) columns
df['Match_Result'] = df.apply(get_match_result, axis=1)
df['Result_Numeric'] = df['Match_Result'].apply(encode_result)

# 1. Remove rows with missing values to avoid issues during model training
df_final = df.dropna(subset=['Home_Pos', 'Away_Pos', 'Result_Numeric'])

# 2. Define features (X) and target variable (y)
# Features include team positions, points, and position difference
features = ['Home_Pos', 'Away_Pos', 'Home_Pts', 'Away_Pts', 'Pos_Diff']
X = df_final[features]
y = df_final['Result_Numeric']

# 3. Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Build and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 5. Calculate win confidence scores
# Predict class probabilities for the test set
probs = model.predict_proba(X_test)

# Confidence score is the maximum probability among all possible outcomes
confidences = probs.max(axis=1) * 100

print("Model training completed successfully.")
print(f"Current accuracy: {accuracy_score(y_test, model.predict(X_test)) * 100:.2f}%")


Model training completed successfully.
Current accuracy: 39.60%


In [4]:
# 1. Data preparation
# Make sure the previous data loading cells have been executed
# Merge additional standings data: goals scored (GF), goals conceded (GA), and goal difference (GD)

standings_cols = [
    'Team_Name', 'Position', 'Points',
    'Goals_For', 'Goals_Against', 'Goal_Difference'
]

# Merge home team statistics
df_improved = pd.merge(
    matches,
    standings[standings_cols],
    left_on='Home_Team',
    right_on='Team_Name',
    how='left'
)

df_improved.rename(
    columns={
        'Position': 'Home_Pos',
        'Points': 'Home_Pts',
        'Goals_For': 'Home_GF',
        'Goals_Against': 'Home_GA',
        'Goal_Difference': 'Home_GD'
    },
    inplace=True
)

# Merge away team statistics
df_improved = pd.merge(
    df_improved,
    standings[standings_cols],
    left_on='Away_Team',
    right_on='Team_Name',
    how='left'
)

df_improved.rename(
    columns={
        'Position': 'Away_Pos',
        'Points': 'Away_Pts',
        'Goals_For': 'Away_GF',
        'Goals_Against': 'Away_GA',
        'Goal_Difference': 'Away_GD'
    },
    inplace=True
)

# 2. Feature Engineering
# Create differential features to better represent team strength gaps
df_improved['Pos_Diff'] = df_improved['Home_Pos'] - df_improved['Away_Pos']
df_improved['Pts_Diff'] = df_improved['Home_Pts'] - df_improved['Away_Pts']
df_improved['GD_Diff'] = df_improved['Home_GD'] - df_improved['Away_GD']

# 3. Encode match results numerically
# 2 = Home Win, 1 = Draw, 0 = Away Win
def encode_result(row):
    if row['Home_Goals'] > row['Away_Goals']:
        return 2
    if row['Home_Goals'] == row['Away_Goals']:
        return 1
    return 0

df_improved['Result_Numeric'] = df_improved.apply(encode_result, axis=1)

# 4. Data cleaning
# Remove rows with missing values to ensure stable model training
df_final = df_improved.dropna(
    subset=['Home_Pos', 'Away_Pos', 'Result_Numeric']
)

# 5. Feature selection
# Use standings, points, and goal-related differences
features = [
    'Home_Pos', 'Away_Pos',
    'Home_Pts', 'Away_Pts',
    'Pos_Diff', 'Pts_Diff', 'GD_Diff'
]

X = df_final[features]
y = df_final['Result_Numeric']

# 6. Train-test split and model training
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Increase the number of trees to improve model performance
model = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=5,
    random_state=42
)

model.fit(X_train, y_train)

# 7. Model evaluation and confidence calculation
y_pred = model.predict(X_test)
probs = model.predict_proba(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Model updated with additional features (Goal Difference and Points Difference).")
print(f"New accuracy: {accuracy * 100:.2f}%")

# If performance improves, this model can be used to predict 2026 matches


Model updated with additional features (Goal Difference and Points Difference).
New accuracy: 43.78%


In [7]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from google.colab import auth
auth.authenticate_user()
import gspread
from google.auth import default


# 1. Model initialization (XGBoost)
# Balanced hyperparameters are chosen to reduce the risk of overfitting
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    objective='multi:softprob'  # Multiclass classification (0, 1, 2)
)

# 2. Model training
xgb_model.fit(X_train, y_train)

# 3. Prediction and accuracy evaluation
y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"XGBoost model accuracy: {accuracy_xgb * 100:.2f}%")

# -------------------------------------------------------------
# Fix feature mismatch by reconstructing future match features
# The same feature engineering logic used during training is reapplied here
# -------------------------------------------------------------

# 1. Access Google Sheet
creds, _ = default()
gc = gspread.authorize(creds)

# 2. Open the file (replace 'Matches_2026' with your actual file name)
# Ensure the file contains Home_Team, Away_Team, and Season columns
ss = gc.open('Football Match')
ws = ss.get_worksheet(0) # First tab in the sheet
future_matches = pd.DataFrame(ws.get_all_records())

# Ensure 'standings' is already loaded (it is in the kernel state)
standings_cols = [
    'Team_Name', 'Position', 'Points',
    'Goals_For', 'Goals_Against', 'Goal_Difference'
]

# Merge statistics for the home team
future_df_corrected = pd.merge(
    future_matches,
    standings[standings_cols],
    left_on='Home_Team',
    right_on='Team_Name',
    how='left'
)

future_df_corrected.rename(
    columns={
        'Position': 'Home_Pos',
        'Points': 'Home_Pts',
        'Goals_For': 'Home_GF',
        'Goals_Against': 'Home_GA',
        'Goal_Difference': 'Home_GD'
    },
    inplace=True
)

# Merge statistics for the away team
future_df_corrected = pd.merge(
    future_df_corrected,
    standings[standings_cols],
    left_on='Away_Team',
    right_on='Team_Name',
    how='left'
)

future_df_corrected.rename(
    columns={
        'Position': 'Away_Pos',
        'Points': 'Away_Pts',
        'Goals_For': 'Away_GF',
        'Goals_Against': 'Away_GA',
        'Goal_Difference': 'Away_GD'
    },
    inplace=True
)

# Feature engineering: calculate relative strength differences
future_df_corrected['Pos_Diff'] = future_df_corrected['Home_Pos'] - future_df_corrected['Away_Pos']
future_df_corrected['Pts_Diff'] = future_df_corrected['Home_Pts'] - future_df_corrected['Away_Pts']
future_df_corrected['GD_Diff'] = future_df_corrected['Home_GD'] - future_df_corrected['Away_GD']

# Handle missing values that may appear after merging
# This is important for teams missing from the standings table
future_df_corrected = future_df_corrected.fillna(0)

# Define the final feature set, aligned exactly with the training features
features_for_prediction = [
    'Home_Pos', 'Away_Pos',
    'Home_Pts', 'Away_Pts',
    'Pos_Diff', 'Pts_Diff', 'GD_Diff'
]

X_future_corrected = future_df_corrected[features_for_prediction]

# 4. Win confidence estimation for future matches
# XGBoost provides reliable class probability estimates
future_probs_xgb = xgb_model.predict_proba(X_future_corrected)

# Store confidence scores in the future matches DataFrame
future_df = future_df_corrected
future_df['Win_Confidence'] = future_probs_xgb.max(axis=1) * 100

XGBoost model accuracy: 51.70%


  future_df_corrected = future_df_corrected.fillna(0)


In [9]:
# 1. Connect to Google Sheets
creds, _ = default()
gc = gspread.authorize(creds)

# 2. Open the Google Sheet
# Make sure the sheet contains: Home_Team, Away_Team, and Season columns
ss = gc.open('Football Match')
ws = ss.get_worksheet(0)  # First worksheet
future_matches = pd.DataFrame(ws.get_all_records())

# 3. Merge current standings data with 2026 matches
# This provides the model with up-to-date team strength information
standings_cols = [
    'Team_Name', 'Position', 'Points',
    'Goals_For', 'Goals_Against', 'Goal_Difference'
]

# Merge home team statistics
future_df = pd.merge(
    future_matches,
    standings[standings_cols],
    left_on='Home_Team',
    right_on='Team_Name',
    how='left'
)

future_df.rename(
    columns={
        'Position': 'Home_Pos',
        'Points': 'Home_Pts',
        'Goals_For': 'Home_GF',
        'Goals_Against': 'Home_GA',
        'Goal_Difference': 'Home_GD'
    },
    inplace=True
)

# Merge away team statistics
future_df = pd.merge(
    future_df,
    standings[standings_cols],
    left_on='Away_Team',
    right_on='Team_Name',
    how='left'
)

future_df.rename(
    columns={
        'Position': 'Away_Pos',
        'Points': 'Away_Pts',
        'Goals_For': 'Away_GF',
        'Goals_Against': 'Away_GA',
        'Goal_Difference': 'Away_GD'
    },
    inplace=True
)

# 4. Feature engineering
# Create relative strength features required by the trained model
future_df['Pos_Diff'] = future_df['Home_Pos'] - future_df['Away_Pos']
future_df['Pts_Diff'] = future_df['Home_Pts'] - future_df['Away_Pts']
future_df['GD_Diff'] = future_df['Home_GD'] - future_df['Away_GD']

# 5. Handle missing values from Google Sheets or standings merges
future_df = future_df.fillna(0)

# 6. Generate model predictions
# Ensure feature order matches the training phase
features_for_prediction = [
    'Home_Pos', 'Away_Pos',
    'Home_Pts', 'Away_Pts',
    'Pos_Diff', 'Pts_Diff', 'GD_Diff'
]

X_future = future_df[features_for_prediction]
future_df['AI_Prediction_Numeric'] = model.predict(X_future)

# 7. Calculate win confidence scores
# These probabilities are later used for heat map coloring in Power BI
future_probs = model.predict_proba(X_future)
future_df['Win_Confidence'] = future_probs.max(axis=1) * 100

# 8. Decode numeric predictions into readable labels for visualization
def decode_res(num):
    if num == 2:
        return 'Home Win'
    if num == 1:
        return 'Draw'
    return 'Away Win'

future_df['AI_Prediction'] = future_df['AI_Prediction_Numeric'].apply(decode_res)

print("Predictions are ready.")
display(
    future_df[['Home_Team', 'Away_Team', 'AI_Prediction', 'Win_Confidence']].head()
)


Predictions are ready.


  future_df = future_df.fillna(0)


Unnamed: 0,Home_Team,Away_Team,AI_Prediction,Win_Confidence
0,US Cremonese,Hellas Verona FC,Away Win,45.223647
1,,,Away Win,45.223647
2,,,Away Win,45.223647
3,,,Away Win,45.223647
4,,,Away Win,45.223647
