In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import shap
import joblib
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv("C:\\Desktop\\PROJECT\\football dataset.csv")
data

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,Home Goals,Away Goals,Result
0,12/09/2020,12:30,Fulham,Arsenal,0,3,A
1,12/09/2020,15:00,Crystal Palace,Southampton,1,0,H
2,12/09/2020,17:30,Liverpool,Leeds,4,3,H
3,12/09/2020,20:00,West Ham,Newcastle,0,2,A
4,13/09/2020,14:00,West Brom,Leicester,0,3,A
...,...,...,...,...,...,...,...
1895,25/05/2025,16:00,Newcastle,Everton,0,1,A
1896,25/05/2025,16:00,Nott'm Forest,Chelsea,0,1,A
1897,25/05/2025,16:00,Southampton,Arsenal,1,2,A
1898,25/05/2025,16:00,Tottenham,Brighton,1,4,A


In [3]:
# Sort data chronologically
data = data.sort_values(by=["Date"])

In [4]:
# --- 5️⃣ Time feature (convert to numeric hour) ---
data["Hour"] = data["Time"].str.extract(r"(\d+)").astype(float)

In [5]:
from sklearn.preprocessing import LabelEncoder

# Create label encoder instance
label_encoder = LabelEncoder()

# Encode the home team column
data['EncodedHomeTeam'] = label_encoder.fit_transform(data['HomeTeam'])

# Encode the away team column
data['EncodedAwayTeam'] = label_encoder.fit_transform(data['AwayTeam'])


In [6]:
# Convert Result column to binary classification
data["HomeWin"] = data["Result"].map({
    "H": 1,  # Home Win
    "A": 0,  # Away Win
    "D": 0   # Draw is treated as Not Home Win
})

In [7]:
# Weighted Recent Performance (last 5 games)
def recent_form_weighted(data, team_col, goals_for, goals_against, window=5):
    data = data.copy()
    data["Points"] = np.where(data[goals_for] > data[goals_against], 3,
                            np.where(data[goals_for] == data[goals_against], 1, 0))
    form = (
        data.groupby(team_col)["Points"]
        .apply(lambda x: x.rolling(window, min_periods=1).apply(
            lambda s: np.average(s, weights=np.arange(1, len(s)+1)), raw=False))
        .reset_index(level=0, drop=True)
    )
    return form

data["homeform"] = recent_form_weighted(data, "HomeTeam", "Home Goals", "Away Goals")
data["awayform"] = recent_form_weighted(data, "AwayTeam", "Away Goals", "Home Goals")

In [8]:
# Relative Team Strength (dynamic difference in form) ---
data["RelativeTeamStrength"] = data["homeform"] - data["awayform"]

In [9]:
# --- 3️⃣ Head-to-Head Aggregated Results ---
def head_to_head(data):
    h2h = (
        data.groupby(["HomeTeam", "AwayTeam"])
        .agg({
            "Home Goals": "mean",
            "Away Goals": "mean",
            "HomeWin": "mean"
        })
        .rename(columns={"Home Goals": "AvgHomeGoals", "Away Goals": "AvgAwayGoals", "HomeWin": "HeadToHeadWinRate"})
        .reset_index()
    )
    return h2h

h2h_stats = head_to_head(data)
data = data.merge(h2h_stats, on=["HomeTeam", "AwayTeam"], how="left")

# --- 4️⃣ Weighted Recent Matches (combine home & away form) ---
data["WeightedRecentPerformance"] = (
    0.6 * data["homeform"] + 0.4 * (1 - data["awayform"])
)

In [10]:
data

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,Home Goals,Away Goals,Result,Hour,EncodedHomeTeam,EncodedAwayTeam,HomeWin,homeform,awayform,RelativeTeamStrength,AvgHomeGoals,AvgAwayGoals,HeadToHeadWinRate,WeightedRecentPerformance
0,01/01/2021,20:00,Man United,Aston Villa,2,1,H,20.0,16,1,1,3.000000,0.000000,3.000000,1.600000,0.800000,0.800000,2.200000
1,01/01/2021,17:30,Everton,West Ham,0,1,A,17.0,8,25,0,0.000000,3.000000,-3.000000,0.600000,1.200000,0.200000,-0.800000
2,01/01/2022,17:30,Crystal Palace,West Ham,2,3,A,17.0,7,25,0,0.000000,3.000000,-3.000000,2.600000,2.600000,0.400000,-0.800000
3,01/01/2022,15:00,Watford,Tottenham,0,1,A,15.0,23,22,0,0.000000,3.000000,-3.000000,0.000000,1.000000,0.000000,-0.800000
4,01/01/2022,12:30,Arsenal,Man City,1,2,A,12.0,0,15,0,0.000000,3.000000,-3.000000,1.600000,1.400000,0.400000,-0.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895,31/12/2022,15:00,Bournemouth,Crystal Palace,0,2,A,15.0,2,7,0,1.066667,1.800000,-0.733333,0.333333,0.666667,0.333333,0.320000
1896,31/12/2022,12:30,Wolves,Man United,0,1,A,12.0,26,16,0,1.400000,1.800000,-0.400000,1.200000,1.600000,0.200000,0.520000
1897,31/12/2022,15:00,Fulham,Southampton,2,1,H,15.0,9,21,1,2.000000,0.600000,1.400000,0.666667,0.333333,0.333333,1.360000
1898,31/12/2023,14:00,Tottenham,Bournemouth,3,1,H,14.0,22,2,1,2.400000,1.533333,0.866667,2.333333,2.000000,0.333333,1.226667


In [13]:
# --- 6️⃣ Select relevant features ---
features = [
    "EncodedHomeTeam", "EncodedAwayTeam", "Hour",
    "WeightedRecentPerformance", "RelativeTeamStrength", "HeadToHeadWinRate"
]

In [15]:
X = data[features]
y = data["HomeWin"]  

In [17]:
# Split data (70% Train / 30% Test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, shuffle=True
)

In [19]:
# Scale numeric features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [21]:

# Build the model
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    random_state=42
)
model.fit(X_train, y_train)

In [22]:
# Predictions
y_pred = model.predict(X_test)

In [25]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)


print(f"✅ Model Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", report)

✅ Model Accuracy: 0.8807

Confusion Matrix:
 [[286  29]
 [ 39 216]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.91      0.89       315
           1       0.88      0.85      0.86       255

    accuracy                           0.88       570
   macro avg       0.88      0.88      0.88       570
weighted avg       0.88      0.88      0.88       570



In [27]:
# Keep only useful columns
final_columns = [
    "EncodedHomeTeam", "EncodedAwayTeam", "Hour",
    "WeightedRecentPerformance", "RelativeTeamStrength",
    "HeadToHeadWinRate", "HomeWin", "HomeTeam", "AwayTeam","Home Goals","Away Goals"
]

data_processed = data[final_columns]
data_processed.to_csv("data_processed.csv", index=False)

In [29]:
# Save encoder for Streamlit App
joblib.dump(label_encoder, "team_encoder.pkl")
# Save Model
joblib.dump(model, "football_model.pkl")
print("✅ team_encoder.pkl & football_model.pkl saved!")

✅ team_encoder.pkl & football_model.pkl saved!
