In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib as jb

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the raw match table; the path is relative to the notebook's working directory.
data = pd.read_csv("premier_league.csv")

NameError: name 'pd' is not defined

In [None]:
# Discard every row that has a missing value in any column.
data = data.dropna()

In [None]:
# Working frame: the loaded data minus the 'SourceFile' column (`data` itself is left untouched).
football_data = data.drop(columns=['SourceFile'])
# Values that cannot be parsed as dates become NaT (errors='coerce') rather than raising.
# NOTE(review): no explicit format/dayfirst is passed — confirm the CSV's date layout parses as intended.
football_data = football_data.assign(
    Date=pd.to_datetime(football_data['Date'], errors='coerce')
)



In [None]:
# A single encoder is fitted on home and away names together, so a club maps to
# the same integer ID whichever side it plays on.
team_encoder = LabelEncoder()
ref_encoder = LabelEncoder()
all_teams = pd.concat([football_data['HomeTeam'], football_data['AwayTeam']])
team_encoder.fit(all_teams)

# Integer IDs for teams and referee, plus calendar parts of the match date.
football_data = football_data.assign(
    HomeID=team_encoder.transform(football_data['HomeTeam']),
    AwayID=team_encoder.transform(football_data['AwayTeam']),
    RefereeID=ref_encoder.fit_transform(football_data['Referee']),
    Month=football_data['Date'].dt.month,
    Year=football_data['Date'].dt.year,
)


In [None]:
# Numeric outcome codes: 'H' -> 1, 'D' -> 0, 'A' -> -1.
# Codes not present in the map come out of .map() as NaN.
result_map = {'H': 1, 'D': 0, 'A': -1}
football_data = (
    football_data
    .assign(
        Result=football_data['FTR'].map(result_map),
        HResult=football_data['HTR'].map(result_map),
    )
    .sort_values("Date")
)


In [None]:
def calculate_team_form(df, team_name, match_date, N=5):
    """Summarise a team's recent form from matches played strictly before a date.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table. The columns read here are 'HomeTeam', 'AwayTeam', 'Date',
        'FTHG', 'FTAG' and 'Result'. 'Result' is negated for away fixtures, so
        it is treated as being from the home side's point of view
        (1 = home win, -1 = away win).
    team_name
        Value matched against 'HomeTeam' and 'AwayTeam'.
    match_date
        Cutoff compared against 'Date'. Only rows with an earlier date are
        used, so a fixture never contributes to its own features.
    N : int, default 5
        Maximum number of most recent matches to include.

    Returns
    -------
    pandas.Series
        Three positional values: mean goals scored, mean goals conceded, and
        the share of the selected matches the team won. All three are 0.0 when
        the team has no earlier match in ``df``.
    """
    involved = (df['HomeTeam'] == team_name) | (df['AwayTeam'] == team_name)
    past_matches = (
        df[involved & (df['Date'] < match_date)]
        .sort_values(by='Date', ascending=False)
        .head(N)
    )

    # No history: return neutral zeros before doing any arithmetic.
    if past_matches.empty:
        return pd.Series([0.0, 0.0, 0.0])

    # Re-express each match from the team's own point of view.
    was_home = (past_matches['HomeTeam'] == team_name).to_numpy()
    goals_scored = np.where(
        was_home, past_matches['FTHG'], past_matches['FTAG']
    ).astype(float)
    goals_conceded = np.where(
        was_home, past_matches['FTAG'], past_matches['FTHG']
    ).astype(float)
    # 'Result' is home-relative, so flip its sign when the team played away.
    results = np.where(
        was_home, past_matches['Result'], -past_matches['Result']
    ).astype(float)

    avg_goals = goals_scored.mean()
    avg_conceded = goals_conceded.mean()
    win_rate = float((results == 1).mean())

    return pd.Series([avg_goals, avg_conceded, win_rate])


In [None]:
# Recent-form features for the home and away side of every fixture.
# Each call filters the whole frame, so this cell's cost grows quadratically
# with the number of matches.
home_form_stats = football_data.apply(
    lambda row: calculate_team_form(football_data, row['HomeTeam'], row['Date']),
    axis=1
)

away_form_stats = football_data.apply(
    lambda row: calculate_team_form(football_data, row['AwayTeam'], row['Date']),
    axis=1
)

# Assign column names
home_form_stats.columns = ['HomeAvgGoals', 'HomeAvgConceded', 'HomeWinRate']
away_form_stats.columns = ['AwayAvgGoals', 'AwayAvgConceded', 'AwayWinRate']

# Merge into original dataset. Form columns left behind by an earlier run of
# this cell are dropped first, so re-running it does not produce duplicate
# column names (which would turn football_data['HomeAvgGoals'] into a frame).
football_data = pd.concat(
    [
        football_data.drop(
            columns=[*home_form_stats.columns, *away_form_stats.columns],
            errors='ignore',
        ),
        home_form_stats,
        away_form_stats,
    ],
    axis=1,
)
# Preview the first rows only instead of rendering the entire table.
football_data.head()


In [None]:
# Keep only the numeric columns, then show how each one correlates with 'Result'.
corr_matrix = football_data.select_dtypes(include=[np.number])
corr_matrix.corr()['Result']

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Classifier configuration; random_state pins the estimator's seed.
rf = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=10,
    random_state=1,
)

In [None]:
# Chronological hold-out: fit on fixtures before the cutoff, evaluate on the rest.
# The test side uses >= so fixtures played exactly on the cutoff date are
# evaluated instead of being left out of both sets.
# Rows whose 'Date' is NaT satisfy neither comparison and are still excluded.
SPLIT_DATE = pd.Timestamp('2022-01-01')
train = football_data[football_data["Date"] < SPLIT_DATE]
test = football_data[football_data["Date"] >= SPLIT_DATE]

In [None]:
# Earlier feature sets, kept for reference:
# predictors = ["HomeID", "AwayID", "HResult", "RefereeID", "Month", "Year","FTHG","HTHG","AR","AY","HC","AF","HST","HS"]
# predictors = ["HomeID", "AwayID", "RefereeID", "Month", "Year"]
#
# Current feature set, in the column order handed to the model.
predictors = (
    # fixture identity and calendar
    ["HomeID", "AwayID", "RefereeID", "Month", "Year"]
    # recent form of the home side
    + ["HomeAvgGoals", "HomeAvgConceded", "HomeWinRate"]
    # recent form of the away side
    + ["AwayAvgGoals", "AwayAvgConceded", "AwayWinRate"]
)


In [None]:
# Fit the forest on the training rows, using the selected predictors against 'Result'.
rf.fit(train[predictors], train["Result"])

In [None]:
# Predicted outcome for every held-out fixture, in the row order of `test`.
preds = rf.predict(test[predictors])

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Share of held-out fixtures whose predicted outcome equals the actual 'Result'.
acc = accuracy_score(test["Result"], preds)

In [None]:
# Display the hold-out accuracy as the cell output.
acc

In [None]:
# Actual and predicted outcomes side by side, one row per held-out fixture.
combine = pd.DataFrame({"actual": test["Result"], "prediction": preds})

In [None]:
# Confusion table: rows are actual outcomes, columns are predicted outcomes.
pd.crosstab(combine["actual"], combine["prediction"])

In [None]:
def predict_match_with_form(home_team, away_team, referee, match_date_str):
    """Print the fitted model's predicted outcome for a single fixture.

    Relies on notebook-level state defined in earlier cells: ``team_encoder``,
    ``ref_encoder``, ``football_data``, ``calculate_team_form``, ``predictors``
    and the fitted ``rf``.

    Parameters
    ----------
    home_team, away_team
        Team names as known to ``team_encoder``. If either is unknown, a
        message is printed and the function returns without predicting.
    referee
        Referee name. A name not in ``ref_encoder.classes_`` is encoded as -1.
    match_date_str
        Match date in any form accepted by ``pd.to_datetime``.

    Returns
    -------
    None
        The prediction is printed, not returned.
    """
    match_date = pd.to_datetime(match_date_str)

    try:
        home_id = team_encoder.transform([home_team])[0]
        away_id = team_encoder.transform([away_team])[0]
    except ValueError:
        print("❗ Unknown team name.")
        return

    # Referees the encoder has never seen get the sentinel ID -1.
    if referee in ref_encoder.classes_:
        referee_id = ref_encoder.transform([referee])[0]
    else:
        referee_id = -1

    # Convert to arrays so the [0]/[1]/[2] lookups below are positional and do
    # not depend on the returned Series' index labels.
    home_form = calculate_team_form(football_data, home_team, match_date).to_numpy()
    away_form = calculate_team_form(football_data, away_team, match_date).to_numpy()

    row = pd.DataFrame([{
        "HomeID": home_id,
        "AwayID": away_id,
        "RefereeID": referee_id,
        "Month": match_date.month,
        "Year": match_date.year,
        "HomeAvgGoals": home_form[0],
        "HomeAvgConceded": home_form[1],
        "HomeWinRate": home_form[2],
        "AwayAvgGoals": away_form[0],
        "AwayAvgConceded": away_form[1],
        "AwayWinRate": away_form[2],
    }])

    # Select columns through `predictors` so the model receives features in
    # the order it was fitted with. If `predictors` gains a column that is not
    # built above, this raises KeyError instead of silently misaligning.
    pred = rf.predict(row[predictors])[0]
    # Named differently from the notebook-level `result_map` (letters -> codes)
    # because this one maps codes -> display labels.
    outcome_labels = {1: "🏠 Home Win", 0: "🤝 Draw", -1: "✈️ Away Win"}

    print(f"🔮 Prediction for {home_team} vs {away_team} on {match_date.date()}: {outcome_labels[pred]}")



In [None]:
# Example: predict one fixture from home team, away team, referee and match date.
predict_match_with_form("Burnley", "Arsenal", "T Robinson", "2025-04-19")
