In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
%matplotlib inline

In [None]:

# Load the match/odds table and derive the odds-based columns in one chain.
data = (
    pd.read_csv('premier_league_result_odds.csv')
    # Implied probability of each outcome: reciprocal of its decimal odds.
    .assign(
        home_implied=lambda df: 1 / df['home_odds'],
        draw_implied=lambda df: 1 / df['draw_odds'],
        away_implied=lambda df: 1 / df['away_odds'],
    )
    # Bookmaker margin (overround): how far the implied probabilities exceed 1.
    .assign(
        overround=lambda df: df['home_implied'] + df['draw_implied'] + df['away_implied'] - 1,
    )
    # Normalized probabilities: implied probabilities rescaled by (1 + overround).
    .assign(
        home_prob=lambda df: df['home_implied'] / (1 + df['overround']),
        draw_prob=lambda df: df['draw_implied'] / (1 + df['overround']),
        away_prob=lambda df: df['away_implied'] / (1 + df['overround']),
    )
)


# # Calculate value indicators
# data['home_value'] = data['home_prob'] - data['home_implied']
# data['away_value'] = data['away_prob'] - data['away_implied']


In [None]:
# Decimal-odds bin edges shared by the precompute step and the per-match lookups.
# Bins are half-open [lower, upper), matching the `lower <= odds < upper` test
# used when a single match's odds are mapped to a bin label.
ODDS_BINS = [1, 1.5, 2, 2.5, 3, 4, 5, 10, float('inf')]
ODDS_LABELS = [f"{lower}-{upper}" for lower, upper in zip(ODDS_BINS, ODDS_BINS[1:])]


def _win_rates_by_odds_bin(matches, odds_col, winning_label):
    """Return {bin label: win rate} for `matches`, binned on `odds_col`.

    A match counts as a win when its 'winning_outcome' equals `winning_label`.
    Only bins that actually contain matches are returned, so callers can rely
    on `dict.get(label, default)` for bins without any history.
    """
    if matches.empty:
        return {}

    binned = matches.assign(
        # right=False gives [lower, upper) intervals, consistent with the
        # lookup code; the pd.cut default (lower, upper] would put odds that
        # sit exactly on an edge into a different bin than the lookup uses.
        odds_bin=pd.cut(matches[odds_col], bins=ODDS_BINS, labels=ODDS_LABELS, right=False),
        is_win=(matches['winning_outcome'] == winning_label).astype(float),
    )
    # observed=True drops bins with no matches instead of emitting NaN rates.
    rates = binned.groupby('odds_bin', observed=True)['is_win'].mean()
    return {str(label): float(rate) for label, rate in rates.items()}


def precompute_team_performance(data):
    """Build per-team win rates, split by venue and by odds bin.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'home_team', 'away_team', 'home_odds', 'away_odds' and
        'winning_outcome' (compared against the labels 'Home' and 'Away').

    Returns
    -------
    dict
        ``{team: {'home_win_rates': {bin_label: rate},
                  'away_win_rates': {bin_label: rate}}}``.
        Home rates are binned on the team's home odds, away rates on its away
        odds. Bins in which the team has no matches are omitted.
    """
    team_features = {}

    for team in set(data['home_team']).union(data['away_team']):
        team_features[team] = {
            'home_win_rates': _win_rates_by_odds_bin(
                data[data['home_team'] == team], 'home_odds', 'Home'
            ),
            'away_win_rates': _win_rates_by_odds_bin(
                data[data['away_team'] == team], 'away_odds', 'Away'
            ),
        }

    return team_features

In [None]:
# Build the per-team, per-odds-bin win-rate lookup once. Later cells read it as
# the module-level `team_performance` (feature engineering and match prediction).
# NOTE(review): this is computed from the full `data`, i.e. before the
# train/test split further down, so the derived win-rate features include the
# outcomes of rows that later land in the test set — confirm this is intended.
team_performance = precompute_team_performance(data)

In [None]:
def get_performance_feature(row):
    """Look up historical win-rate features for one match row.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide 'home_team', 'away_team', 'home_odds' and 'away_odds'.

    Returns
    -------
    pandas.Series
        'home_win_rate': the home team's stored home win rate for the bin that
        contains the home odds (default 0.5 when no rate is available).
        'away_win_rate': the away team's stored away win rate for the bin that
        contains the away odds (default 0.3 when no rate is available).

    Raises
    ------
    KeyError
        If either team is not present in `team_performance`.
    """
    def find_bin(odds):
        # Bins are half-open: lower <= odds < upper.
        for i in range(len(ODDS_BINS) - 1):
            if ODDS_BINS[i] <= odds < ODDS_BINS[i + 1]:
                return ODDS_LABELS[i]
        return ODDS_LABELS[-1]  # default to last bin

    def lookup(rates, odds, default):
        rate = rates.get(find_bin(odds), default)
        # A bin can be present but hold NaN (no matches observed in it); treat
        # that like a missing bin so NaN never reaches the feature matrix.
        return default if pd.isna(rate) else rate

    home_rates = team_performance[row['home_team']]['home_win_rates']
    away_rates = team_performance[row['away_team']]['away_win_rates']

    return pd.Series({
        'home_win_rate': lookup(home_rates, row['home_odds'], 0.5),
        'away_win_rate': lookup(away_rates, row['away_odds'], 0.3)
    })

In [None]:

# Attach the two historical win-rate features to every match (row-wise lookup).
# NOTE(review): the lookup table appears to be built from all rows of `data`,
# so each row's own result feeds its win-rate features — confirm whether this
# leakage into the target is acceptable before trusting evaluation numbers.
data[['home_win_rate', 'away_win_rate']] = data.apply(get_performance_feature, axis=1)

# Model inputs: raw odds, normalized probabilities, bookmaker margin and the
# two win-rate features. Target: the match result label.
feature_columns = [
    'home_odds', 'draw_odds', 'away_odds',
    'home_prob', 'draw_prob', 'away_prob',
    'overround',
    'home_win_rate', 'away_win_rate',
]
X = data[feature_columns]
y = data['winning_outcome']

# Hold out 20% of the matches for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=190,
)

In [None]:

# Train the gradient-boosted classifier on the training split.
# random_state is pinned (same seed as the train/test split) so that
# Restart & Run All reproduces the same fitted model.
model = GradientBoostingClassifier(n_estimators=200, max_depth=4, random_state=190)
model.fit(X_train, y_train)

In [None]:
def get_win_rate(team, is_home, odds):
    """Look up a team's stored win rate for the odds bin containing `odds`.

    Parameters
    ----------
    team : str
        Team name; must be a key of `team_performance`.
    is_home : bool
        True to use the team's home win rates, False for its away win rates.
    odds : float
        Decimal odds for that team in the match.

    Returns
    -------
    float
        The stored rate for the matching bin, or the default (0.5 for home,
        0.3 for away) when no rate is available for that bin.

    Raises
    ------
    KeyError
        If `team` is not present in `team_performance`.
    """
    # Bins are half-open: lower <= odds < upper. Anything that matches no bin
    # falls back to the last label.
    bin_label = ODDS_LABELS[-1]
    for i in range(len(ODDS_BINS) - 1):
        if ODDS_BINS[i] <= odds < ODDS_BINS[i + 1]:
            bin_label = ODDS_LABELS[i]
            break

    default = 0.5 if is_home else 0.3
    rates_dict = team_performance[team]['home_win_rates' if is_home else 'away_win_rates']
    rate = rates_dict.get(bin_label, default)
    # A bin can be present but hold NaN (no matches observed in it); treat that
    # like a missing bin so the model never receives NaN as a feature.
    return default if pd.isna(rate) else rate

In [None]:
# 2. Prediction function for new matches
def predict_match(home_team, away_team, home_odds, draw_odds, away_odds):
    """Predict the outcome of a single match from team names and decimal odds.

    Rebuilds the same features used for training (raw odds, normalized
    probabilities, overround, and the two historical win-rate lookups) and
    passes them to the fitted module-level `model`.

    Parameters
    ----------
    home_team, away_team : str
        Team names, as used by `get_win_rate`.
    home_odds, draw_odds, away_odds : float
        Decimal odds for each outcome.

    Returns
    -------
    tuple
        ``(prediction, probabilities)``: the predicted class label and the
        per-class probability array. The probability columns follow the order
        of ``model.classes_``, not the home/draw/away argument order.
    """
    # Calculate basic odds features
    home_implied = 1 / home_odds
    draw_implied = 1 / draw_odds
    away_implied = 1 / away_odds
    overround = (home_implied + draw_implied + away_implied) - 1
    normalizer = 1 + overround

    # Build a one-row DataFrame with the training column names and order.
    # The model was fitted on a named-column DataFrame; passing a bare list
    # triggers sklearn's feature-name warning and relies on positional order.
    features = pd.DataFrame([{
        'home_odds': home_odds,
        'draw_odds': draw_odds,
        'away_odds': away_odds,
        'home_prob': home_implied / normalizer,
        'draw_prob': draw_implied / normalizer,
        'away_prob': away_implied / normalizer,
        'overround': overround,
        'home_win_rate': get_win_rate(home_team, True, home_odds),
        'away_win_rate': get_win_rate(away_team, False, away_odds),
    }], columns=[
        'home_odds', 'draw_odds', 'away_odds',
        'home_prob', 'draw_prob', 'away_prob',
        'overround',
        'home_win_rate', 'away_win_rate',
    ])

    # Make prediction
    prediction = model.predict(features)[0]
    probabilities = model.predict_proba(features)[0]

    return prediction, probabilities

In [None]:
# Predict a single example match from its decimal odds
home_team = "Liverpool"
away_team = "Bournemouth"
home_odds = 1.38
draw_odds = 5.23
away_odds = 6.96
prediction, probs = predict_match(home_team, away_team, home_odds, draw_odds, away_odds)
print(f"Predicted outcome: {prediction}")
# predict_proba columns follow model.classes_ (sorted class labels), not
# home/draw/away order, so pair each probability with its own class label.
print("Probabilities: " + ", ".join(
    f"{label}={prob:.2f}" for label, prob in zip(model.classes_, probs)
))


In [None]:
# # Identify the minimum odds among 'Home', 'Away', and 'Draw'
# data['min_odd'] = data[['Home', 'Away', 'Draw']].min(axis=1)

# # Count the occurrences of each result for the minimum odds
# result_counts = data.groupby('min_odd')['Result'].value_counts().fillna(0)

# # print(result_counts)


In [None]:
# # Train SVM
# model = SVC()
# model.fit(X_train, y_train)
# predictions = model.predict(X_test)


# # Make predictions on the test set
# test_odds = [[3.28,3.38,2.3]]
# test_pred = model.predict(test_odds)

# # Evaluate the model
# cross_val_accuracy = cross_val_score(model, X, y, cv=10, scoring='accuracy')
# conf_matrix = confusion_matrix(y_test, predictions)
# class_report = classification_report(y_test, predictions)
# print(test_pred)
# print(f'Cross-Validation Accuracy: {cross_val_accuracy.mean()}')
# print(conf_matrix)
# print(class_report)


In [None]:
# from sklearn.model_selection import GridSearchCV

# # Implement GridSearch
# param_grid = {"C": [0.1,1,10,100], "gamma": [1,0.1,0.01,0.001]}
# grid = GridSearchCV(SVC(), param_grid, verbose=3)
# grid.fit(X_train, y_train)
# grid_prediction = grid.predict(X_test)
# conf_matrix = confusion_matrix(y_test, grid_prediction)
# print(conf_matrix)
# class_report = classification_report(y_test, grid_prediction)
# print(f'Cross-Validation Accuracy: {cross_val_accuracy.mean()}')
# print(classification_report(y_test, grid_prediction))
