In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
import warnings
warnings.filterwarnings('ignore')

In [36]:
data = pd.read_csv('premier_league_result_odds.csv')

In [37]:
def create_advanced_features(data):
    """Create more sophisticated features for better prediction"""
    
    # Basic implied probability features (your existing code)
    data['home_implied'] = 1 / data['home_odds']
    data['draw_implied'] = 1 / data['draw_odds']
    data['away_implied'] = 1 / data['away_odds']
    data['overround'] = data['home_implied'] + data['draw_implied'] + data['away_implied'] - 1
    data['home_prob'] = data['home_implied'] / (1 + data['overround'])
    data['draw_prob'] = data['draw_implied'] / (1 + data['overround'])
    data['away_prob'] = data['away_implied'] / (1 + data['overround'])
    
    # NEW ADVANCED FEATURES
    
    # 1. Odds ratios and spreads
    data['home_away_odds_ratio'] = data['home_odds'] / data['away_odds']
    data['odds_spread'] = data['home_odds'].abs() - data['away_odds'].abs()
    data['favorite_odds'] = data[['home_odds', 'away_odds']].min(axis=1)
    data['underdog_odds'] = data[['home_odds', 'away_odds']].max(axis=1)
    data['odds_variance'] = data[['home_odds', 'draw_odds', 'away_odds']].var(axis=1)
    
    # 2. Market confidence indicators
    data['market_confidence'] = 1 / data['overround']  # Lower overround = higher confidence
    data['draw_bias'] = data['draw_prob'] - 0.33  # How much market favors draw vs uniform
    data['home_advantage'] = data['home_prob'] - data['away_prob']
    
    # 3. Categorical odds ranges
    data['home_odds_category'] = pd.cut(data['home_odds'], 
                                       bins=[0, 1.5, 2.5, 4.0, float('inf')],
                                       labels=['Heavy_Favorite', 'Favorite', 'Slight_Favorite', 'Underdog'])
    data['away_odds_category'] = pd.cut(data['away_odds'], 
                                       bins=[0, 1.5, 2.5, 4.0, float('inf')],
                                       labels=['Heavy_Favorite', 'Favorite', 'Slight_Favorite', 'Underdog'])
    
    # 4. Match competitiveness
    data['match_competitiveness'] = 1 / (abs(data['home_prob'] - data['away_prob']) + 0.01)
    
    return data

In [38]:
data = create_advanced_features(data)

In [39]:
def enhanced_team_performance(data):
    """Calculate more detailed team performance metrics"""
    team_stats = {}
    
    for team in set(data['home_team']).union(set(data['away_team'])):
        # Home performance
        home_matches = data[data['home_team'] == team]
        home_wins = len(home_matches[home_matches['winning_outcome'] == 'Home'])
        home_total = len(home_matches)
        
        # Away performance  
        away_matches = data[data['away_team'] == team]
        away_wins = len(away_matches[away_matches['winning_outcome'] == 'Away'])
        away_total = len(away_matches)
        
        # Overall performance
        total_matches = home_total + away_total
        total_wins = home_wins + away_wins
        draws = len(data[((data['home_team'] == team) | (data['away_team'] == team)) & 
                         (data['winning_outcome'] == 'Draw')])
        
        # Performance by odds ranges
        home_fav_wins = len(home_matches[(home_matches['home_odds'] < 2.0) & 
                                        (home_matches['winning_outcome'] == 'Home')])
        home_fav_total = len(home_matches[home_matches['home_odds'] < 2.0])
        
        away_fav_wins = len(away_matches[(away_matches['away_odds'] < 2.0) & 
                                        (away_matches['winning_outcome'] == 'Away')])
        away_fav_total = len(away_matches[away_matches['away_odds'] < 2.0])
        
        team_stats[team] = {
            'home_win_rate': home_wins / max(home_total, 1),
            'away_win_rate': away_wins / max(away_total, 1),
            'overall_win_rate': total_wins / max(total_matches, 1),
            'draw_rate': draws / max(total_matches, 1),
            'home_fav_win_rate': home_fav_wins / max(home_fav_total, 1),
            'away_fav_win_rate': away_fav_wins / max(away_fav_total, 1),
            'total_matches': total_matches
        }
    
    return team_stats


In [40]:
team_stats = enhanced_team_performance(data)

In [41]:
def add_team_features(data, team_stats):
    """Add team-specific features to the dataset"""
    
    def get_team_features(row):
        home_team = row['home_team']
        away_team = row['away_team']
        
        home_stats = team_stats.get(home_team, {})
        away_stats = team_stats.get(away_team, {})
        
        return pd.Series({
            'home_team_win_rate': home_stats.get('home_win_rate', 0.5),
            'away_team_win_rate': away_stats.get('away_win_rate', 0.3),
            'home_team_overall_rate': home_stats.get('overall_win_rate', 0.4),
            'away_team_overall_rate': away_stats.get('overall_win_rate', 0.4),
            'home_team_draw_rate': home_stats.get('draw_rate', 0.3),
            'away_team_draw_rate': away_stats.get('draw_rate', 0.3),
            'team_strength_diff': home_stats.get('overall_win_rate', 0.4) - away_stats.get('overall_win_rate', 0.4),
            'home_experience': min(home_stats.get('total_matches', 0) / 100, 1),
            'away_experience': min(away_stats.get('total_matches', 0) / 100, 1)
        })
    
    team_features = data.apply(get_team_features, axis=1)
    return pd.concat([data, team_features], axis=1)

In [42]:
data = add_team_features(data, team_stats)

In [43]:
# Improved model training function
def train_improved_model(X, y, use_smote=True, balance_classes=True):
    """Train model with various improvements"""
    
    # Handle class imbalance
    if balance_classes:
        # Calculate class weights
        classes = np.unique(y)
        class_weights = compute_class_weight('balanced', classes=classes, y=y)
        class_weight_dict = dict(zip(classes, class_weights))
        print(f"Class weights: {class_weight_dict}")
    else:
        class_weight_dict = None
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # Apply SMOTE if requested
    if use_smote:
        smote = SMOTE(random_state=42)
        X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
        print(f"Original training set: {len(X_train)}")
        print(f"SMOTE balanced set: {len(X_train_balanced)}")
    else:
        X_train_balanced, y_train_balanced = X_train, y_train
    
    # Model configurations to try
    models = {
        'HistGradientBoosting': HistGradientBoostingClassifier(
            max_iter=300,
            max_depth=6,
            min_samples_leaf=10,
            learning_rate=0.1,
            random_state=42,
            early_stopping=True,
            validation_fraction=0.2,
            class_weight='balanced'
        ),
        'RandomForest': RandomForestClassifier(
            n_estimators=200,
            max_depth=8,
            min_samples_split=10,
            min_samples_leaf=5,
            random_state=42,
            class_weight='balanced'
        )
    }
    
    best_model = None
    best_score = 0
    results = {}
    
    for name, model in models.items():
        print(f"\nTraining {name}...")
        
        # Cross-validation
        cv_scores = cross_val_score(model, X_train_balanced, y_train_balanced, 
                                   cv=5, scoring='accuracy')
        
        # Train on full training set
        model.fit(X_train_balanced, y_train_balanced)
        
        # Test predictions
        y_pred = model.predict(X_test)
        
        # Store results
        results[name] = {
            'model': model,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'test_predictions': y_pred,
            'confusion_matrix': confusion_matrix(y_test, y_pred),
            'classification_report': classification_report(y_test, y_pred)
        }
        
        print(f"CV Score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
        
        if cv_scores.mean() > best_score:
            best_score = cv_scores.mean()
            best_model = model
    
    return best_model, results, X_test, y_test


In [44]:
# Define features (adjust based on what you want to include)
feature_cols = [
    'home_odds', 'draw_odds', 'away_odds',
    'home_prob', 'draw_prob', 'away_prob', 'overround',
    'home_away_odds_ratio', 'odds_spread', 'favorite_odds', 'underdog_odds',
    'odds_variance', 'market_confidence', 'draw_bias', 'home_advantage',
    'match_competitiveness', 'home_team_win_rate', 'away_team_win_rate',
    'home_team_overall_rate', 'away_team_overall_rate', 'home_team_draw_rate',
    'away_team_draw_rate', 'team_strength_diff', 'home_experience', 'away_experience'
]
    
# Handle categorical features
data_encoded = pd.get_dummies(data, columns=['home_odds_category', 'away_odds_category'])
    
X = data_encoded[feature_cols + [col for col in data_encoded.columns if 'odds_category' in col]]
y = data['winning_outcome']

best_model, results, X_test, y_test = train_improved_model(X, y, use_smote=True, balance_classes=True)

Class weights: {'Away': np.float64(1.0435779816513762), 'Draw': np.float64(1.4115822130299898), 'Home': np.float64(0.75)}
Original training set: 3276
SMOTE balanced set: 4368

Training HistGradientBoosting...
CV Score: 0.568 (+/- 0.111)

Training RandomForest...
CV Score: 0.554 (+/- 0.054)


In [45]:

# Enhanced prediction function
def predict_match_enhanced(model, team_stats, home_team, away_team, home_odds, draw_odds, away_odds):
    """Enhanced prediction with all new features"""
    
    # Create a temporary dataframe with the match
    temp_data = pd.DataFrame({
        'home_team': [home_team],
        'away_team': [away_team],
        'home_odds': [home_odds],
        'draw_odds': [draw_odds],
        'away_odds': [away_odds]
    })
    
    # Add advanced features
    temp_data = create_advanced_features(temp_data)
    temp_data = add_team_features(temp_data, team_stats)
    
    # Select features (adjust based on your feature selection)
    feature_cols = [
        'home_odds', 'draw_odds', 'away_odds',
        'home_prob', 'draw_prob', 'away_prob', 'overround',
        'home_away_odds_ratio', 'odds_spread', 'favorite_odds', 'underdog_odds',
        'odds_variance', 'market_confidence', 'draw_bias', 'home_advantage',
        'match_competitiveness', 'home_team_win_rate', 'away_team_win_rate',
        'home_team_overall_rate', 'away_team_overall_rate', 'home_team_draw_rate',
        'away_team_draw_rate', 'team_strength_diff', 'home_experience', 'away_experience'
    ]
    
    # Handle categorical features
    categorical_cols = ['home_odds_category', 'away_odds_category']
    temp_data_encoded = pd.get_dummies(temp_data, columns=categorical_cols, prefix=categorical_cols)
    
    # Ensure all feature columns exist
    for col in feature_cols:
        if col not in temp_data_encoded.columns:
            temp_data_encoded[col] = 0
    
    # Add dummy categorical columns if they don't exist
    for cat_col in categorical_cols:
        for category in ['Heavy_Favorite', 'Favorite', 'Slight_Favorite', 'Underdog']:
            dummy_col = f"{cat_col}_{category}"
            if dummy_col not in temp_data_encoded.columns:
                temp_data_encoded[dummy_col] = 0
    
    # Update feature list to include categorical dummies
    all_features = feature_cols + [f"{cat_col}_{cat}" for cat_col in categorical_cols 
                                  for cat in ['Heavy_Favorite', 'Favorite', 'Slight_Favorite', 'Underdog']]
    
    X_pred = temp_data_encoded[all_features]
    
    # Get prediction and probabilities
    prediction = model.predict(X_pred)[0]
    probabilities = model.predict_proba(X_pred)[0]
    
    # Create probability dictionary
    classes = model.classes_
    prob_dict = {class_name: prob for class_name, prob in zip(classes, probabilities)}
    
    return prediction, prob_dict

In [51]:

# Make predictions on the test set
home_team = "Leeds"
away_team = "Everton"
home_odds = 2.63
draw_odds = 3.47
away_odds = 2.80
prediction, probs = predict_match_enhanced(best_model, team_stats, home_team, away_team, home_odds, draw_odds, away_odds)
print(f"Predicted outcome: {prediction}")
# print(f"Probabilities: Home={probs[0]:.2f}, Draw={probs[1]:.2f}, Away={probs[2]:.2f}")

Predicted outcome: Draw
