In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.optimize import minimize
from scipy.special import gammaln
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.metrics import confusion_matrix, roc_curve, precision_recall_curve, auc, mean_squared_error, r2_score


In [None]:
# Data preprocessing

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the raw match results (one row per match).
results_df = pd.read_csv('results.csv')

# Convert date to datetime so the cutoff below is a real date comparison.
results_df['date'] = pd.to_datetime(results_df['date'])

# Keep only matches played after 2008-01-01. The explicit .copy() makes the
# filtered frame independent of the full one, so adding columns to it below is
# unambiguous and does not raise SettingWithCopyWarning.
results_df = results_df[results_df['date'] > '2008-01-01'].copy()

# Feature engineering: goal differential from the home team's point of view
# (positive = home win, negative = away win, zero = draw).
results_df['goal_differential'] = results_df['home_score'] - results_df['away_score']

# NOTE(review): a row whose scores are missing is still counted in
# home_matches/away_matches (the count is taken on the team column) but can
# never count as a win -- confirm results.csv has no unplayed fixtures.

# Calculate home team statistics
home_stats = results_df.groupby('home_team').agg(
    home_matches=('home_team', 'count'),
    home_wins=('goal_differential', lambda x: (x > 0).sum()),
    home_goals_scored=('home_score', 'sum'),
    home_goals_conceded=('away_score', 'sum'),
    home_goal_differential=('goal_differential', 'sum')
)

# Calculate away team statistics (signs flipped: the differential is home-based)
away_stats = results_df.groupby('away_team').agg(
    away_matches=('away_team', 'count'),
    away_wins=('goal_differential', lambda x: (x < 0).sum()),  # Negative differential means away win
    away_goals_scored=('away_score', 'sum'),
    away_goals_conceded=('home_score', 'sum'),
    away_goal_differential=('goal_differential', lambda x: -x.sum())
)

# Outer join keeps teams that only ever appear at home or only away; their
# missing side is filled with zeros.
team_stats = home_stats.join(away_stats, how='outer').fillna(0)

# Combine home and away stats into overall team stats
team_stats['total_matches'] = team_stats['home_matches'] + team_stats['away_matches']
team_stats['total_wins'] = team_stats['home_wins'] + team_stats['away_wins']
team_stats['total_goals_scored'] = team_stats['home_goals_scored'] + team_stats['away_goals_scored']
team_stats['total_goals_conceded'] = team_stats['home_goals_conceded'] + team_stats['away_goals_conceded']
team_stats['total_goal_differential'] = team_stats['home_goal_differential'] + team_stats['away_goal_differential']

# Calculate performance metrics
team_stats['win_rate'] = team_stats['total_wins'] / team_stats['total_matches']
team_stats['average_goals_per_match'] = team_stats['total_goals_scored'] / team_stats['total_matches']

# Fill missing values for safety (in case a team has no matches)
team_stats = team_stats.fillna(0)

# Target variable: a simplified performance score for now --
# 60% win rate + 40% total goals scored relative to the highest-scoring team.
team_stats['performance_score'] = (
    team_stats['win_rate'] * 0.6 +
    (team_stats['total_goals_scored'] / team_stats['total_goals_scored'].max()) * 0.4
)

# Reset index to make it easier to work with. The team name becomes a regular
# column called 'index' (see the printed header below); later cells select the
# team name through that column, so the name is kept as is.
team_stats = team_stats.reset_index()

# Display prepared features
print(team_stats.head())


         index  home_matches  home_wins  home_goals_scored  \
0     Abkhazia          22.0       11.0               39.0   
1  Afghanistan          37.0       17.0               49.0   
2      Albania          89.0       38.0              106.0   
3     Alderney          22.0        0.0               11.0   
4      Algeria         103.0       67.0              224.0   

   home_goals_conceded  home_goal_differential  away_matches  away_wins  \
0                 13.0                    26.0          10.0        3.0   
1                 51.0                    -2.0          64.0       15.0   
2                 89.0                    17.0          73.0       20.0   
3                 87.0                   -76.0          13.0        2.0   
4                 83.0                   141.0          86.0       42.0   

   away_goals_scored  away_goals_conceded  away_goal_differential  \
0               12.0                 13.0                    -1.0   
1               67.0                11

In [None]:
def fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` on (X_fit, y_fit) and score it on (X_eval, y_eval).

    Returns a tuple ``(predictions, mse, r2)`` where ``predictions`` are the
    model outputs for ``X_eval``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    return predictions, mean_squared_error(y_eval, predictions), r2_score(y_eval, predictions)


def report_metrics(title, mse, r2):
    """Print the held-out metrics of one model under a ``--- title ---`` header."""
    print(f"\n--- {title} ---")
    print(f"Mean Squared Error: {mse:.6f}")
    print(f"R-squared: {r2:.6f}")


def rank_teams(stats, model, features_scaled, row_index, label):
    """Attach ``model``'s predictions to ``stats``, rank the teams and print the top 10.

    ``features_scaled`` is a plain array whose rows follow ``row_index``. The
    predictions are wrapped in a Series carrying that index so they are matched
    to teams by label; ``stats`` may therefore be in any row order (e.g. already
    sorted by a previous ranking) without predictions landing on the wrong team.

    Returns a new frame sorted by ``predicted_rank``; ``stats`` is not modified.
    """
    predicted = pd.Series(model.predict(features_scaled), index=row_index)
    ranked = stats.assign(predicted_performance_score=predicted)
    ranked['predicted_rank'] = ranked['predicted_performance_score'].rank(ascending=False)
    ranked = ranked.sort_values('predicted_rank')

    print(f"\nTop 10 Teams Based on Predicted Performance Score ({label}):")
    print(ranked[['index', 'predicted_performance_score', 'predicted_rank']].head(10))
    return ranked


# Set the features and labels.
# sort_index() restores the original row order: this cell leaves team_stats
# sorted by predicted rank, so without it a re-run would feed a different row
# order into the train/test split and change every result.
team_stats = team_stats.sort_index()
X = team_stats[['total_matches', 'total_wins', 'win_rate', 'total_goals_scored',
                'total_goals_conceded', 'total_goal_differential', 'average_goals_per_match']]
y = team_stats['performance_score']

# NOTE(review): performance_score is defined as an exact linear combination of
# win_rate and total_goals_scored, both of which are features here, so a
# near-perfect fit (MSE ~ 0, R^2 ~ 1) is expected by construction and says
# nothing about predictive skill.

# Using a standard 80/20 split. Split BEFORE scaling so the scaler only learns
# its mean/variance from the training rows (no test-set leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data: fit on train only, then apply the same transform everywhere.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X)  # all teams, rows in the order of X.index

# --- Ordinary least squares ---
lr_model = LinearRegression()
y_pred, mse_lr, r2_lr = fit_and_score(lr_model, X_train, y_train, X_test, y_test)
report_metrics("Linear Regression", mse_lr, r2_lr)
team_stats = rank_teams(team_stats, lr_model, X_scaled, X.index, "Linear Regression")


# --- Ordinary least squares trained on noisy targets, scored on clean ones ---
np.random.seed(42)
y_train_noisy = y_train + np.random.normal(0, 0.1, size=y_train.shape)
lr_model_noisy = LinearRegression()
y_pred_noisy, mse_lr_noisy, r2_lr_noisy = fit_and_score(
    lr_model_noisy, X_train, y_train_noisy, X_test, y_test
)
report_metrics("Linear Regression with Noisy Data", mse_lr_noisy, r2_lr_noisy)
# Rank with the noisy model itself (previously the clean lr_model was reused
# here under a "Linear Regression" label).
team_stats = rank_teams(team_stats, lr_model_noisy, X_scaled, X.index,
                        "Linear Regression with Noisy Data")


# --- Ridge ---
# Re-seeding the global NumPy RNG is kept from the original cell so that code
# running after this cell starts from the same RNG state as before.
np.random.seed(42)
ridge_model = Ridge(alpha=1.0)
y_pred_ridge, mse_ridge, r2_ridge = fit_and_score(ridge_model, X_train, y_train, X_test, y_test)
report_metrics("Ridge Regression", mse_ridge, r2_ridge)
team_stats = rank_teams(team_stats, ridge_model, X_scaled, X.index, "Ridge Regression")

# --- Lasso ---
np.random.seed(42)
lasso_model = Lasso(alpha=0.01)
y_pred_lasso, mse_lasso, r2_lasso = fit_and_score(lasso_model, X_train, y_train, X_test, y_test)
report_metrics("Lasso Regression", mse_lasso, r2_lasso)
team_stats = rank_teams(team_stats, lasso_model, X_scaled, X.index, "Lasso Regression")



--- Linear Regression ---
Mean Squared Error: 0.000000
R-squared: 1.000000

Top 10 Teams Based on Predicted Performance Score (Linear Regression):
             index  predicted_performance_score  predicted_rank
255          Spain                     0.828947             1.0
37          Brazil                     0.760358             2.0
103        Germany                     0.752173             3.0
136          Japan                     0.747812             4.0
12       Argentina                     0.703614             5.0
185    Netherlands                     0.700501             6.0
174         Mexico                     0.697987             7.0
211       Portugal                     0.694601             8.0
84         England                     0.687903             9.0
288  United States                     0.685929            10.0

--- Linear Regression with Noisy Data ---
Mean Squared Error: 0.000070
R-squared: 0.997812

Top 10 Teams Based on Predicted Performance Score (Line