# Model Training and Evaluation for Goals Prediction

This code trains several machine learning models to predict player goals based on historical data, and selects the best model based on Mean Squared Error (MSE).

### Key Steps:
1. **Data Loading and Preprocessing**: 
   - The dataset is loaded, and the 'Age' column is dropped. Features (`X`) include player stats, while the target (`y`) is goals.
   - Categorical columns ('Player', 'Squad', 'Comp') are processed using **OneHotEncoder** for linear models and **LabelEncoder** for tree-based models and SVR.

2. **Model Training**: 
   - The dataset is split into training and test sets.
   - Eight models (Linear, Ridge, and Lasso Regression; Decision Tree; Random Forest; Gradient Boosting; XGBoost; SVR) are trained on the training set and evaluated by MSE on the held-out test set.

3. **Model Selection and Saving**: 
   - The model with the lowest MSE is selected and retrained on the entire dataset.
   - The best model is saved to a file (`best_model.pkl`) for future use.

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.base import clone
import numpy as np
import pickle
from xgboost import XGBRegressor

# Goals model: read the training table, benchmark eight regressors on a
# held-out split, then refit the winner on all rows and pickle it.

# Read the training data and discard the "Age" column
file_path = 'training_data_with_goals_target.csv'
df = pd.read_csv(file_path).drop(columns=["Age"])

# Separate the target from the feature columns
y = df["target"]
X = df.drop(columns=["target"])

# Columns holding categorical values
categorical_columns = ["Player", "Squad", "Comp"]

# One-hot preprocessing used by the linear pipelines; every other column
# is passed through unchanged
onehot_preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)],
    remainder='passthrough',
)

# Integer-coded copy of the features for the tree models, XGBoost and SVR.
# One encoder per categorical column is kept in `label_encoders`.
label_encoders = {col: LabelEncoder() for col in categorical_columns}
label_encoded_X = X.copy()
for col, le in label_encoders.items():
    label_encoded_X[col] = le.fit_transform(label_encoded_X[col])

# Same split parameters for both representations, so the two feature sets
# are partitioned identically and share y_train / y_test
split_options = dict(test_size=0.2, random_state=42)
X_train_onehot, X_test_onehot, y_train, y_test = train_test_split(X, y, **split_options)
X_train_label, X_test_label, _, _ = train_test_split(label_encoded_X, y, **split_options)

# Scaling step for the SVR pipeline
scaler = StandardScaler()

# Linear regressors are wrapped in a pipeline that one-hot encodes first
linear_regressors = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
}
models = {
    name: Pipeline(steps=[('preprocessor', onehot_preprocessor), ('regressor', regressor)])
    for name, regressor in linear_regressors.items()
}
# Remaining candidates consume the label-encoded features
models.update({
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'SVR': Pipeline(steps=[('scaler', scaler), ('regressor', SVR())]),
})

# Test-set MSE per model name
results = {}

for model_name, model in models.items():
    # Linear pipelines take the raw frame (they encode internally);
    # everything else takes the label-encoded frame
    if model_name in linear_regressors:
        train_features, test_features = X_train_onehot, X_test_onehot
    else:
        train_features, test_features = X_train_label, X_test_label

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)

    mse = mean_squared_error(y_test, y_pred)
    results[model_name] = mse
    print(f"{model_name} - Mean Squared Error: {mse}")

# The winner is the model with the smallest test-set MSE
best_model_name = min(results, key=results.get)
print(f"\nBest Model: {best_model_name} with Mean Squared Error: {results[best_model_name]}")

# Refit an unfitted copy of the winner on every row, using the feature
# representation that model family was evaluated with
best_model = clone(models[best_model_name])
full_features = X if best_model_name in linear_regressors else label_encoded_X
best_model.fit(full_features, y)

# Persist the refitted model
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print(f"\nThe best model '{best_model_name}' has been saved to 'best_model.pkl'")


Linear Regression - Mean Squared Error: 11.924909219150187
Ridge Regression - Mean Squared Error: 9.863551486834377
Lasso Regression - Mean Squared Error: 11.051687344007483
Decision Tree - Mean Squared Error: 15.979288770142437
Random Forest - Mean Squared Error: 10.285688657029041
Gradient Boosting - Mean Squared Error: 10.275249778360113
XGBoost - Mean Squared Error: 10.740106213522557
SVR - Mean Squared Error: 11.82464685472684

Best Model: Ridge Regression with Mean Squared Error: 9.863551486834377

The best model 'Ridge Regression' has been saved to 'best_model.pkl'


# Prediction for the Goals

In [11]:
import pandas as pd
import pickle
import numpy as np

# Goals prediction: apply the pickled pipeline to the new-season data and
# write a ranked CSV of rounded predictions.

# Restore the pipeline written by the training cell
with open('best_model.pkl', 'rb') as model_file:
    best_model = pickle.load(model_file)

# Read the rows to score, then remove "Age" and any row containing NaN
new_data_path = 'prediction_data_2024_2025.csv'
new_df = pd.read_csv(new_data_path)
new_df = new_df.drop(columns=["Age"]).dropna()

# Categorical columns handled by the pipeline's preprocessor
categorical_columns = ["Player", "Squad", "Comp"]

# Feature frame: everything except the target column, if one is present
X_new = new_df.drop(columns=['target'], errors='ignore')

# Run the two pipeline stages explicitly: the fitted preprocessor encodes
# the features, then the fitted regressor scores them
preprocessor = best_model.named_steps['preprocessor']
regressor = best_model.named_steps['regressor']
predictions = regressor.predict(preprocessor.transform(X_new))

# Store predictions rounded to the nearest whole number next to the
# original rows
new_df['predicted Goals'] = np.round(predictions)

# Keep the identifying columns plus the prediction, highest first
final_df = (
    new_df[["Player", "Squad", "Comp", "predicted Goals"]]
    .sort_values(by="predicted Goals", ascending=False)
)

# Write the ranked predictions to disk
output_file_path = 'predictions.csv'  # Update this path as needed
final_df.to_csv(output_file_path, index=False)

print(f"Predictions saved to '{output_file_path}'")


Predictions saved to 'predictions.csv'


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.base import clone
import numpy as np
import pickle
from xgboost import XGBRegressor

# Assists model: read the training table, benchmark eight regressors on a
# held-out split, then refit the winner on all rows and pickle it.

# Load the assists training data
file_path = 'training_data_with_assists_target.csv'
df = pd.read_csv(file_path)

# Drop the "Age" column
df = df.drop(columns=["Age"])

# Define the features and target
X = df.drop(columns=["target"])
y = df["target"]

# Define which columns are categorical
categorical_columns = ["Player", "Squad", "Comp"]

# OneHotEncoder for linear models; all other columns pass through unchanged
onehot_preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough'
)

# LabelEncoder for tree-based models, XGBoost and SVR: each categorical
# column is replaced by integer codes in a copy of the features
label_encoded_X = X.copy()
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    label_encoded_X[col] = le.fit_transform(label_encoded_X[col])
    label_encoders[col] = le  # Keep the fitted encoder for each column

# Split both representations with the same parameters so they are
# partitioned identically and share y_train / y_test
X_train_onehot, X_test_onehot, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_label, X_test_label, _, _ = train_test_split(label_encoded_X, y, test_size=0.2, random_state=42)

# StandardScaler for models that require scaling (e.g., SVR)
scaler = StandardScaler()

# Names of the models that consume the raw frame through the one-hot pipeline
linear_model_names = ('Linear Regression', 'Ridge Regression', 'Lasso Regression')

# Define models to evaluate
models = {
    'Linear Regression': Pipeline(steps=[('preprocessor', onehot_preprocessor), ('regressor', LinearRegression())]),
    'Ridge Regression': Pipeline(steps=[('preprocessor', onehot_preprocessor), ('regressor', Ridge())]),
    'Lasso Regression': Pipeline(steps=[('preprocessor', onehot_preprocessor), ('regressor', Lasso())]),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'SVR': Pipeline(steps=[('scaler', scaler), ('regressor', SVR())])  # SVR with scaling
}

# Test-set MSE per model name
results = {}

# Train and evaluate the models
for model_name, model in models.items():
    # Linear pipelines encode internally, so they get the raw frame;
    # SVR and the tree-based models get the label-encoded frame
    if model_name in linear_model_names:
        train_features, test_features = X_train_onehot, X_test_onehot
    else:
        train_features, test_features = X_train_label, X_test_label

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)

    # Calculate mean squared error on the held-out split
    mse = mean_squared_error(y_test, y_pred)
    results[model_name] = mse
    print(f"{model_name} - Mean Squared Error: {mse}")

# Identify the best model based on lowest MSE
best_model_name = min(results, key=results.get)
print(f"\nBest Model: {best_model_name} with Mean Squared Error: {results[best_model_name]}")

# Retrain an unfitted copy of the best model on the full dataset, using the
# feature representation that model family was evaluated with
best_model = clone(models[best_model_name])
full_features = X if best_model_name in linear_model_names else label_encoded_X
best_model.fit(full_features, y)

# Save the best model to a file.
# NOTE: this is the same filename the goals training cell writes, so the
# goals model on disk is overwritten here. The assists prediction cell reads
# this exact path, which is why the filename is kept.
model_path = 'best_model.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(best_model, f)

# Report the path that was actually written (the message previously named a
# different file than the one opened above)
print(f"\nThe best model '{best_model_name}' has been saved to '{model_path}'")

Linear Regression - Mean Squared Error: 3.8236537952642258
Ridge Regression - Mean Squared Error: 3.326861999400568
Lasso Regression - Mean Squared Error: 2.988557005618711
Decision Tree - Mean Squared Error: 4.89390331973031
Random Forest - Mean Squared Error: 2.9456485749146
Gradient Boosting - Mean Squared Error: 2.884988183228712
XGBoost - Mean Squared Error: 3.36557681233255
SVR - Mean Squared Error: 2.8715897972669895

Best Model: SVR with Mean Squared Error: 2.8715897972669895

The best model 'SVR' has been saved to 'best_model_assist.pkl'


# Prediction for the Assists

In [17]:
import pandas as pd
import pickle
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Assists prediction: apply the pickled SVR pipeline to the new-season data
# and write a ranked CSV of rounded predictions.

# Load the saved SVR model.
# NOTE: unpickling can execute arbitrary code; only load model files that
# were produced by the training cell of this notebook.
with open('best_model.pkl', 'rb') as f:
    best_model = pickle.load(f)

# Load the new dataset
new_data_path = 'prediction_data_2024_2025.csv'
new_df = pd.read_csv(new_data_path)

# Drop the "Age" column and rows with NaN values
new_df = new_df.drop(columns=["Age"]).dropna()

# Categorical columns that were label encoded during training
categorical_columns = ["Player", "Squad", "Comp"]

# The model was trained on integer codes produced by LabelEncoders fitted on
# the training data. Fitting new encoders on the prediction data would assign
# different codes to the same player/squad/competition, so the training-time
# mapping is rebuilt from the training file instead.
training_data_path = 'training_data_with_assists_target.csv'
train_df = pd.read_csv(training_data_path)

label_encoded_df = new_df.copy()  # new_df itself keeps the original string values
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    le.fit(train_df[col])
    label_encoders[col] = le  # Save the encoders if needed for future use

    # LabelEncoder assigns each class its index in `classes_`
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    codes = new_df[col].map(code_by_label)

    # Values never seen in training have no code; mark them with -1 rather
    # than silently reusing another category's code
    unseen_count = int(codes.isna().sum())
    if unseen_count:
        print(f"Warning: {unseen_count} rows have unseen '{col}' values; encoded as -1")
    label_encoded_df[col] = codes.fillna(-1).astype(int)

# Prepare the feature set for prediction by dropping the target column (if present)
X_new = label_encoded_df.drop(columns=['target'], errors='ignore')

# Apply the scaler fitted during training, as SVR requires scaled data
scaler = best_model.named_steps['scaler']
X_new_scaled = scaler.transform(X_new)

# Make predictions using the fitted regressor
predictions = best_model.named_steps['regressor'].predict(X_new_scaled)

# Round the predicted assists to the nearest integer
rounded_predictions = np.round(predictions)

# Add the rounded predicted assists to the original (un-encoded) dataset
new_df['predicted Assists'] = rounded_predictions

# Keep only the necessary columns: "Player", "Squad", "Comp", and "predicted Assists"
final_df = new_df[["Player", "Squad", "Comp", "predicted Assists"]]

# Sort the dataframe by predicted assists in descending order
final_df = final_df.sort_values(by="predicted Assists", ascending=False)

# Save the final dataframe with predictions to a new CSV file
output_file_path = 'predictions_assists.csv'  # Update this path as needed
final_df.to_csv(output_file_path, index=False)

print(f"Predictions saved to '{output_file_path}'")


Predictions saved to 'predictions_assists.csv'
