In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
# Data loading and preprocessing
def load_and_preprocess_data(file_path):
    """Read a CSV file and return a cleaned DataFrame.

    The file is read with ``low_memory=False``. Exact duplicate rows are
    removed first, then every row that still contains at least one missing
    value is dropped. The index is not reset, so surviving rows keep the
    labels they had in the raw file.

    Args:
        file_path: Location of the CSV file, forwarded to ``pd.read_csv``.

    Returns:
        A new ``pandas.DataFrame`` with no duplicate rows and no missing values.
    """
    raw_frame = pd.read_csv(file_path, low_memory=False)
    # Both calls return new frames, so the frame read from disk is never mutated.
    return raw_frame.drop_duplicates().dropna()

In [3]:
# Feature engineering
def feature_engineering(data):
    """Restrict the frame to the selected columns and add ``total_cards``.

    Args:
        data: DataFrame that contains every column listed in ``kept_columns``
            below; a missing column raises ``KeyError``.

    Returns:
        A new DataFrame holding only the selected columns plus a trailing
        ``total_cards`` column equal to ``yellow_cards + red_cards``. The
        input frame is left unchanged.
    """
    kept_columns = [
        'name', 'position', 'team_x', 'value', 'goals_scored', 'assists',
        'clean_sheets', 'total_points', 'yellow_cards', 'red_cards',
    ]
    subset = data[kept_columns]
    # assign() returns a copy with the new column appended as the last column.
    return subset.assign(total_cards=subset['yellow_cards'] + subset['red_cards'])

In [4]:
# Split the data into training and test sets
def split_data(data, test_size=0.2, random_state=42):
    """Build the predictor matrix and split it into training and test sets.

    ``total_points`` is the target. ``goals_scored``, ``assists`` and
    ``total_points`` are removed from the predictors, and the remaining
    columns are one-hot encoded with ``pd.get_dummies`` before splitting.

    Args:
        data: DataFrame containing at least ``goals_scored``, ``assists``
            and ``total_points``.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.2, the
            value that was previously hard-coded.
        random_state: Seed forwarded to ``train_test_split``. Defaults to 42,
            the value that was previously hard-coded.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    target = data['total_points']
    predictors = data.drop(columns=['goals_scored', 'assists', 'total_points'])
    # NOTE(review): get_dummies expands every non-numeric column, so a
    # high-cardinality identifier column (e.g. a player name) produces one
    # indicator column per distinct value -- confirm this is intended.
    encoded_predictors = pd.get_dummies(predictors)
    X_train, X_test, y_train, y_test = train_test_split(
        encoded_predictors,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test

In [5]:
# Train the gradient boosting model
def train_gradient_boosting(X_train, y_train, random_state=42, **model_params):
    """Fit a ``GradientBoostingRegressor`` on the training data.

    Args:
        X_train: Training predictors, passed to ``fit``.
        y_train: Training target, passed to ``fit``.
        random_state: Seed given to the estimator so that repeated runs fit
            the same model. Defaults to 42; pass ``None`` to restore the
            previous unseeded behaviour.
        **model_params: Extra keyword arguments forwarded unchanged to the
            ``GradientBoostingRegressor`` constructor.

    Returns:
        The fitted estimator.
    """
    model = GradientBoostingRegressor(random_state=random_state, **model_params)
    model.fit(X_train, y_train)
    return model

In [6]:
# Model evaluation
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Args:
        model: Object exposing ``predict(X_test)``.
        X_test: Predictors passed to ``model.predict``.
        y_test: True target values the predictions are compared against.

    Returns:
        A 3-tuple ``(mse, r2, mae)`` computed with ``mean_squared_error``,
        ``r2_score`` and ``mean_absolute_error`` respectively.
    """
    predictions = model.predict(X_test)
    return (
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
        mean_absolute_error(y_test, predictions),
    )

In [7]:
# Load the data
# The CSV location can be overridden through the RSPL_DATA_PATH environment
# variable so the notebook can run on other machines; when the variable is
# not set, the original hard-coded location is used.
file_path = os.environ.get(
    "RSPL_DATA_PATH",
    "/Users/melusinecaillard/RSPL/data/cleaned_merged_seasons.csv",
)
# Keep the cleaned frame under its own name so each stage stays inspectable.
cleaned_data = load_and_preprocess_data(file_path)
data = feature_engineering(cleaned_data)

In [8]:
# Split the data into training and test sets
(X_train, X_test, y_train, y_test) = split_data(data=data)

In [9]:
# Train the gradient boosting model
model = train_gradient_boosting(X_train=X_train, y_train=y_train)

In [10]:
# Evaluate the model on the held-out test set
mse, r2, mae = evaluate_model(model, X_test, y_test)
# Print each metric on its own line, label first.
for label, score in (
    ("Mean Squared Error:", mse),
    ("R-squared:", r2),
    ("Mean Absolute Error:", mae),
):
    print(label, score)

Mean Squared Error: 3.253635747493749
R-squared: 0.44884288452112187
Mean Absolute Error: 1.0143368820600736


In [11]:
# Cross-validation (5 folds) on the training set
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
# The scorer returns negated MSE values, so flip the sign before the square root.
rmse_scores = (-1 * cv_scores) ** 0.5
print("Cross-validated RMSE scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())

Cross-validated RMSE scores: [1.73793187 1.7648511  1.81582438 1.78462249 1.79012545]
Mean RMSE: 1.7786710573305924
