In [5]:
# Importation des librairies
import json
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from lightgbm import LGBMRegressor

In [6]:
# Enable the pandas option 'future.no_silent_downcasting' for the whole session.
# NOTE(review): presumably this opts in to the upcoming pandas behaviour where
# operations no longer downcast dtypes silently - confirm against the pandas
# options documentation for the installed version.
pd.set_option('future.no_silent_downcasting', True)

In [9]:
def _read_csv_or_none(file_path):
    """Load ``file_path`` with ``pd.read_csv``; report and return ``None`` if it is missing."""
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None


def read_csv_files_to_dict(base_path, dimension, file_prefix, missing_percentages, strategies=None):
    """Load the polluted datasets of one data-quality dimension into a dict.

    Files are looked up under ``{base_path}/{dimension}/`` and are named
    ``{file_prefix}_{N}%.csv``, where ``N`` is the pollution level expressed
    as an integer percentage.

    Parameters
    ----------
    base_path : str
        Root folder that contains one sub-folder per dimension.
    dimension : str
        Name of the dimension sub-folder. Two values get special handling:
        ``'Completeness'`` reads one file per strategy from
        ``{base_path}/Completeness/{subfolder}/``, and ``'Unicity'`` reads
        files carrying an extra ``_2x`` suffix before ``.csv``.
    file_prefix : str
        Prefix of every file name.
    missing_percentages : iterable of float
        Pollution levels as fractions (``0.1`` means 10 %).
    strategies : iterable of (name, config, subfolder) tuples, optional
        Only used for ``'Completeness'``. ``name`` becomes the key of the
        inner dict and ``subfolder`` is the folder the file is read from; the
        middle element is not used by this function.

    Returns
    -------
    dict
        Maps ``"N%"`` to a DataFrame, or, for ``'Completeness'``, to a dict
        mapping strategy name to a DataFrame. A file that cannot be found is
        reported on stdout and stored as ``None``. For ``'Completeness'``
        without strategies the inner dicts are left empty.
    """
    dataframes_dict = {}

    for percentage in missing_percentages:
        # Rounding before truncating removes binary floating-point noise:
        # 0.29 * 100 evaluates to 28.999999999999996, which a bare int()
        # would turn into "28%" and make the lookup target the wrong file.
        percentage_key = f"{int(round(percentage * 100, 6))}%"

        if dimension == 'Completeness':
            per_strategy = dataframes_dict[percentage_key] = {}

            if strategies is None:
                print("No strategies provided for Completeness dimension.")
                continue

            for strategy_name, _strategy_config, subfolder in strategies:
                file_path = f"{base_path}/{dimension}/{subfolder}/{file_prefix}_{percentage_key}.csv"
                per_strategy[strategy_name] = _read_csv_or_none(file_path)
        else:
            # Unicity files carry an extra "_2x" marker in their name.
            suffix = "_2x" if dimension == 'Unicity' else ""
            file_path = f"{base_path}/{dimension}/{file_prefix}_{percentage_key}{suffix}.csv"
            dataframes_dict[percentage_key] = _read_csv_or_none(file_path)

    return dataframes_dict

def update_json_results(output_path, model_name, pollution_percentage, results):
    """Insert or replace the metrics of one model at one pollution level in a JSON file.

    The file has the layout
    ``{"models": [{"model": ..., "pollution_metrics": [{"pollution_percentage": ..., "metrics": ...}]}]}``.
    An existing file is loaded and updated; otherwise a new structure is
    created. The whole structure is then written back to ``output_path``.

    Parameters
    ----------
    output_path : str
        Path of the JSON results file. Missing parent directories are created
        so that results are not lost when the folder does not exist yet.
    model_name : str
        Value of the ``"model"`` field identifying the model entry.
    pollution_percentage : float
        Pollution level the metrics belong to; an entry with an equal value
        has its metrics replaced.
    results : dict
        JSON-serialisable metrics stored under ``"metrics"``.
    """
    # Load the existing JSON file, or start a new structure if there is none.
    if os.path.exists(output_path):
        with open(output_path, 'r', encoding='utf-8') as json_file:
            results_dict = json.load(json_file)
    else:
        results_dict = {}

    # setdefault keeps the function usable on a file that lacks the list.
    models = results_dict.setdefault("models", [])

    # Find the entry of the requested model, or add it.
    model_entry = next((model for model in models if model["model"] == model_name), None)
    if model_entry is None:
        model_entry = {
            "model": model_name,
            "pollution_metrics": []
        }
        models.append(model_entry)

    pollution_metrics = model_entry.setdefault("pollution_metrics", [])

    # Replace the metrics if this pollution level is already recorded,
    # otherwise append a new record.
    existing_entry = next(
        (item for item in pollution_metrics if item["pollution_percentage"] == pollution_percentage),
        None,
    )
    if existing_entry is None:
        pollution_metrics.append({
            "pollution_percentage": pollution_percentage,
            "metrics": results
        })
    else:
        existing_entry["metrics"] = results

    # Make sure the destination folder exists; dirname is '' for a bare file name.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write the updated results back to the JSON file.
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(results_dict, json_file, indent=4)

    print(f"Results saved to {output_path}")

def update_json_results_completeness(output_path, model_name, strategy, pollution_percentage, results):
    """Insert or replace the metrics of one model, imputation strategy and pollution level.

    The file has the layout
    ``{"models": [{"model": ..., "imputation_strategies": [{"strategy": ...,
    "pollution_metrics": [{"pollution_percentage": ..., "metrics": ...}]}]}]}``.
    An existing file is loaded and updated; otherwise a new structure is
    created. The whole structure is then written back to ``output_path``.

    Parameters
    ----------
    output_path : str
        Path of the JSON results file. Missing parent directories are created
        so that results are not lost when the folder does not exist yet.
    model_name : str
        Value of the ``"model"`` field identifying the model entry.
    strategy : str
        Value of the ``"strategy"`` field identifying the imputation strategy.
    pollution_percentage : float
        Pollution level the metrics belong to; an entry with an equal value
        has its metrics replaced.
    results : dict
        JSON-serialisable metrics stored under ``"metrics"``.
    """
    # Load the existing JSON file, or start a new structure if there is none.
    if os.path.exists(output_path):
        with open(output_path, 'r', encoding='utf-8') as json_file:
            results_dict = json.load(json_file)
    else:
        results_dict = {}

    # setdefault keeps the function usable on a file that lacks a list.
    models = results_dict.setdefault("models", [])

    # Find the entry of the requested model, or add it.
    model_entry = next((model for model in models if model["model"] == model_name), None)
    if model_entry is None:
        model_entry = {
            "model": model_name,
            "imputation_strategies": []
        }
        models.append(model_entry)

    imputation_strategies = model_entry.setdefault("imputation_strategies", [])

    # Find the entry of the current imputation strategy, or add it.
    strategy_entry = next((item for item in imputation_strategies if item["strategy"] == strategy), None)
    if strategy_entry is None:
        strategy_entry = {
            "strategy": strategy,
            "pollution_metrics": []
        }
        imputation_strategies.append(strategy_entry)

    pollution_metrics = strategy_entry.setdefault("pollution_metrics", [])

    # Replace the metrics if this pollution level is already recorded for the
    # strategy, otherwise append a new record.
    existing_entry = next(
        (item for item in pollution_metrics if item["pollution_percentage"] == pollution_percentage),
        None,
    )
    if existing_entry is None:
        pollution_metrics.append({
            "pollution_percentage": pollution_percentage,
            "metrics": results
        })
    else:
        existing_entry["metrics"] = results

    # Make sure the destination folder exists; dirname is '' for a bare file name.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write the updated results back to the JSON file.
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(results_dict, json_file, indent=4)

    print(f"Results saved to {output_path}")

def clean_column_names(df):
    """Sanitise the column labels of ``df`` in place and return the same frame.

    Every run of characters other than ASCII letters, digits or underscores
    is collapsed into a single underscore.
    """
    sanitized = df.columns.str.replace(pat='[^A-Za-z0-9_]+', repl='_', regex=True)
    df.columns = sanitized
    return df

def prepare_fps_data(fps_df):
    """Turn a raw FPS DataFrame into a feature matrix and a target vector.

    The ``GpuOpenCL`` and ``GpuShaderModel`` columns are cast to ``str``
    (presumably so that ``pd.get_dummies`` treats them as categorical rather
    than numeric - confirm against the data), the frame is one-hot encoded
    with ``drop_first=True``, and the column names are sanitised with
    ``clean_column_names``.

    Parameters
    ----------
    fps_df : pandas.DataFrame
        Must contain the columns ``GpuOpenCL``, ``GpuShaderModel`` and
        ``FPS``. It is left unmodified.

    Returns
    -------
    tuple
        ``(X, y)``: every encoded column except ``FPS``, and the ``FPS`` column.
    """
    # DataFrame.astype returns a new frame, so the caller's DataFrame (kept in
    # a shared dict by the notebook) is not altered by the cast.
    encoded = fps_df.astype({'GpuOpenCL': str, 'GpuShaderModel': str})
    encoded = pd.get_dummies(encoded, drop_first=True)

    encoded = clean_column_names(encoded)

    # Separate the independent variables (X) from the target variable (y).
    X = encoded.drop('FPS', axis=1)
    y = encoded['FPS']

    return X, y

def train_and_evaluate(X, y):
    """Cross-validate a default ``LGBMRegressor`` on ``(X, y)``.

    A shuffled 5-fold split with ``random_state=42`` is used. Mean squared
    error, mean absolute error and R2 are computed on each test fold.

    Returns
    -------
    dict
        The per-fold test scores averaged over the folds, under the keys
        ``"mean_squared_error"``, ``"mean_absolute_error"`` and ``"r2_score"``.
    """
    metric_functions = {
        'mse': mean_squared_error,
        'mae': mean_absolute_error,
        'r2': r2_score,
    }
    scorers = {name: make_scorer(function) for name, function in metric_functions.items()}

    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = cross_validate(LGBMRegressor(), X, y, cv=folds, scoring=scorers)

    # Map each reported metric name to the cross_validate key holding its
    # per-fold test scores.
    reported = {
        "mean_squared_error": 'test_mse',
        "mean_absolute_error": 'test_mae',
        "r2_score": 'test_r2',
    }
    return {label: fold_scores[column].mean() for label, column in reported.items()}

## Completeness

In [10]:
# Pollution levels (as fractions) for which a dataset is loaded.
missing_percentages = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

# Imputation strategy descriptions: one method for numerical columns and one
# for categorical columns.
strategy_delete = {
    'numerical': 'delete',
    'categorical': 'delete'
}

strategy_mean_mode = {
    'numerical': 'mean',
    'categorical': 'mode'
}

strategy_median_new = {
    'numerical': 'median',
    'categorical': 'new'
}

strategy_decision_tree = {
    'numerical': 'decision_tree',
    'categorical': 'decision_tree'
}

strategy_mean_new = {
    'numerical': 'mean',
    'categorical': 'new'
}

strategy_knn_mode = {
    'numerical': 'knn',
    'categorical': 'mode'
}

# (display name, strategy description, sub-folder holding the imputed files).
# strategy_delete and strategy_knn_mode are defined above but not evaluated here.
strategies = [
    ("Mean and Mode", strategy_mean_mode, "Mean and Mode"),
    ("Median and New", strategy_median_new, "Median and New"),
    ("Decision Tree", strategy_decision_tree, "Decision Tree"),
    ("Mean and New", strategy_mean_new, "Mean and New")
]

fps_df_dict = read_csv_files_to_dict('../../Data/Regression/fps in video games', 'Completeness', 'fps', missing_percentages, strategies)

# Path of the JSON file where the results are saved
output_path = "../../Results/FPS in video games/Completeness.json"

# Evaluate the model on every (pollution level, imputation strategy) dataset
for key, fps_df_strategies in fps_df_dict.items():
    # Extract the pollution percentage from the key (e.g. "10%" -> 10.0);
    # it is the same for every strategy of this level.
    pollution_percentage = float(key.replace('%', ''))

    for strategy_name, fps_df in fps_df_strategies.items():
        # read_csv_files_to_dict stores None for files it could not find;
        # skip those instead of crashing inside prepare_fps_data.
        if fps_df is None:
            print(f"Skipping {strategy_name} at {key}: dataset could not be loaded.")
            continue

        X, y = prepare_fps_data(fps_df)

        results = train_and_evaluate(X, y)

        # Store the metrics of this strategy and pollution level in the JSON file
        update_json_results_completeness(output_path, "LightGBM", strategy_name, pollution_percentage, results)

  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014315 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2763
[LightGBM] [Info] Number of data points in the train set: 340666, number of used features: 550
[LightGBM] [Info] Start training from score 138.539805
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012787 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2770
[LightGBM] [Info] Number of data points in the train set: 340666, number of used features: 548
[LightGBM] [Info] Start training from score 138.601377
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012520 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

## Consistent Representation

In [11]:
# Pollution levels (as fractions) for which a dataset is loaded.
pollution_percentage_levels = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
fps_df_dict = read_csv_files_to_dict('../../Data/Regression/fps in video games', 'Consistent Representation', 'fps', pollution_percentage_levels)

# Path of the JSON file where the results are saved
output_path = "../../Results/FPS in video games/Consistent Representation.json"

# Evaluate the model on the dataset of each pollution level
for key, fps_df in fps_df_dict.items():
    # read_csv_files_to_dict stores None for files it could not find;
    # skip those instead of crashing inside prepare_fps_data.
    if fps_df is None:
        print(f"Skipping {key}: dataset could not be loaded.")
        continue

    X, y = prepare_fps_data(fps_df)

    results = train_and_evaluate(X, y)

    # Extract the pollution percentage from the key (e.g. "10%" -> 10.0)
    pollution_percentage = float(key.replace('%', ''))

    # Store the metrics of this pollution level in the JSON file
    update_json_results(output_path, "LightGBM", pollution_percentage, results)

  df = pd.read_csv(file_path)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017218 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2508
[LightGBM] [Info] Number of data points in the train set: 340666, number of used features: 549
[LightGBM] [Info] Start training from score 138.539805
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016714 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2515
[LightGBM] [Info] Number of data points in the train set: 340666, number of used features: 547
[LightGBM] [Info] Start training from score 138.601377
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015735 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

## Feature Accuracy

In [12]:
# Pollution levels (as fractions) for which a dataset is loaded.
pollution_percentage_levels = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
fps_df_dict = read_csv_files_to_dict('../../Data/Regression/fps in video games', 'Feature Accuracy', 'fps', pollution_percentage_levels)

# Path of the JSON file where the results are saved
output_path = "../../Results/FPS in video games/Feature Accuracy.json"

# Evaluate the model on the dataset of each pollution level
for key, fps_df in fps_df_dict.items():
    # read_csv_files_to_dict stores None for files it could not find;
    # skip those instead of crashing inside prepare_fps_data.
    if fps_df is None:
        print(f"Skipping {key}: dataset could not be loaded.")
        continue

    X, y = prepare_fps_data(fps_df)

    results = train_and_evaluate(X, y)

    # Extract the pollution percentage from the key (e.g. "10%" -> 10.0)
    pollution_percentage = float(key.replace('%', ''))

    # Store the metrics of this pollution level in the JSON file
    update_json_results(output_path, "LightGBM", pollution_percentage, results)

  df = pd.read_csv(file_path)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015001 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2508
[LightGBM] [Info] Number of data points in the train set: 340666, number of used features: 549
[LightGBM] [Info] Start training from score 138.539805
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018803 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2515
[LightGBM] [Info] Number of data points in the train set: 340666, number of used features: 547
[LightGBM] [Info] Start training from score 138.601377
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015513 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

## Target Accuracy

In [13]:
# Pollution levels (as fractions) for which a dataset is loaded.
pollution_percentage_levels = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
fps_df_dict = read_csv_files_to_dict('../../Data/Regression/fps in video games', 'Target Accuracy', 'fps', pollution_percentage_levels)

# Path of the JSON file where the results are saved
output_path = "../../Results/FPS in video games/Target Accuracy.json"

# Evaluate the model on the dataset of each pollution level
for key, fps_df in fps_df_dict.items():
    # read_csv_files_to_dict stores None for files it could not find;
    # skip those instead of crashing inside prepare_fps_data.
    if fps_df is None:
        print(f"Skipping {key}: dataset could not be loaded.")
        continue

    X, y = prepare_fps_data(fps_df)

    results = train_and_evaluate(X, y)

    # Extract the pollution percentage from the key (e.g. "10%" -> 10.0)
    pollution_percentage = float(key.replace('%', ''))

    # Store the metrics of this pollution level in the JSON file
    update_json_results(output_path, "LightGBM", pollution_percentage, results)

  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015689 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2508
[LightGBM] [Info] Number of data points in the train set: 340666, number of used features: 549
[LightGBM] [Info] Start training from score 138.539805
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017913 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2515
[LightGBM] [Info] Number of data points in the train set: 340666, number of used features: 547
[LightGBM] [Info] Start training from score 138.601377
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013493 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

## Unicity

In [14]:
# Pollution levels (as fractions) for which a dataset is loaded.
pollution_percentage_levels = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
fps_df_dict = read_csv_files_to_dict('../../Data/Regression/fps in video games', 'Unicity', 'fps', pollution_percentage_levels)

# Path of the JSON file where the results are saved
output_path = "../../Results/FPS in video games/Unicity.json"

# Evaluate the model on the dataset of each pollution level
for key, fps_df in fps_df_dict.items():
    # read_csv_files_to_dict stores None for files it could not find;
    # skip those instead of crashing inside prepare_fps_data.
    if fps_df is None:
        print(f"Skipping {key}: dataset could not be loaded.")
        continue

    X, y = prepare_fps_data(fps_df)

    results = train_and_evaluate(X, y)

    # Extract the pollution percentage from the key (e.g. "10%" -> 10.0)
    pollution_percentage = float(key.replace('%', ''))

    # Store the metrics of this pollution level in the JSON file
    update_json_results(output_path, "LightGBM", pollution_percentage, results)

  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015232 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2508
[LightGBM] [Info] Number of data points in the train set: 340666, number of used features: 549
[LightGBM] [Info] Start training from score 138.539805
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016138 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2515
[LightGBM] [Info] Number of data points in the train set: 340666, number of used features: 547
[LightGBM] [Info] Start training from score 138.601377
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013208 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n