In [7]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

In [8]:
def impute_numerical_by_delete(df):
    """Return a copy of ``df`` without the rows that lack a numeric value.

    A row is removed as soon as one of its numeric columns is NaN; missing
    values in non-numeric columns never cause a row to be dropped. The input
    frame is not modified.
    """
    snapshot = df.copy()
    numeric_names = snapshot.select_dtypes(include=np.number).columns
    return snapshot.dropna(subset=numeric_names)


def impute_numerical_by_mean(df):
    """Return a copy of ``df`` whose numeric NaNs are replaced by the column mean.

    Only numeric columns holding at least one observed value are imputed.
    scikit-learn's ``SimpleImputer`` discards features that are entirely
    missing, so passing such a column would make the returned array narrower
    than the selection it is written back to; those columns are left as NaN.
    A frame with nothing to impute (no numeric column, or none with an
    observed value) is returned as a plain copy instead of raising.

    Note: the imputer returns floats, so integer columns come back as floats.
    """
    df_imputed = df.copy()
    # Select once, and keep only the columns the imputer can actually fit
    numeric_part = df_imputed.select_dtypes(include='number').dropna(axis=1, how='all')
    if numeric_part.shape[1] == 0:
        return df_imputed

    imputer = SimpleImputer(strategy='mean')
    df_imputed[numeric_part.columns] = imputer.fit_transform(numeric_part)
    return df_imputed


def impute_numerical_by_median(df):
    """Return a copy of ``df`` whose numeric NaNs are replaced by the column median.

    Only numeric columns holding at least one observed value are imputed.
    scikit-learn's ``SimpleImputer`` discards features that are entirely
    missing, which would break the write-back, so those columns are left as
    NaN. A frame with nothing to impute is returned as a plain copy instead
    of raising.

    Note: the imputer returns floats, so integer columns come back as floats.
    """
    df_imputed = df.copy()
    # Select once, and keep only the columns the imputer can actually fit
    numeric_part = df_imputed.select_dtypes(include='number').dropna(axis=1, how='all')
    if numeric_part.shape[1] == 0:
        return df_imputed

    imputer = SimpleImputer(strategy='median')
    df_imputed[numeric_part.columns] = imputer.fit_transform(numeric_part)
    return df_imputed


def impute_numerical_by_knn(df):
    """Return a copy of ``df`` whose numeric NaNs are filled by ``KNNImputer``.

    The imputer is built with ``n_neighbors=5`` and fitted on the numeric
    columns only. Numeric columns without any observed value are left out
    (and stay NaN): scikit-learn imputers discard such features, which would
    make the result narrower than the selection it is written back to.
    A frame with nothing to impute is returned as a plain copy instead of
    raising.
    """
    df_imputed = df.copy()
    # Select once, and keep only the columns the imputer can actually fit
    numeric_part = df_imputed.select_dtypes(include='number').dropna(axis=1, how='all')
    if numeric_part.shape[1] == 0:
        return df_imputed

    imputer = KNNImputer(n_neighbors=5)
    df_imputed[numeric_part.columns] = imputer.fit_transform(numeric_part)
    return df_imputed


def impute_numerical_by_multiple(df):
    """Return a copy of ``df`` whose numeric NaNs are filled by ``IterativeImputer``.

    A default-configured ``IterativeImputer`` is fitted on the numeric
    columns; despite the function name, a single completed frame is returned.
    Numeric columns without any observed value are left out (and stay NaN):
    scikit-learn imputers discard such features, which would make the result
    narrower than the selection it is written back to. A frame with nothing
    to impute is returned as a plain copy instead of raising.
    """
    df_imputed = df.copy()
    # Select once, and keep only the columns the imputer can actually fit
    numeric_part = df_imputed.select_dtypes(include='number').dropna(axis=1, how='all')
    if numeric_part.shape[1] == 0:
        return df_imputed

    imputer = IterativeImputer()
    df_imputed[numeric_part.columns] = imputer.fit_transform(numeric_part)
    return df_imputed


def impute_numerical_by_decision_tree(df):
    """Return a copy of ``df`` whose numeric NaNs are predicted by decision trees.

    For every numeric column with missing values, a ``DecisionTreeRegressor``
    is trained on the rows where the column is known, using all other columns
    as predictors (object columns are one-hot encoded), and its predictions
    replace the missing entries. Columns are processed in order on the working
    copy, so values imputed for an earlier column are visible as predictors
    for the later ones.

    A numeric column with no observed value is skipped and stays NaN, because
    there is nothing to train on. The tree uses a fixed ``random_state`` so
    repeated runs give the same result.

    NOTE(review): NaNs in the predictor columns are handed to the tree as-is;
    presumably this relies on a scikit-learn version whose trees accept
    missing values - confirm.
    """
    df_imputed = df.copy()
    numerical_columns = df_imputed.select_dtypes(include='number').columns
    categorical_columns = df_imputed.select_dtypes(include='object').columns

    for col in numerical_columns:
        missing_mask = df_imputed[col].isna()
        # Nothing to fill, or nothing to learn from
        if not missing_mask.any() or missing_mask.all():
            continue

        known_data = df_imputed[~missing_mask]
        unknown_data = df_imputed[missing_mask]

        # One-hot encode the object columns of each subset, without the target
        known_features = pd.get_dummies(
            known_data, columns=categorical_columns, drop_first=False
        ).drop(columns=[col])
        unknown_features = pd.get_dummies(
            unknown_data, columns=categorical_columns, drop_first=False
        ).drop(columns=[col])

        # Force the prediction matrix onto the training layout: dummy columns
        # absent from the unknown rows are added as 0, extra ones are dropped
        known_features, unknown_features = known_features.align(
            unknown_features, join='left', axis=1, fill_value=0
        )

        model = DecisionTreeRegressor(random_state=42)
        model.fit(known_features, known_data[col])

        df_imputed.loc[missing_mask, col] = model.predict(unknown_features)

    return df_imputed


# Imputation strategies for categorical (object-dtype) columns
def impute_categorical_by_delete(df):
    """Return a copy of ``df`` without the rows that lack a categorical value.

    A row is removed as soon as one of its object-dtype columns is missing;
    NaNs in other columns never cause a row to be dropped. The input frame is
    not modified.
    """
    snapshot = df.copy()
    object_names = snapshot.select_dtypes(include='object').columns
    return snapshot.dropna(subset=object_names)


def impute_categorical_by_mode(df):
    """Return a copy of ``df`` whose missing categorical values get the column mode.

    Only object-dtype columns holding at least one observed value are
    imputed. scikit-learn's ``SimpleImputer`` discards features that are
    entirely missing, which would break the write-back, so those columns are
    left untouched. A frame with nothing to impute (no object column, or none
    with an observed value) is returned as a plain copy instead of raising.
    """
    df_imputed = df.copy()
    # Select once, and keep only the columns the imputer can actually fit
    categorical_part = df_imputed.select_dtypes(include='object').dropna(axis=1, how='all')
    if categorical_part.shape[1] == 0:
        return df_imputed

    imputer = SimpleImputer(strategy='most_frequent')
    df_imputed[categorical_part.columns] = imputer.fit_transform(categorical_part)
    return df_imputed


def impute_categorical_by_new_category(df):
    """Return a copy of ``df`` whose missing categorical values become ``'Missing'``.

    Only object-dtype columns are filled, in line with the other categorical
    imputers. Filling the whole frame would also write the string
    ``'Missing'`` into numeric columns that still contain NaN and silently
    turn them into object columns.
    """
    df_imputed = df.copy()
    categorical_cols = df_imputed.select_dtypes(include='object').columns
    if len(categorical_cols) > 0:
        df_imputed[categorical_cols] = df_imputed[categorical_cols].fillna('Missing')
    return df_imputed


def impute_categorical_by_hot_deck(df, random_state=None):
    """Return a copy of ``df`` whose missing values are drawn from observed ones.

    For each column with missing entries, replacements are sampled with
    replacement from the column's own non-missing values.

    Note: despite the name, every column of the frame is processed, not only
    the object-dtype ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; it is not modified.
    random_state : int, optional
        Seed for a dedicated ``numpy.random.RandomState``. When omitted, the
        global ``numpy.random`` state is used, as before.

    A column with no observed value has no donor to sample from; it is
    skipped and left missing instead of raising.
    """
    df_imputed = df.copy()
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    for col in df_imputed.columns:
        missing_indices = df_imputed[col].isna()
        if not missing_indices.any():
            continue

        available_values = df_imputed[col].dropna().values
        if available_values.size == 0:
            # Empty donor pool: sampling from it would raise
            continue

        imputed_values = rng.choice(available_values, size=missing_indices.sum())
        df_imputed.loc[missing_indices, col] = imputed_values

    return df_imputed


def impute_categorical_by_decision_tree(df):
    """Return a copy of ``df`` whose missing categorical values are predicted.

    For every object-dtype column with missing values, a
    ``DecisionTreeClassifier`` (``random_state=42``) is trained on the rows
    where the column is known, using all other columns as predictors (object
    columns are one-hot encoded), and its predictions replace the missing
    entries. Columns are processed in order on the working copy, so values
    imputed for an earlier column are visible as predictors for later ones.
    A column with no observed value is skipped.

    The two row subsets are encoded with ``drop_first=False``. With
    ``drop_first=True`` each subset drops its own first category, so after
    alignment a row could end up encoded as a different category than the one
    it holds (e.g. a subset containing only 'B' loses its 'B' column and then
    looks like the training reference category).
    """
    df_imputed = df.copy()

    object_columns = df_imputed.select_dtypes(include=['object']).columns

    for col in object_columns:
        missing_mask = df_imputed[col].isnull()
        if not missing_mask.any():
            continue

        # Split the rows on whether the current column is known
        df_missing = df_imputed[missing_mask]
        df_not_missing = df_imputed[~missing_mask]

        # If all values are missing, there is nothing to learn from
        if df_not_missing.empty:
            continue

        # Target as plain strings so the classifier sees uniform labels
        y = df_not_missing[col].astype(str)

        # Encode both subsets without dropping any category (see docstring)
        X = pd.get_dummies(df_not_missing.drop(columns=[col]), drop_first=False)
        X_missing = pd.get_dummies(df_missing.drop(columns=[col]), drop_first=False)

        # Force the prediction matrix onto the training layout: dummy columns
        # absent from the missing rows are added as 0, extra ones are dropped
        X, X_missing = X.align(X_missing, join='left', axis=1, fill_value=0)

        # Keep the first occurrence of any duplicated column label
        X = X.loc[:, ~X.columns.duplicated()]
        X_missing = X_missing.loc[:, ~X_missing.columns.duplicated()]

        model = DecisionTreeClassifier(random_state=42)
        model.fit(X, y)

        df_imputed.loc[missing_mask, col] = model.predict(X_missing)

    return df_imputed

## CHARGEMENT DES DONNÉES

In [3]:
# Load the three cleaned datasets (regression, classification and clustering folders)
fps_df, heart_df, retail_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../../Data/Regression/fps in video games/fps-in-video-games_clean.csv',
        '../../../Data/Classification/Indicators of Heart Disease/heart_2022_no_nans_clean.csv',
        '../../../Data/Clustering/retail Data/retail_data_clean.csv',
    )
)

  fps_df = pd.read_csv('../../../Data/Regression/fps in video games/fps-in-video-games_clean.csv')


## SUPPRESSION DES DONNÉES

In [4]:
def introduce_missing_values_mcar(df, missing_percentages, target=None, random_state=None):
    """Build copies of ``df`` with values removed completely at random.

    For each requested fraction, every column except ``target`` gets
    ``floor(fraction * n_rows)`` entries replaced by NaN, the rows being drawn
    independently per column without replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    missing_percentages : iterable of float
        Fractions in [0, 1].
    target : column label, optional
        Column that must be left intact.
    random_state : int, optional
        Seed for a dedicated ``numpy.random.RandomState``. When omitted, the
        global ``numpy.random`` state is used, as before.

    Returns
    -------
    dict
        Maps ``"<pct>%"`` to the degraded frame. The label is
        ``int(fraction * 100)``, which truncates; ``process_files`` builds
        its file names the same way.

    Raises
    ------
    ValueError
        If a fraction is outside [0, 1] or ``target`` is not a column.

    Rows are selected by position rather than by index label, so a frame with
    duplicated index labels loses exactly the requested number of values per
    column.
    """
    # Validate the requested fractions
    if not all(0 <= p <= 1 for p in missing_percentages):
        raise ValueError("Les pourcentages doivent être entre 0 et 1.")

    # "is not None" so that a falsy label such as column 0 is still honoured
    if target is not None and target not in df.columns:
        raise ValueError(f"La colonne target '{target}' n'existe pas dans le DataFrame.")

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    n_rows, n_cols = df.shape
    df_dict = {}

    for percentage in missing_percentages:
        # Same count for every column, so compute it once per fraction
        n_missing = int(np.floor(percentage * n_rows))

        # Boolean grid of the cells to blank out
        to_blank = np.zeros((n_rows, n_cols), dtype=bool)
        for col_pos, col in enumerate(df.columns):
            if target is not None and col == target:
                continue  # the target column is never degraded
            rows = rng.choice(n_rows, n_missing, replace=False)
            to_blank[rows, col_pos] = True

        # mask() returns a new frame with NaN where the grid is True
        df_dict[f"{int(percentage * 100)}%"] = df.mask(to_blank)

    return df_dict

In [5]:
# Fractions of values to remove from each dataset
missing_percentages = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# One {"<pct>%": DataFrame} mapping per dataset; the supervised datasets keep
# their target column intact, the retail dataset has no target
fps_df_with_missing = introduce_missing_values_mcar(
    df=fps_df,
    missing_percentages=missing_percentages,
    target='FPS',
)
heart_df_with_missing = introduce_missing_values_mcar(
    df=heart_df,
    missing_percentages=missing_percentages,
    target='HadHeartAttack',
)
retail_df_with_missing = introduce_missing_values_mcar(
    df=retail_df,
    missing_percentages=missing_percentages,
)

In [6]:
# Write every degraded frame to its dataset's Completeness/NaN folder,
# one CSV per missing-value label and without the index column
for label, frame in fps_df_with_missing.items():
    frame.to_csv(
        f"../../../Data/Regression/fps in video games/Completeness/NaN/fps_{label}.csv",
        index=False,
    )

for label, frame in heart_df_with_missing.items():
    frame.to_csv(
        f"../../../Data/Classification/Indicators of Heart Disease/Completeness/NaN/heart_{label}.csv",
        index=False,
    )

for label, frame in retail_df_with_missing.items():
    frame.to_csv(
        f"../../../Data/Clustering/retail Data/Completeness/NaN/retail_{label}.csv",
        index=False,
    )

## Correction des données avec différentes stratégies

In [9]:
def impute_dataset(df, strategy):
    """Impute a frame's numeric columns, then its categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; the imputers receive a copy of it.
    strategy : dict
        ``strategy['numerical']`` is one of 'delete', 'mean', 'median', 'knn',
        'multiple', 'decision_tree'; ``strategy['categorical']`` is one of
        'delete', 'mode', 'new', 'hot_deck', 'decision_tree'.

    Returns
    -------
    pandas.DataFrame
        Output of the categorical imputer applied to the output of the
        numerical one.

    Raises
    ------
    ValueError
        If either strategy name is unknown. The categorical name is only
        checked after the numerical imputation has run.
    """
    numerical_imputers = {
        'delete': impute_numerical_by_delete,
        'mean': impute_numerical_by_mean,
        'median': impute_numerical_by_median,
        'knn': impute_numerical_by_knn,
        'multiple': impute_numerical_by_multiple,
        'decision_tree': impute_numerical_by_decision_tree,
    }
    categorical_imputers = {
        'delete': impute_categorical_by_delete,
        'mode': impute_categorical_by_mode,
        'new': impute_categorical_by_new_category,
        'hot_deck': impute_categorical_by_hot_deck,
        'decision_tree': impute_categorical_by_decision_tree,
    }

    working_copy = df.copy()

    # Numerical pass first
    fill_numerical = numerical_imputers.get(strategy['numerical'])
    if fill_numerical is None:
        raise ValueError("Unknown numerical imputation strategy")
    working_copy = fill_numerical(working_copy)

    # Categorical pass on the numerically imputed frame
    fill_categorical = categorical_imputers.get(strategy['categorical'])
    if fill_categorical is None:
        raise ValueError("Unknown categorical imputation strategy")
    return fill_categorical(working_copy)

# Strategy presets: each pairs a numerical imputer name with a categorical one,
# using the names understood by impute_dataset
strategy_delete = dict(numerical='delete', categorical='delete')

strategy_mean_mode = dict(numerical='mean', categorical='mode')

strategy_median_new = dict(numerical='median', categorical='new')

strategy_decision_tree = dict(numerical='decision_tree', categorical='decision_tree')

strategy_mean_new = dict(numerical='mean', categorical='new')

strategy_knn_mode = dict(numerical='knn', categorical='mode')

In [13]:
def process_files(base_path, file_prefix, missing_percentages, strategies):
    """Impute every degraded CSV of a dataset with every listed strategy.

    For each fraction, ``{base_path}/NaN/{file_prefix}_{pct}%.csv`` is read
    (``pct`` being ``int(fraction * 100)``), the fraction is printed as a
    progress marker, and one imputed copy per strategy is written to
    ``{base_path}/{subfolder}/`` under the same file name.

    Parameters
    ----------
    base_path : str
        Dataset folder holding the ``NaN`` input folder and the output folders.
    file_prefix : str
        Prefix of the CSV file names.
    missing_percentages : iterable of float
        Fractions whose files are processed.
    strategies : iterable of (name, strategy, subfolder)
        ``strategy`` is the dict handed to ``impute_dataset``; ``name`` is
        informational only.

    The output is written without the index column, like the input files, so
    reloading it does not add an extra ``Unnamed: 0`` column. The output
    folder is created when missing, so a long imputation is not lost at
    save time.
    """
    import os  # local import: only needed to create the output folders

    for percentage in missing_percentages:
        file_name = f"{file_prefix}_{int(percentage * 100)}%.csv"

        # Read the degraded dataset for this fraction
        df = pd.read_csv(f"{base_path}/NaN/{file_name}")
        print(percentage)

        # Apply each strategy and save its result in the strategy's subfolder
        for _strategy_name, strategy, subfolder in strategies:
            df_imputed = impute_dataset(df, strategy)
            output_dir = f"{base_path}/{subfolder}"
            os.makedirs(output_dir, exist_ok=True)
            df_imputed.to_csv(f"{output_dir}/{file_name}", index=False)

# Base "Completeness" folder of each dataset
heart_base_path = "../../../Data/Classification/Indicators of Heart Disease/Completeness"
retail_base_path = "../../../Data/Clustering/retail Data/Completeness"
fps_base_path = "../../../Data/Regression/fps in video games/Completeness"

# Fractions whose degraded files are processed in this run
missing_percentages = [0, 0.1, 0.2, 0.3]

# Imputation strategies to run, as (name, strategy dict, output subfolder) tuples.
# Uncomment an entry to enable it.
strategies = [
    # ("Delete lines", strategy_delete, "Delete lines"),
    # ("Mean and Mode", strategy_mean_mode, "Mean and Mode"),
    # ("Median and New", strategy_median_new, "Median and New"),
    # ("Decision Tree", strategy_decision_tree, "Decision Tree"),
    # ("Mean and New", strategy_mean_new, "Mean and New"),
    ("KNN and Mode", strategy_knn_mode, "KNN and Mode"),
]

# Run the pipeline; only the fps dataset is currently enabled
# process_files(heart_base_path, "heart", missing_percentages, strategies)
# process_files(retail_base_path, "retail", missing_percentages, strategies)
process_files(
    base_path=fps_base_path,
    file_prefix="fps",
    missing_percentages=missing_percentages,
    strategies=strategies,
)

  df = pd.read_csv(f"{base_path}/NaN/{file_prefix}_{int(percentage * 100)}%.csv")


0


  df = pd.read_csv(f"{base_path}/NaN/{file_prefix}_{int(percentage * 100)}%.csv")


0.1


KeyboardInterrupt: 