In [3]:
import warnings
warnings.filterwarnings('error')
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated.")

import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_absolute_error, mean_squared_error
import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest

# Load the diabetes dataset
data = load_diabetes()

# Create a DataFrame holding the complete feature matrix; it is used both as
# the source for the masked copies and as the reference for the error metrics
df = pd.DataFrame(data.data, columns=data.feature_names)

# List containing the range of the percentages of missing data
missing_data_percentages = [5, 10, 15, 20, 25]

# Number of times the mask-and-impute experiment is repeated per percentage
num_iterations = 5

# Per-percentage result tables. They are concatenated once after the loop so
# that an empty DataFrame is never passed to pd.concat (deprecated behaviour)
# and the accumulated table is not re-copied on every pass.
results_frames = []

# Iterate through each missing percentage of data
for missing_percentage in missing_data_percentages:
    # Empty lists to contain the performance metric for each of the iterations
    mae_list = []
    mse_list = []
    rmse_list = []

    # Number of values to blank out per column. It depends only on the
    # percentage, so it is computed once per percentage rather than once per
    # iteration.
    rows_to_remove = int(df.shape[0] * (missing_percentage / 100))

    for _ in range(num_iterations):

        # Create a duplicate of the original DataFrame
        df_with_missing = df.copy()

        # Iterate over each column and remove the specified percentage of the
        # values within that column randomly. The same number of data points
        # is removed from each column, but the rows are drawn independently
        # per column.
        for col in df.columns:
            random_indices = np.random.choice(df.shape[0], size=rows_to_remove, replace=False)
            df_with_missing.loc[random_indices, col] = np.nan

        # Impute missing values using MissForest
        randForest_imputer = MissForest(criterion='squared_error', max_features=1.0)
        df_imputed = randForest_imputer.fit_transform(df_with_missing)

        # Convert the newly imputed values into a DataFrame
        df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

        # Calculate the MAE, MSE and RMSE between the imputed matrix and the
        # original matrix.
        # NOTE(review): the metrics are taken over every cell, including the
        # cells that were never masked. If the imputer leaves observed cells
        # unchanged, those cells contribute zero error and the scores scale
        # with the missing fraction - consider scoring only the masked cells
        # (df_with_missing.isna()) if per-imputed-value error is what is wanted.
        mae = mean_absolute_error(df.values, df_imputed.values)
        mse = mean_squared_error(df.values, df_imputed.values)
        rmse = np.sqrt(mse)

        # Append the metrics to their lists
        mae_list.append(mae)
        mse_list.append(mse)
        rmse_list.append(rmse)

    # Calculate the average metrics for the current percentage of missing data in the dataset
    average_mae = np.mean(mae_list)
    average_mse = np.mean(mse_list)
    average_rmse = np.mean(rmse_list)

    # Create a DataFrame for the current percentage of missing data: one row
    # per iteration followed by a final "Average" row. The average row is part
    # of the construction so no row has to be appended by enlargement later.
    results_percentage_df = pd.DataFrame({
        "Missing Percentage": [missing_percentage] * (num_iterations + 1),
        "Iteration": [*range(1, num_iterations + 1), "Average"],
        "MAE": [*mae_list, average_mae],
        "MSE": [*mse_list, average_mse],
        "RMSE": [*rmse_list, average_rmse],
    })

    # Keep the results for the current percentage of missing data
    results_frames.append(results_percentage_df)

# Combine the per-percentage tables into the overall results DataFrame
results_df = pd.concat(results_frames, ignore_index=True)

# Display the results. display() renders the table itself and returns None,
# so its result must not be passed to print().
print("Results:")
display(results_df)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5

Unnamed: 0,Missing Percentage,Iteration,MAE,MSE,RMSE
0,5,1,0.00113,5.1e-05,0.007127
1,5,2,0.00118,5.4e-05,0.007329
2,5,3,0.001114,4.7e-05,0.006884
3,5,4,0.001077,4.2e-05,0.006518
4,5,5,0.001115,5e-05,0.007076
5,5,Average,0.001123,4.9e-05,0.006987
6,10,1,0.002469,0.000112,0.010597
7,10,2,0.002342,0.000104,0.010189
8,10,3,0.002324,0.0001,0.010004
9,10,4,0.002349,0.000106,0.010281


None
