## Load training data

In [1]:
import pandas as pd
file_path = '../data/training_data.csv'

# keep_default_na=False is essential here: by default pandas parses BOTH the
# literal "NA" and empty fields as NaN, which would merge the two kinds of
# missing value before any of the handling strategies can tell them apart.
# With it, "NA" and "" arrive as plain strings.
data = pd.read_csv(file_path, sep=';', keep_default_na=False)

# Mark the "NA" kind of missing value as a real missing value; empty strings
# stay as "" so they can be treated separately later on.
data = data.replace("NA", pd.NA)

# Turn decimal commas into decimal points in string cells. Strings that are
# empty or consist only of commas are left untouched, as are non-string cells.
data = data.map(lambda x: x.replace(',', '.') if isinstance(x, str) and x.replace(',', '') else x)

# Keep the non-feature columns aside before removing them from the feature frame.
group = data['Group']
class_col = data['Class']
perform = data['Perform']

data = data.drop(columns=['Group', 'Class', 'Perform'])

### Helper functions

In [2]:
def convert_columns_to_numeric(data):
    """Coerce every column of ``data`` to a numeric dtype, in place.

    Values that cannot be parsed as numbers become NaN. The frame that was
    passed in is modified and also returned.
    """
    for name in data.columns:
        numeric_values = pd.to_numeric(data[name], errors='coerce')
        data[name] = numeric_values
    return data

def replace_empty_with_NA(data):
    """Return a copy of ``data`` in which every empty-string cell is ``pd.NA``."""
    return data.replace(to_replace='', value=pd.NA)

Please note that the data contains two distinct types of missing values with different semantics: one corresponds to non-available/missing information, and the other can be interpreted as non-applicable. In the data, one type is marked by the string "NA" and the other is just an empty string (there is no value).

## Handling of missing Values

| Nr. | NA                  | ""                  |
|-----|---------------------|---------------------|
| 1   | `NA -> delete`      | `"" -> delete`      |
| 2   | `NA -> delete`      | `"" -> = 0`         |
| 3   | `NA -> delete`      | `"" -> mean row`    |
| 4   | `NA -> delete`      | `"" -> median row`  |
| 5   | `NA -> delete`      | `"" -> mean column` |
| 6   | `NA -> delete`      | `"" -> median column`|
| 7   | `NA -> delete`      | `"" -> KNN 3`       |
| 8   | `NA -> = 0`         | `"" -> delete`      |
| 9   | `NA -> = 0`         | `"" -> = 0`         |
| 10  | `NA -> = 0`         | `"" -> mean row`    |
| 11  | `NA -> = 0`         | `"" -> median row`  |
| 12  | `NA -> = 0`         | `"" -> mean column` |
| 13  | `NA -> = 0`         | `"" -> median column`|
| 14  | `NA -> = 0`         | `"" -> KNN 3`       |
| 15  | `NA -> mean row`    | `"" -> delete`      |
| 16  | `NA -> mean row`    | `"" -> = 0`         |
| 17  | `NA -> mean row`    | `"" -> mean row`    |
| 22  | `NA -> median row`  | `"" -> delete`      |
| 23  | `NA -> median row`  | `"" -> = 0`         |
| 25  | `NA -> median row`  | `"" -> median row`  |
| 29  | `NA -> mean column` | `"" -> delete`      |
| 30  | `NA -> mean column` | `"" -> = 0`         |
| 33  | `NA -> mean column` | `"" -> mean column` |
| 36  | `NA -> median column`| `"" -> delete`     |
| 37  | `NA -> median column`| `"" -> = 0`        |
| 41  | `NA -> median column`| `"" -> median column`|
| 43  | `NA -> KNN 3`       | `"" -> delete`      |
| 44  | `NA -> KNN 3`       | `"" -> = 0`         |
| 49  | `NA -> KNN 3`       | `"" -> KNN 3`       |


In [3]:
from sklearn.impute import KNNImputer

def NA_delete(data):
    """Return ``data`` without the rows that contain at least one missing value."""
    return data.dropna()

def empty_delete(data):
    """Turn empty strings into NaN, then drop every row that has a missing value.

    Note that rows which were already missing a value (NaN/NA) before the
    replacement are dropped as well.
    """
    without_empty_strings = data.replace("", float('nan'))
    return without_empty_strings.dropna()

def NA_zero(data):
    """Return a copy of ``data`` with every missing value replaced by 0."""
    return data.fillna(0)

def empty_zero(data):
    """Return a copy of ``data`` with every empty-string cell replaced by 0."""
    return data.replace("", 0)

def NA_mean_row(data):
    """Replace each missing value with the mean of the other values in its row.

    The mean is taken across the columns of the same row (``axis=1``), which
    is what distinguishes this strategy from ``NA_mean_column``. ``data`` must
    already be numeric. A row consisting only of missing values has no mean
    and therefore stays missing.

    Returns a new DataFrame; the input frame is not modified.
    """
    row_means = data.mean(axis=1)
    # fillna with a Series aligns on the index, so every gap in a column
    # receives the mean of the row it sits in.
    return data.apply(lambda column: column.fillna(row_means))

def empty_mean_row(data):
    """Replace empty cells with the mean of the other values in their row.

    Empty strings and cells that are already missing (NaN/NA) are both treated
    as empty, so the function works whether or not the caller converted ``""``
    to NA beforehand. All columns are coerced to numeric first (values that
    cannot be parsed become missing as well), because a mean cannot be
    computed while ``""`` strings are still present. A row with no usable
    value stays missing.

    Returns a new DataFrame; the input frame is not modified.
    """
    # pd.to_numeric maps "" (and anything unparseable) to NaN.
    numeric = data.apply(pd.to_numeric, errors='coerce')
    row_means = numeric.mean(axis=1)
    # fillna with a Series aligns on the index -> row-wise fill values.
    return numeric.apply(lambda column: column.fillna(row_means))

def NA_median_row(data):
    """Replace each missing value with the median of the other values in its row.

    The median is taken across the columns of the same row (``axis=1``), which
    is what distinguishes this strategy from ``NA_median_column``. ``data``
    must already be numeric. A row consisting only of missing values has no
    median and therefore stays missing.

    Returns a new DataFrame; the input frame is not modified.
    """
    row_medians = data.median(axis=1)
    # fillna with a Series aligns on the index, so every gap in a column
    # receives the median of the row it sits in.
    return data.apply(lambda column: column.fillna(row_medians))

def empty_median_row(data):
    """Replace empty cells with the median of the other values in their row.

    Empty strings and cells that are already missing (NaN/NA) are both treated
    as empty, so the function works whether or not the caller converted ``""``
    to NA beforehand. All columns are coerced to numeric first (values that
    cannot be parsed become missing as well), because a median cannot be
    computed while ``""`` strings are still present. A row with no usable
    value stays missing.

    Returns a new DataFrame; the input frame is not modified.
    """
    # pd.to_numeric maps "" (and anything unparseable) to NaN.
    numeric = data.apply(pd.to_numeric, errors='coerce')
    row_medians = numeric.median(axis=1)
    # fillna with a Series aligns on the index -> row-wise fill values.
    return numeric.apply(lambda column: column.fillna(row_medians))

def NA_mean_column(data):
    """Fill missing values with the mean of their column, in place.

    The frame that was passed in is modified and also returned.
    """
    column_means = data.mean()
    data.fillna(value=column_means, inplace=True)
    return data

def empty_mean_column(data):
    """Replace empty cells with the mean of their column.

    Empty strings and cells that are already missing (NaN/NA) are both treated
    as empty, so the function works whether or not the caller converted ``""``
    to NA beforehand. All columns are coerced to numeric first (values that
    cannot be parsed become missing as well), because a column mean cannot be
    computed while ``""`` strings are still present. A column with no usable
    value stays missing.

    Returns a new DataFrame; the input frame is not modified.
    """
    # pd.to_numeric maps "" (and anything unparseable) to NaN.
    numeric = data.apply(pd.to_numeric, errors='coerce')
    return numeric.fillna(numeric.mean())

def NA_median_column(data):
    """Fill missing values with the median of their column, in place.

    The frame that was passed in is modified and also returned.
    """
    column_medians = data.median()
    data.fillna(value=column_medians, inplace=True)
    return data

def empty_median_column(data):
    """Replace empty cells with the median of their column.

    Empty strings and cells that are already missing (NaN/NA) are both treated
    as empty, so the function works whether or not the caller converted ``""``
    to NA beforehand. All columns are coerced to numeric first (values that
    cannot be parsed become missing as well), because a column median cannot
    be computed while ``""`` strings are still present. A column with no
    usable value stays missing.

    Returns a new DataFrame; the input frame is not modified.
    """
    # pd.to_numeric maps "" (and anything unparseable) to NaN.
    numeric = data.apply(pd.to_numeric, errors='coerce')
    return numeric.fillna(numeric.median())

def NA_KNN3(data):
    """Impute missing values with a 3-nearest-neighbours imputer, in place.

    The imputed values are written back into ``data``, which is also returned.
    """
    imputer = KNNImputer(n_neighbors=3)
    data.loc[:, :] = imputer.fit_transform(data)
    return data

def empty_KNN3(data):
    """Impute empty-string cells with a 3-nearest-neighbours imputer, in place.

    Empty strings are first turned into NaN so the imputer sees them as
    missing; cells that were already NaN are imputed too. The imputed values
    are written back into ``data``, which is also returned.
    """
    nan = float('nan')
    data.replace("", nan, inplace=True)
    imputer = KNNImputer(n_neighbors=3)
    data.loc[:, :] = imputer.fit_transform(data)
    return data


In [4]:
import os

def _needs_numeric_input(func):
    """Return True if ``func`` is a mean-, median- or KNN-based strategy (judged by its name)."""
    return any(keyword in func.__name__ for keyword in ('mean', 'median', 'KNN'))


def process_and_save_data(data):
    """Apply every allowed NA / empty-string strategy pair and save each result as CSV.

    For each pair a fresh copy of ``data`` is processed and written to
    ``preprocess_missing_values/training_data-<NA strategy>-<empty strategy>.csv``
    (relative to the current working directory). ``data`` itself is not modified.

    Order of application:
      * Both strategies statistical (mean/median/KNN): empty strings are turned
        into NA, the frame is made numeric, and only the NA strategy is run,
        since it then covers both kinds of missing value.
      * Otherwise the non-statistical strategy runs first. For a non-statistical
        pair the NA strategy runs first.
    """
    # Create the output directory if it does not exist yet
    directory = "preprocess_missing_values"
    os.makedirs(directory, exist_ok=True)

    # Allowed combinations from the markdown table
    allowed_combinations = [
        (NA_delete, empty_delete), (NA_delete, empty_zero), (NA_delete, empty_mean_row),
        (NA_delete, empty_median_row), (NA_delete, empty_mean_column), (NA_delete, empty_median_column),
        (NA_delete, empty_KNN3), (NA_zero, empty_delete), (NA_zero, empty_zero),
        (NA_zero, empty_mean_row), (NA_zero, empty_median_row), (NA_zero, empty_mean_column),
        (NA_zero, empty_median_column), (NA_zero, empty_KNN3), (NA_mean_row, empty_delete),
        (NA_mean_row, empty_zero), (NA_mean_row, empty_mean_row), (NA_median_row, empty_delete),
        (NA_median_row, empty_zero), (NA_median_row, empty_median_row), (NA_mean_column, empty_delete),
        (NA_mean_column, empty_zero), (NA_mean_column, empty_mean_column), (NA_median_column, empty_delete),
        (NA_median_column, empty_zero), (NA_median_column, empty_median_column), (NA_KNN3, empty_delete),
        (NA_KNN3, empty_zero), (NA_KNN3, empty_KNN3)
    ]

    for na_func, empty_func in allowed_combinations:
        # Work on a fresh copy so the combinations cannot influence each other
        processed_data = data.copy()

        is_na_special = _needs_numeric_input(na_func)
        is_empty_special = _needs_numeric_input(empty_func)

        if is_na_special and is_empty_special:
            # Both strategies are statistical: unify "" and NA, then impute once
            processed_data = replace_empty_with_NA(processed_data)
            processed_data = convert_columns_to_numeric(processed_data)
            processed_data = na_func(processed_data)  # Only apply the NA function
        else:
            # The non-statistical strategy goes first
            if is_na_special:
                first_func, second_func = empty_func, na_func
            else:
                first_func, second_func = na_func, empty_func

            processed_data = first_func(processed_data)

            # Only a statistical second step needs "" turned into NA and the
            # frame made numeric. Doing the "" -> NA conversion unconditionally
            # would leave a plain empty-string strategy (e.g. empty_zero) with
            # nothing to find, so the gaps would end up in the output.
            if _needs_numeric_input(second_func):
                processed_data = replace_empty_with_NA(processed_data)
                processed_data = convert_columns_to_numeric(processed_data)

            processed_data = second_func(processed_data)

        # Generate the filename based on the function names
        filename = f"training_data-{na_func.__name__}-{empty_func.__name__}.csv"
        filepath = os.path.join(directory, filename)

        # Save the processed data to a CSV file
        processed_data.to_csv(filepath, index=False)
        print(f"Saved: {filepath}")

# Example call to the function
process_and_save_data(data)


Saved: preprocess_missing_values\training_data-NA_delete-empty_delete.csv
Saved: preprocess_missing_values\training_data-NA_delete-empty_zero.csv
Saved: preprocess_missing_values\training_data-NA_delete-empty_mean_row.csv
Saved: preprocess_missing_values\training_data-NA_delete-empty_median_row.csv
Saved: preprocess_missing_values\training_data-NA_delete-empty_mean_column.csv
Saved: preprocess_missing_values\training_data-NA_delete-empty_median_column.csv
Saved: preprocess_missing_values\training_data-NA_delete-empty_KNN3.csv
Saved: preprocess_missing_values\training_data-NA_zero-empty_delete.csv
Saved: preprocess_missing_values\training_data-NA_zero-empty_zero.csv
Saved: preprocess_missing_values\training_data-NA_zero-empty_mean_row.csv
Saved: preprocess_missing_values\training_data-NA_zero-empty_median_row.csv
Saved: preprocess_missing_values\training_data-NA_zero-empty_mean_column.csv
Saved: preprocess_missing_values\training_data-NA_zero-empty_median_column.csv
Saved: preprocess_mi

## Outlier Trimming

## Class Balancing