# Import

In [41]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
from scipy.spatial import distance
from scipy.stats import mode

# Load dataset

In [42]:
excel = pd.read_csv('../Data/case1Data.csv')

In [43]:
y = excel.iloc[:, 0]
df = excel.iloc[:, 1:]

# NaN analysis

In [44]:
# NaNs per column
nan_per_column = df.isna().sum()

# NaNs per sample (row)
nan_per_sample = df.isna().sum(axis=1)

# Summary statistics
column_stats = nan_per_column.describe()  # Summary for columns
sample_stats = nan_per_sample.describe()  # Summary for samples

# Print results
print("Missing Values Per Column - Summary:")
print(column_stats)

print("\nMissing Values Per Sample - Summary:")
print(sample_stats)

Missing Values Per Column - Summary:
count    100.000000
mean      14.890000
std        3.959122
min        6.000000
25%       12.000000
50%       14.000000
75%       18.000000
max       27.000000
dtype: float64

Missing Values Per Sample - Summary:
count    100.000000
mean      14.890000
std        4.019887
min        6.000000
25%       12.000000
50%       15.000000
75%       17.250000
max       32.000000
dtype: float64


### More detailed Analysis on the categorical variables

In [45]:
# Identify categorical columns (last 5 columns)
categorical_cols = df.columns[-5:]

# Find rows where at least one categorical column has NaN
rows_with_nan_cat = df[df[categorical_cols].isna().any(axis=1)]

# NaNs per column
nan_per_column_cat = df[categorical_cols].isna().sum()

# NaNs per sample
nan_per_sample_cat = df[categorical_cols].isna().sum(axis=1)

# Summary statistics
column_stats_cat = nan_per_column_cat.describe()  # Categorical columns
sample_stats_cat = nan_per_sample_cat.describe()  # Per sample (categorical)

# Print results
print("\nMissing Values Per Categorical Column - Summary:")
print(column_stats_cat)

print("\nMissing Values Per Sample (Categorical Data) - Summary:")
print(sample_stats_cat)

print("\nSamples with at least one NaN in categorical variables:")
print(rows_with_nan_cat.iloc[:, 95:])


Missing Values Per Categorical Column - Summary:
count     5.0
mean     22.0
std       0.0
min      22.0
25%      22.0
50%      22.0
75%      22.0
max      22.0
dtype: float64

Missing Values Per Sample (Categorical Data) - Summary:
count    100.000000
mean       1.100000
std        2.081666
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        5.000000
dtype: float64

Samples with at least one NaN in categorical variables:
    C_01  C_02  C_03  C_04  C_05
8    NaN   NaN   NaN   NaN   NaN
10   NaN   NaN   NaN   NaN   NaN
14   NaN   NaN   NaN   NaN   NaN
16   NaN   NaN   NaN   NaN   NaN
21   NaN   NaN   NaN   NaN   NaN
24   NaN   NaN   NaN   NaN   NaN
28   NaN   NaN   NaN   NaN   NaN
30   NaN   NaN   NaN   NaN   NaN
34   NaN   NaN   NaN   NaN   NaN
40   NaN   NaN   NaN   NaN   NaN
57   NaN   NaN   NaN   NaN   NaN
58   NaN   NaN   NaN   NaN   NaN
59   NaN   NaN   NaN   NaN   NaN
62   NaN   NaN   NaN   NaN   NaN
64   NaN   NaN   NaN   NaN   NaN
66   N

# Value imputation using KNN

Separate numerical and categorical variables to do different imputation methods on each one of those

In [46]:
# Identify numerical and categorical columns
numerical_cols = df.columns[:-5]  # First 95 columns are numerical
categorical_cols = df.columns[-5:]  # Last 5 columns are categorical

# Split numerical and categorical data
df_numerical = df[numerical_cols]
df_categorical = df[categorical_cols]

Use _euclidean distance_ for imputing the numerical variables. First idea was to use impute missing values in categorical variables using KNN and _hamming distance_, but the smaples with NaN on the categorical variables have all the five variables as NaN, so it was not possible. For that reason, first we are imputing on the numerical variables, then joining again the data, and then doing a KNN for imputing the rest.

In [52]:
# Impute Numerical Data with Euclidean Distance
imputer_num = KNNImputer(n_neighbors=5, metric='nan_euclidean') 
df_numerical_imputed = pd.DataFrame(imputer_num.fit_transform(df_numerical), columns=numerical_cols)

# Join imputed numerical variables with categorical variables
df_imputed = pd.concat([df_numerical_imputed, df_categorical], axis=1)

Define a function for imputing the mode

In [80]:
def knn_mode_imputation(K, X):
    """
    KNN imputation function for categorical data using mode (most frequent value) instead of mean.
    
    Parameters:
    K : int
        Number of nearest neighbors to consider.
    X : pd.DataFrame
        Input data with missing values represented as NaN.
    
    Returns:
    pd.DataFrame
        DataFrame with imputed values.
    """
    X = X.copy()  # Avoid modifying the original DataFrame
    n = len(X)
    
    for i in range(n):
        for j in range(95,100):  # Iterate over columns (features)
            if pd.isna(X.iloc[i, j]):  # If value is NaN, we need to impute it
                print("NaN value at positions:", i,",", j)
                distances = []
                valid_indices = []
                
                # Compute distances between the current row and all other rows (excluding NaN columns)
                for k in range(n):
                    if k != i and not X.iloc[k].isna().any():  # Skip itself and rows with NaNs
                        dist = distance.euclidean(X.iloc[i].fillna(0), X.iloc[k].fillna(0))
                        distances.append((dist, k))
                        valid_indices.append(k)
                
                if len(valid_indices) > 0:
                    # Get the K nearest neighbors
                    distances.sort()
                    nearest_indices = [idx for _, idx in distances[:K]]
                    # Find the most frequent value (mode) among the nearest neighbors
                    neighbor_values = X.iloc[nearest_indices, j].dropna()
                    print("nearest indices values:", X.iloc[nearest_indices, j])
                    if not neighbor_values.empty:
                        most_frequent = mode(neighbor_values)[0]
                        X.iloc[i, j] = most_frequent
                        continue
                
                # If all neighbors are NaN, use the mode of the whole column
                col_mode = X.iloc[:, j].dropna().mode()
                if not col_mode.empty:
                    X.iloc[i, j] = col_mode[0]
                print("imputed value is:", X.iloc[i, j])
    
    return X

df_full_imputed = knn_mode_imputation(K=5, X=df_imputed)

NaN value at positions: 8 , 95
nearest indices values: 32    71.0
46    71.0
48    73.0
3     71.0
17    72.0
Name: C_01, dtype: float64
NaN value at positions: 8 , 96
nearest indices values: 48    72.0
32    72.0
46    72.0
75    72.0
3     72.0
Name: C_02, dtype: float64
NaN value at positions: 8 , 97
nearest indices values: 48    71.0
32    72.0
46    71.0
75    72.0
3     73.0
Name: C_03, dtype: float64
NaN value at positions: 8 , 98
nearest indices values: 32    72.0
48    72.0
17    73.0
39    71.0
3     71.0
Name: C_04, dtype: float64
NaN value at positions: 8 , 99
nearest indices values: 32    72.0
46    71.0
17    72.0
48    71.0
13    73.0
Name: C_05, dtype: float64
NaN value at positions: 10 , 95
nearest indices values: 8     71.0
32    71.0
48    73.0
46    71.0
77    71.0
Name: C_01, dtype: float64
NaN value at positions: 10 , 96
nearest indices values: 48    72.0
8     72.0
75    72.0
32    72.0
46    72.0
Name: C_02, dtype: float64
NaN value at positions: 10 , 97
nearest

Convert to One-Hot-Enconding the categorical variables

In [83]:
df_numeric = df_full_imputed.iloc[:, :95]  # Extract numerical variables
df_categorical = df_full_imputed.iloc[:, 95:]  # Extract categorical variables

# One-Hot Encoding
df_categorical_encoded = pd.get_dummies(df_categorical, drop_first=True)

# Combine on a final df
df_final = pd.concat([df_numeric, df_categorical_encoded], axis=1)

# save the final df
df_final.to_csv('../Data/preprocessed_dataframes/data_5KNN.csv', index=False)