In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Read the training spreadsheet and discard the 'ID' column in a single
# expression, so `all_df` never exists in a half-prepared state.
all_df = pd.read_excel('trainDataset.xls').drop(columns='ID')

---
## 1. Dealing with Missing Values

#### Calculate the number of cells containing a missing value (encoded as 999)

In [3]:
# Count every cell, across all columns, that holds the missing-value
# sentinel 999. The element-wise comparison yields a boolean frame whose
# True entries are summed in one vectorised pass.
total_missing_cell = int((all_df == 999).to_numpy().sum())

print(f"Number of total missing value: {total_missing_cell}")

Number of total missing value: 17


#### Construct a list to keep track of the missing values' locations

In [4]:
# missing_data[j] is the list of row labels whose value in the j-th column
# (in DataFrame column order) equals the 999 sentinel; columns without any
# missing value contribute an empty list.
missing_data = [
    all_df.index[all_df[column] == 999].tolist()
    for column in all_df.columns
]

### 1.1 KNN Imputation
Imputation for completing missing values using k-Nearest Neighbors. Each sample’s missing values are imputed using the mean value from `n_neighbors` nearest neighbors found in the training set. Two samples are close if the features that neither is missing are close.
<br><br>
Reference: https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html?highlight=impute

In [5]:
from sklearn.impute import KNNImputer

# Treat 999 as the missing-value marker and fill each such cell from the
# 5 nearest neighbours. The result is a NumPy array, not a DataFrame.
KNN_imputer = KNNImputer(missing_values=999, n_neighbors=5)
KNN_df = KNN_imputer.fit(all_df).transform(all_df)

In [6]:
# Round the imputed value to its closest category.
# missing_data[col_pos] lists the rows that were missing in column col_pos,
# so every (row, col_pos) pair addresses exactly one imputed cell of the
# array. Only those cells are rounded; observed values are left untouched.
for col_pos, missing_rows in enumerate(missing_data):
    for row in missing_rows:
        # Single-step [row, col] indexing instead of chained a[row][col].
        KNN_df[row, col_pos] = round(KNN_df[row, col_pos])
        print(KNN_df[row, col_pos])

0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
2.0
2.0
3.0
2.0
1.0
1.0
1.0
1.0
1.0


### 1.2 Multivariate imputer (Iterative imputer)
Multivariate imputer that estimates each feature from all the others. A strategy for imputing missing values by modeling each feature with missing values as a function of other features in a round-robin fashion.
<br> <br>
Reference: https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html?highlight=impute#sklearn.impute.IterativeImputer

In [7]:
# IterativeImputer is still experimental: this enabling import has to be
# executed before the class can be imported from sklearn.impute.
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Treat 999 as the missing-value marker; random_state is fixed so the
# imputation is reproducible. Fit on the full frame, then transform that
# same frame. The result is a NumPy array, not a DataFrame.
mean_imputer = IterativeImputer(missing_values=999, random_state=0)
mean_df = mean_imputer.fit(all_df).transform(all_df)

In [8]:
# Round the imputed value to its closest category.
# missing_data[col_pos] lists the rows that were missing in column col_pos,
# so every (row, col_pos) pair addresses exactly one imputed cell of the
# array. Only those cells are rounded; observed values are left untouched.
for col_pos, missing_rows in enumerate(missing_data):
    for row in missing_rows:
        # Single-step [row, col] indexing instead of chained a[row][col].
        mean_df[row, col_pos] = round(mean_df[row, col_pos])
        print(mean_df[row, col_pos])

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
2.0
2.0
3.0
2.0
2.0
1.0
1.0
1.0
0.0


In [11]:
# The imputers returned bare arrays, so reuse the original column labels
# when wrapping the results back into DataFrames.
column_headers = all_df.columns.tolist()

KNN_df = pd.DataFrame(KNN_df, columns=column_headers)
mean_df = pd.DataFrame(mean_df, columns=column_headers)

# Write both imputed datasets to Excel for observation purposes.
KNN_df.to_excel('knn_df.xlsx', sheet_name='Sheet 1', index=False)
mean_df.to_excel('mean_df.xlsx', sheet_name='Sheet 1', index=False)

---
## 2. Feature normalisation