In [1]:
import warnings
warnings.filterwarnings('ignore')
import pickle
import pandas as pd
import numpy as np

# Data Imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Feature normalisation
from sklearn.preprocessing import StandardScaler

### 1. Read the model using pickle

In [2]:
# Load the pickled classifier that later cells use for prediction.
# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# this pickle must come from a trusted source.
with open("RandomForestClassifier.pickle", "rb") as model_file:
    random_forest_classifier = pickle.load(model_file)

### 2. Read the testing dataset

In [3]:
# Load the "Sheet1" worksheet of the example testing workbook into a DataFrame.
test_df = pd.read_excel(
    "./Dataset/testDatasetExample.xls",
    sheet_name="Sheet1",
)

### 3. Remove rows containing at least 3 missing values

In [4]:
# Flag every cell that holds the missing-value sentinel (999), then count the
# flagged cells in each row.
missing_values_mask = test_df.isin([999])
missing_values_per_row = missing_values_mask.sum(axis=1)

# Rows with three or more missing values are discarded. Selecting with a
# boolean mask (instead of passing np.where positions to the label-based
# DataFrame.drop) keeps this correct for any index, not only the default
# RangeIndex, and avoids a Python-level loop over rows.
rows_to_drop = missing_values_per_row >= 3
list_index = test_df.index[rows_to_drop].tolist()  # labels of the dropped rows

# .copy() hands later cells an independent frame that is safe to modify in place.
test_df = test_df.loc[~rows_to_drop].copy()

### 4. Save the 'ID' column and the features dataframe

In [5]:
# Pull the 'ID' column out of the testing dataset: pop() returns the column
# and removes it from test_df in place, so test_df keeps only the other columns.
ID_data = test_df.pop('ID')

### 5. Data Imputation using Multivariate imputer

In [6]:
# Positional (row, column) coordinates of every cell holding the 999
# missing-value sentinel; row 0 of the array is the row positions, row 1 the
# column positions.
missing_values_index = np.array(np.where(test_df.isin([999])))
missing_rows, missing_cols = missing_values_index

# Replace the sentinel with NaN so the imputer treats those cells as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
new_df = test_df.replace(999, np.nan)
column_names = new_df.columns

# NOTE(review): the imputer is fitted on the testing data itself; confirm this
# matches how missing values were imputed when the model was trained.
multivariate_imp = IterativeImputer(random_state=42)
multi_imputed_array = multivariate_imp.fit_transform(new_df)

# By default IterativeImputer discards features that have no observed values,
# which would shift column positions and break the column labels below.
# Fail early with a clear message instead of mis-rounding or mislabelling data.
if multi_imputed_array.shape != new_df.shape:
    raise ValueError(
        "Imputed array shape %s does not match input shape %s; "
        "a column may consist entirely of missing values."
        % (multi_imputed_array.shape, new_df.shape)
    )

# Round only the cells that were imputed (observed values are left untouched).
multi_imputed_array[missing_rows, missing_cols] = np.round(
    multi_imputed_array[missing_rows, missing_cols]
)

# convert to dataFrame
multi_imputed_df = pd.DataFrame(multi_imputed_array, columns=column_names)

### 6. Feature Selection on the Testing dataset

In [7]:
# Clinical features kept as model inputs.
clinical_features = [
    'Age',
    'ER',
    'PgR',
    'HER2',
    'TrippleNegative',
    'ChemoGrade',
    'Proliferation',
    'HistologyType',
    'LNStatus',
    'TumourStage',
]

# Image-based features kept as model inputs.
image_based_features = [
    'original_glcm_Contrast',
    'original_glcm_Idn',
    'original_glcm_Idmn',
    'original_glcm_Autocorrelation',
    'original_glcm_Id',
    'original_glcm_DifferenceAverage',
    'original_ngtdm_Contrast',
    'original_glcm_InverseVariance',
    'original_glcm_JointEnergy',
    'original_glcm_SumAverage',
]

# Final column order: clinical features first, then image-based features.
feature_selection_list = clinical_features + image_based_features

# Keep only the selected columns of the imputed data, in that order.
feature_selected_df = multi_imputed_df[feature_selection_list]

### 7. Normalise the features

In [8]:
# Standardise the selected features with a StandardScaler using its defaults.
# NOTE(review): the scaler is fitted on the testing data here, so the scaling
# statistics come from this dataset rather than from the data the model was
# trained on. Confirm this is intended, or reuse the scaler fitted at training.
scaler = StandardScaler()
scaler.fit(feature_selected_df)
Xs_predict = scaler.transform(feature_selected_df)

### 8. Predict/Score the testing set

In [9]:
# Run the classifier loaded from the pickle file on the scaled feature matrix;
# the result holds one prediction per row of Xs_predict.
testing_predictions = random_forest_classifier.predict(Xs_predict)

### 9. Save the 'ID' and prediction columns to a spreadsheet file (.xls)

In [10]:
# Pair each retained ID with the prediction made for its row.
prediction_columns = {
    'ID': ID_data,
    'pCR (outcome)': testing_predictions,
}
target_df = pd.DataFrame(prediction_columns)

# Write the two columns to the output workbook, omitting the DataFrame index.
# NOTE(review): writing the legacy .xls format needs an engine that recent
# pandas releases may no longer provide; confirm it works in the target environment.
target_df.to_excel(
    './Dataset/testDataset.xls',
    sheet_name='Sheet 1',
    index=False,
)