In [10]:
import warnings

warnings.filterwarnings('ignore')
import pickle
import pandas as pd
import numpy as np

# Data Imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Feature normalisation
from sklearn.preprocessing import StandardScaler

In [11]:
!pip install xlwt




[notice] A new release of pip available: 22.2.2 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### 1. Read the model using pickle

In [12]:
# Load the previously trained random-forest regressor from its pickle file.
# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# only load a model file that comes from a trusted source.
with open("RandomForestRegressor.pickle", "rb") as model_file:
    random_forest_regressor = pickle.load(model_file)

### 2. Read the testing dataset

In [13]:
# Load the testing spreadsheet; only the worksheet named "Sheet1" is read.
test_df = pd.read_excel(
    io="./Dataset/testDatasetExample.xls",
    sheet_name="Sheet1",
)

### 3. Remove rows containing at least 3 missing values

In [14]:
# The value 999 marks a missing entry in this dataset.
# Count the marker cells of every row (across all columns) in one vectorised
# pass instead of rebuilding each row as a Python list.
missing_per_row = test_df.isin([999]).sum(axis=1)

# Collect the index *labels* of rows with three or more missing entries.
# DataFrame.drop is label-based, so labels (rather than the positions that
# np.where returns) keep this correct even when the index is not 0..n-1.
drop_index = missing_per_row.index[missing_per_row >= 3].tolist()

test_df = test_df.drop(index=drop_index)

### 4. Save the 'ID' column and the features dataframe

In [15]:
# Separate the identifier from the rest of the table in one step: pop()
# returns the 'ID' column as a Series and removes it from test_df in place,
# so test_df no longer contains 'ID' afterwards.
ID_data = test_df.pop('ID')

### 5. Data imputation using Multivariate Imputer

In [16]:
# Boolean mask of the cells that hold the 999 missing-value marker.
missing_mask = test_df.isin([999]).to_numpy()
# (row, col) positions of those cells, kept in the same 2 x n array layout
# this cell exposed before.
missing_values_index = np.array(np.nonzero(missing_mask))

# Turn the marker into real NaNs so the imputer recognises them.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
new_df = test_df.replace(999, np.nan)
column_names = new_df.columns

# NOTE(review): the imputer is fitted on the testing data itself; if training
# used its own fitted imputer, confirm that refitting here is intended.
# Assumes no column is entirely missing -- IterativeImputer drops such columns
# by default, and the shapes below would then no longer line up.
multivariate_imp = IterativeImputer(random_state=42)
multi_imputed_array = multivariate_imp.fit_transform(new_df)

# Round only the cells that were originally missing, leaving observed values
# untouched; the mask assignment replaces the per-cell Python loop.
multi_imputed_array[missing_mask] = np.round(multi_imputed_array[missing_mask])

# Convert back to a DataFrame with the original column names.
multi_imputed_df = pd.DataFrame(multi_imputed_array, columns=column_names)

### 6. Feature Selection on the testing dataset

In [17]:
# Columns passed on to scaling and prediction, one per line.
# NOTE(review): the order presumably has to match the feature order used when
# the model was trained -- do not re-sort without confirming.
feature_selection_list = [
    'original_shape_Elongation',
    'original_firstorder_Kurtosis',
    'original_firstorder_Maximum',
    'original_shape_Maximum2DDiameterRow',
    'original_gldm_SmallDependenceHighGrayLevelEmphasis',
    'original_firstorder_Range',
    'original_glszm_SizeZoneNonUniformityNormalized',
    'original_shape_Maximum3DDiameter',
    'original_glcm_MaximumProbability',
    'original_firstorder_MeanAbsoluteDeviation',
    'TumourStage',
    'original_glszm_ZoneEntropy',
    'original_firstorder_InterquartileRange',
    'original_firstorder_Variance',
    'Age',
    'original_firstorder_Skewness',
    'original_firstorder_90Percentile',
    'original_shape_MajorAxisLength',
    'original_firstorder_RobustMeanAbsoluteDeviation',
    'original_shape_Maximum2DDiameterSlice',
]

# Keep every row, restricted to the selected columns in the listed order.
feature_selected_df = multi_imputed_df.loc[:, feature_selection_list]

### 7. Feature Normalization

In [18]:
# Standardise the selected features: fit the scaler, then apply it.
# NOTE(review): the scaler is fitted on the testing data here. If the model
# was trained on features scaled with training-set statistics, that fitted
# scaler should be reused instead -- confirm against the training code.
# The result keeps the name Xs_train because the prediction cell reads it,
# even though it holds testing features.
scaler = StandardScaler().fit(feature_selected_df)
Xs_train = scaler.transform(feature_selected_df)

### 8. Predict/Score the testing set

In [19]:
# Score the scaled feature matrix with the unpickled model. Despite its name,
# Xs_train holds the scaled *testing* features produced by the scaling cell.
testing_predictions = random_forest_regressor.predict(Xs_train)

### 9. Save the 'ID' & 'Prediction' column into a spreadsheet file (.xls)

In [20]:
# Pair each retained ID with its prediction; the prediction array is matched
# to the IDs by position.
output_columns = {
    'ID': ID_data,
    'RelapseFreeSurvival (outcome)': testing_predictions,
}
target_df = pd.DataFrame(output_columns)

# Write the result without the DataFrame index column.
# NOTE(review): writing the legacy .xls format depends on the xlwt engine,
# which newer pandas releases no longer support -- confirm the pandas version.
target_df.to_excel(
    excel_writer='./Dataset/testDataset.xls',
    sheet_name='Sheet 1',
    index=False,
)