# Test model for classification

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
import scipy
from scipy.stats import pearsonr
import sklearn
from sklearn import datasets, linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import joblib

In [2]:
# Read the test dataset (to be predicted) and the training dataset
# (used in a later cell as the reference for imputing missing values).
tf = pd.read_excel("TestDatasetExample.xls")  # Change the file name here.
df = pd.read_excel("TrainDataset2023.xls")

# Only the last expression of a cell is displayed, so a preceding bare
# `tf.head()` would be silently discarded; show the summary statistics only.
tf.describe()

Unnamed: 0,Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,LNStatus,TumourStage,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
count,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
mean,58.371891,0.333333,0.333333,0.333333,0.666667,2.333333,2.333333,333.666667,0.333333,3.0,...,0.277656,0.277655,3.202574,0.003907,2649985.0,126.097192,0.004985,0.089091,0.00913,0.004266
std,1.563767,0.57735,0.57735,0.57735,0.57735,0.57735,1.154701,576.195569,0.57735,1.0,...,0.072521,0.07252,0.509325,0.003051,2162805.0,99.748099,0.005479,0.029038,0.005321,0.004599
min,56.881588,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,2.0,...,0.194591,0.194591,2.846439,0.001281,173658.5,23.967478,0.001334,0.05589,0.003163,0.001194
25%,57.557837,0.0,0.0,0.0,0.5,2.0,2.0,1.0,0.0,2.5,...,0.252295,0.252294,2.910878,0.002233,1890740.0,77.506009,0.001834,0.078759,0.007003,0.001622
50%,58.234086,0.0,0.0,0.0,1.0,2.0,3.0,1.0,0.0,3.0,...,0.309999,0.309996,2.975317,0.003185,3607821.0,131.044541,0.002335,0.101628,0.010844,0.002051
75%,59.117043,0.5,0.5,0.5,1.0,2.5,3.0,500.0,0.5,3.5,...,0.319188,0.319187,3.380641,0.005219,3888148.0,177.162048,0.00681,0.105691,0.012113,0.005802
max,60.0,1.0,1.0,1.0,1.0,3.0,3.0,999.0,1.0,4.0,...,0.328377,0.328377,3.785966,0.007253,4168474.0,223.279556,0.011285,0.109755,0.013383,0.009553


In [3]:
# Pull the ID column out of the test frame in a single step: the returned
# Series is kept for labelling the output, and the column is removed from
# `tf` in place so it is not part of the data passed on for prediction.
IDs = tf.pop('ID')

In [4]:
# Impute missing values in the test set with statistics from the train set.
# The value 999 marks a missing entry and is converted to NaN first.
MISSING_SENTINEL = 999

# Columns filled with the training-set mode; every other column gets the
# training-set median.
MODE_FILLED_COLUMNS = ('PgR', 'HER2', 'LNStatus', 'TrippleNegative', 'ChemoGrade')

tf = tf.replace(MISSING_SENTINEL, np.nan)

# NOTE(review): assumes the training file uses the same 999 sentinel; masking
# it keeps placeholder values out of the mode/median. If the training data
# contains no 999 this is a no-op. `df` itself is left untouched.
train_reference = df.replace(MISSING_SENTINEL, np.nan)

for col in tf.columns:
    if not tf[col].isnull().any():
        continue
    print(col, 'has missing values')
    # Membership test, not `col == 'PgR' or 'HER2' or ...`: the chained form
    # is always truthy (a non-empty string is truthy), which made the median
    # branch unreachable.
    if col in MODE_FILLED_COLUMNS:
        fill_value = train_reference[col].mode()[0]
    else:
        fill_value = train_reference[col].median()
    tf[col] = tf[col].fillna(fill_value)
tf.describe()

HistologyType has missing values


Unnamed: 0,Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,LNStatus,TumourStage,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
count,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
mean,58.371891,0.333333,0.333333,0.333333,0.666667,2.333333,2.333333,1.0,0.333333,3.0,...,0.277656,0.277655,3.202574,0.003907,2649985.0,126.097192,0.004985,0.089091,0.00913,0.004266
std,1.563767,0.57735,0.57735,0.57735,0.57735,0.57735,1.154701,0.0,0.57735,1.0,...,0.072521,0.07252,0.509325,0.003051,2162805.0,99.748099,0.005479,0.029038,0.005321,0.004599
min,56.881588,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,2.0,...,0.194591,0.194591,2.846439,0.001281,173658.5,23.967478,0.001334,0.05589,0.003163,0.001194
25%,57.557837,0.0,0.0,0.0,0.5,2.0,2.0,1.0,0.0,2.5,...,0.252295,0.252294,2.910878,0.002233,1890740.0,77.506009,0.001834,0.078759,0.007003,0.001622
50%,58.234086,0.0,0.0,0.0,1.0,2.0,3.0,1.0,0.0,3.0,...,0.309999,0.309996,2.975317,0.003185,3607821.0,131.044541,0.002335,0.101628,0.010844,0.002051
75%,59.117043,0.5,0.5,0.5,1.0,2.5,3.0,1.0,0.5,3.5,...,0.319188,0.319187,3.380641,0.005219,3888148.0,177.162048,0.00681,0.105691,0.012113,0.005802
max,60.0,1.0,1.0,1.0,1.0,3.0,3.0,1.0,1.0,4.0,...,0.328377,0.328377,3.785966,0.007253,4168474.0,223.279556,0.011285,0.109755,0.013383,0.009553


In [5]:
# Load the saved power transformer from disk and apply it to the test data.
POWER_TRANSFORMER_PATH = 'PCR_power_transformer.pkl'
pt_loaded = joblib.load(POWER_TRANSFORMER_PATH)
tf_transformed = pt_loaded.transform(tf)

In [6]:
# Load the saved standard scaler from disk and apply it to the output of the
# power-transform step.
STANDARD_SCALER_PATH = 'PCR_standard_scaler.pkl'
scaler_loaded = joblib.load(STANDARD_SCALER_PATH)
scaler_transformed = scaler_loaded.transform(tf_transformed)

In [7]:
# Load the saved feature importances. (pandas is already imported as `pd`
# in the notebook's import cell, so no re-import is needed here.)
importance_df = pd.read_csv('PCR_feature_importances.csv')
importances = importance_df['Importance'].values
feature_names = importance_df['Feature'].values

# Number of top-ranked features to keep; per the original note this must
# match the value used on the training data.
N_SELECTED_FEATURES = 100

# Importance of the N-th highest-ranked feature: features whose importance
# is >= this value are the ones selected downstream.
threshold = np.sort(importances)[-N_SELECTED_FEATURES]

# Load the PCA model from the file
pca = joblib.load('PCR_pca.pkl')

In [8]:
# Keep only the columns whose importance reaches the threshold (the same
# selection rule as for the training data), then project them with the
# loaded PCA model.
selected_columns = importances >= threshold
Xs_test_rf = scaler_transformed[:, selected_columns]
X_pca_test = pca.transform(Xs_test_rf)

In [9]:
# Load the saved classification model from disk (presumably an MLP
# classifier, judging by the file and variable names; the earlier
# "random forest regressor" comment did not match what is loaded).
# joblib is already imported in the notebook's import cell.
mlp_classifier_loaded = joblib.load('PCR_mlp_model.pkl')

In [10]:
# Run the loaded model on the PCA-projected test features and echo the
# raw predictions.
predictions = mlp_classifier_loaded.predict(X_pca_test)
print(predictions)

# Wrap the predictions in a single-column frame named 'pCR'.
predictions_df = pd.DataFrame(data=predictions, columns=['pCR'])

# Put the saved IDs and the predictions side by side (aligned on index).
result_df = pd.concat(objs=[IDs, predictions_df], axis=1)

# Write the ID/prediction table to CSV without the index column.
OUTPUT_PATH = 'OUTPUT_PCR_predictions.csv'
result_df.to_csv(OUTPUT_PATH, index=False)

[1 1 0]
