## Pathological Complete Response (pCR) Classification Model

Kieran - Michael

In [None]:
import pandas as pd 
import numpy as np
import joblib


class Classifier:
    """Apply saved preprocessing artefacts and a saved model to a new dataset.

    The pipeline (see ``run``) reads an Excel sheet, imputes missing values,
    limits outliers, scales the continuous columns, one-hot encodes the
    discrete columns, keeps the saved feature subset, applies the saved PCA
    and writes the predictions to ``Classification_prediction.csv``.
    """

    # Values whose |z-score| exceeds this limit are treated as outliers.
    Z_SCORE_LIMIT = 3

    # Placeholder value that stage one converts to NaN before imputing.
    MISSING_VALUE_MARKER = 999

    # One-hot columns that bypass the PCA transform in stage six.
    PCA_PASSTHROUGH_COLUMNS = ['HER2_0.0', 'Gene_0.0', 'Gene_1.0',
                               'HER2_1.0', 'ER_0', 'ER_1']

    def __init__(self, data: str):
        """Load the dataset and the saved scaler, feature list, PCA and model.

        Args:
            data: Path of the Excel file to classify. It must contain an
                ``ID`` column.
        """
        self.data = pd.read_excel(data)
        # Kept aside so predictions can be paired with their IDs after the
        # column is dropped from the feature frame in stage one.
        self.ID = self.data['ID']

        # NOTE(review): joblib.load and np.load(allow_pickle=True) unpickle
        # arbitrary objects; only load artefacts from a trusted source.
        self.scaler = joblib.load('classification_scaler.save')
        self.selected_features = np.load('classification_features.npy', allow_pickle=True)
        self.pca = joblib.load('classification_pca.pkl')
        self.model = joblib.load('bestModelClassifer.pkl')

        self.discrete_features = ['ER', 'PgR', 'HER2', 'TrippleNegative',
                                  'ChemoGrade', 'Proliferation', 'HistologyType',
                                  'LNStatus', 'TumourStage', 'Gene']

    def set_outliers_to_z_score_limit(self, series: pd.Series) -> pd.Series:
        """Replace values beyond +/- ``Z_SCORE_LIMIT`` standard deviations.

        Values with a z-score above the limit become ``mean + limit * std``;
        values below the negative limit become ``mean - limit * std``. The
        mean and standard deviation are those of ``series`` itself.

        Args:
            series: Numeric column to process. It is not modified.

        Returns:
            ``series`` itself when it has no outliers (so its dtype is
            preserved), otherwise a float copy with the outliers replaced.
        """
        mean = series.mean()
        std = series.std()
        z_scores = (series - mean) / std

        too_high = z_scores > self.Z_SCORE_LIMIT
        too_low = z_scores < -self.Z_SCORE_LIMIT

        # Returning the input untouched keeps integer columns integer, which
        # matters later because get_dummies derives column names from values.
        if not (too_high.any() or too_low.any()):
            return series

        # The replacement limits are generally non-integral, so work on a
        # float copy instead of writing floats into the caller's series.
        limited = series.astype(float)
        limited[too_high] = (self.Z_SCORE_LIMIT * std) + mean
        limited[too_low] = (-self.Z_SCORE_LIMIT * std) + mean
        return limited

    def stage_one_clean_data_of_missing_values(self) -> None:
        """Drop ``ID``, turn the missing-value marker into NaN and impute.

        Each column that contains NaN is filled with that column's median.
        """
        self.data = self.data.drop(columns='ID')
        self.data = self.data.replace(self.MISSING_VALUE_MARKER, np.nan)

        for column in self.data.columns:
            values = self.data[column]
            if values.isna().any():
                # fillna returns a new Series; it must be assigned back for
                # the imputation to take effect.
                self.data[column] = values.fillna(values.median())

    def stage_two_check_for_outliers(self) -> None:
        """Limit outliers in every column and reset the row index."""
        # Assign each processed column back explicitly rather than relying on
        # in-place mutation inside DataFrame.apply, which pandas does not
        # support.
        for column in self.data.columns:
            self.data[column] = self.set_outliers_to_z_score_limit(self.data[column])
        self.data = self.data.reset_index(drop=True)

    def stage_three_scale(self) -> None:
        """Scale the non-discrete columns with the saved scaler.

        The discrete columns are left unscaled and placed first in the
        recombined frame.
        """
        continuous_data = self.data.drop(columns=self.discrete_features)

        # Only transform: re-fitting here would replace the statistics stored
        # in the loaded scaler with statistics of the data being classified.
        # NOTE(review): assumes the saved scaler was fitted on these same
        # columns - confirm against the training notebook.
        continuous_scaled = self.scaler.transform(continuous_data)

        # Recombine data
        continuous_scaled_df = pd.DataFrame(continuous_scaled, columns=continuous_data.columns)
        discrete_data = self.data[self.discrete_features].reset_index(drop=True)
        self.data = pd.concat([discrete_data, continuous_scaled_df], axis=1)

    def stage_four_one_hot_encode(self) -> None:
        """One-hot encode the discrete columns."""
        self.data = pd.get_dummies(self.data, columns=self.discrete_features)
        # Presumably aligns the HER2 dummy names with the float-style names
        # in the saved feature list - TODO confirm.
        self.data = self.data.rename(columns={'HER2_0': 'HER2_0.0', 'HER2_1': 'HER2_1.0'})

    def stage_five_feature_selections(self) -> None:
        """Keep only the columns named in the saved feature list."""
        self.data = self.data[self.selected_features]

    def stage_six_pca(self) -> None:
        """Apply the saved PCA to every column except the pass-through ones.

        The pass-through one-hot columns come first in the result, followed
        by the PCA components; all column labels are converted to strings.
        """
        passthrough = list(self.PCA_PASSTHROUGH_COLUMNS)
        continuous_data = self.data.drop(columns=passthrough)
        discrete_data = self.data[passthrough].reset_index(drop=True)

        components = pd.DataFrame(self.pca.transform(continuous_data))
        combined = pd.concat([discrete_data, components], axis=1)
        # The component columns are labelled with integers; convert every
        # label to str so the frame has uniformly typed column names.
        combined.columns = combined.columns.astype(str)
        self.data = combined

    def predict(self) -> None:
        """Predict with the saved model and write ID/prediction pairs to CSV.

        The output file is ``Classification_prediction.csv`` with the columns
        ``ID`` and ``Prediction pCR`` (plus the default index column).
        """
        pred = self.model.predict(self.data)
        pred_series = pd.Series(pred, name='Prediction pCR')
        id_pred_df = pd.concat([self.ID.reset_index(drop=True), pred_series], axis=1)
        id_pred_df.to_csv('Classification_prediction.csv')

    def run(self) -> None:
        """Execute every pipeline stage in order, then write the predictions."""
        self.stage_one_clean_data_of_missing_values()
        self.stage_two_check_for_outliers()
        self.stage_three_scale()
        self.stage_four_one_hot_encode()
        self.stage_five_feature_selections()
        self.stage_six_pca()
        self.predict()
        
# Run the whole prediction pipeline on the final test workbook; the
# predictions are written to disk by Classifier.predict().
test_dataset_path = 'FinalTestDataset2024.xls'
classifer = Classifier(test_dataset_path)
classifer.run()
        
        
        

(133, 131)
(133, 52)


  series[index] = replacement_upper_limit
