In [None]:
import numpy as np
import pandas as pd
from Binary_Logistic_Regression import BinaryLogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

In [None]:
# Read the training and testing CSV files with identical options; the first
# 20 lines of each file are skipped before the header row is parsed.
training_data, testing_data = (
    pd.read_csv(csv_path, skiprows=20)
    for csv_path in ('aps_failure_training_set.csv', 'aps_failure_test_set.csv')
)

In [None]:
class Scania_APS_Failure(BinaryLogisticRegression):
    """Preprocess the APS failure frames and expose them to a binary logistic regression.

    The constructor turns the raw training/testing frames into PCA-projected
    feature matrices. Every statistic the pipeline needs (imputation means,
    decile edges, one-hot columns, standardisation mean/std and the PCA basis)
    is learned from the training frame only and then applied unchanged to the
    testing frame, so both frames live in the same feature space.

    Attributes set by ``__init__``:
        n_comp:   number of principal components kept.
        Q:        projection matrix whose columns are the leading ``n_comp``
                  left singular vectors of the training second-moment matrix.
        training: SMOTE-resampled, projected training rows; last column is 'class'.
        cv:       two thirds of the projected testing rows; last column is 'class'.
        testing:  the remaining third of the projected testing rows; last column is 'class'.
    """

    def __init__(self, training, testing, n_comp, non_na_thresh, LR=10**(-6), ET=10**(-5), random_state=None):
        """Run the whole preprocessing pipeline and initialise the parent model.

        Args:
            training: raw training DataFrame with a 'class' column holding
                'pos'/'neg' and missing feature values written as 'na'.
            testing: raw testing DataFrame with the same layout.
            n_comp: number of principal components to keep.
            non_na_thresh: fraction of training rows that must be non-missing
                for a column to be kept.
            LR: forwarded to the parent constructor as ``LearningRate``.
            ET: forwarded to the parent constructor as ``ErrorTolerance``.
            random_state: optional seed forwarded to SMOTE and to the cv/test
                split. The default ``None`` keeps the unseeded behaviour.
        """
        self.n_comp = n_comp

        # 'na' strings mark missing values; replace() returns new frames, so
        # the caller's data is never modified.
        train_raw = training.replace(to_replace='na', value=np.nan)
        test_raw = testing.replace(to_replace='na', value=np.nan)

        # Keep only columns with enough observed training values and mirror
        # that column selection on the testing frame.
        train_raw = train_raw.dropna(axis=1, thresh=int(non_na_thresh*train_raw.shape[0]))
        test_raw = test_raw[train_raw.columns]

        # Encode labels explicitly ('pos' -> 1, 'neg' -> 0; other values are
        # passed through). Plain arrays avoid any index alignment later on.
        label_codes = {'pos': 1, 'neg': 0}
        training_label = train_raw['class'].map(lambda v: label_codes.get(v, v)).astype(int).to_numpy()
        testing_label = test_raw['class'].map(lambda v: label_codes.get(v, v)).astype(int).to_numpy()

        feature_columns = train_raw.columns.drop('class')

        # Imputation statistics come from the training data only; the testing
        # data is transformed with them rather than refitted.
        imputer = SimpleImputer()
        train_features = pd.DataFrame(imputer.fit_transform(train_raw[feature_columns]), columns=feature_columns)
        test_features = pd.DataFrame(imputer.transform(test_raw[feature_columns]), columns=feature_columns)

        # Decile binning: edges are learned on training and reused for testing
        # so that a given bin code means the same value range in both frames.
        for column in feature_columns:
            _, edges = pd.qcut(x=train_features[column], q=10, labels=False, retbins=True, duplicates='drop')
            inner_edges = np.asarray(edges)[1:-1]
            # Counting the interior edges strictly below a value yields the
            # index of its right-closed bin; values outside the training range
            # fall into the first or last bin.
            train_features[column] = np.searchsorted(inner_edges, train_features[column].to_numpy(), side='left')
            test_features[column] = np.searchsorted(inner_edges, test_features[column].to_numpy(), side='left')

        # One-hot encode, then force the testing frame onto the training
        # columns: bins never seen in testing become all-zero columns and bins
        # never seen in training are dropped.
        self.training = self.__one_hot_encoding(train_features)
        self.testing = self.__one_hot_encoding(test_features).reindex(columns=self.training.columns, fill_value=0)

        # Standardise both frames with the training mean and standard deviation.
        train_mean = self.training.mean(axis=0)
        train_std = self.training.std(axis=0)
        self.training = (self.training - train_mean)/train_std
        self.testing = (self.testing - train_mean)/train_std

        # Constant training columns have zero std and turn into NaN; drop them
        # and keep the testing frame on the same columns.
        self.training = self.training.dropna(axis=1, thresh=int(non_na_thresh*self.training.shape[0]))
        self.testing = self.testing[self.training.columns]

        self.__eigen_vector_calculation()

        # Project both frames onto the retained principal directions.
        self.training = np.matmul(self.training.to_numpy(), self.Q)
        self.testing = np.matmul(self.testing.to_numpy(), self.Q)

        # Oversample the minority class of the training data only.
        resampled_features, resampled_label = SMOTE(
            sampling_strategy='minority', random_state=random_state
        ).fit_resample(self.training, training_label)
        self.training = pd.DataFrame(resampled_features)
        self.training['class'] = resampled_label

        # Split the projected testing rows: 2/3 for cross-validation, 1/3 held out.
        X_cv, X_test, Y_cv, Y_test = train_test_split(
            pd.DataFrame(self.testing), testing_label, test_size=1/3, random_state=random_state
        )

        self.cv = X_cv.copy()
        self.cv['class'] = Y_cv

        self.testing = X_test.copy()
        self.testing['class'] = Y_test

        super().__init__(1, LearningRate=LR, ErrorTolerance=ET)

    def __eigen_vector_calculation(self):
        """Store in ``self.Q`` the leading ``n_comp`` singular vectors of the training data.

        ``self.training`` is expected to be the standardised feature frame, so
        ``X.T @ X / n_rows`` is its second-moment matrix.
        """
        data = np.array(self.training)
        sigma_hat = (1/data.shape[0]) * np.matmul(data.T, data)
        self.Q = np.linalg.svd(sigma_hat)[0][:, :self.n_comp]

    def __one_hot_encoding(self, df):
        """Return a one-hot encoded copy of ``df``.

        Each input column yields one indicator column per distinct value,
        named ``<column>_<value>`` and ordered by first appearance of the
        value. The input frame is left untouched.
        """
        encoded_blocks = []
        encoded_names = []
        for column in df.columns:
            # factorize numbers the distinct values in order of first appearance.
            positions, levels = pd.factorize(df[column])
            encoded_blocks.append(np.eye(len(levels))[positions])
            encoded_names.extend(column + '_' + str(level) for level in levels)
        return pd.DataFrame(np.concatenate(encoded_blocks, axis=1), columns=encoded_names)

In [None]:
def evaluate(predicted, actual):
    """Score binary predictions against the true labels.

    Args:
        predicted: array-like of predicted labels (1 = positive, 0 = negative).
        actual: array-like of true labels, comparable element-wise with ``predicted``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1_score)``. Any metric whose
        denominator is zero is reported as 0.
    """
    def _ratio_or_zero(numerator, denominator):
        # An empty denominator means the metric is undefined; report 0 instead.
        return 0 if denominator == 0 else numerator/denominator

    said_pos, said_neg = (predicted == 1), (predicted == 0)
    truly_pos, truly_neg = (actual == 1), (actual == 0)

    # Confusion-matrix cells.
    hits = np.count_nonzero(said_pos & truly_pos)
    correct_rejections = np.count_nonzero(said_neg & truly_neg)
    false_alarms = np.count_nonzero(said_pos & truly_neg)
    misses = np.count_nonzero(said_neg & truly_pos)

    accuracy = _ratio_or_zero(hits + correct_rejections,
                              hits + correct_rejections + false_alarms + misses)
    precision = _ratio_or_zero(hits, hits + false_alarms)
    recall = _ratio_or_zero(hits, hits + misses)
    f1_score = _ratio_or_zero(2 * precision * recall, precision + recall)

    return (accuracy, precision, recall, f1_score)

In [None]:
# Scores of every evaluated configuration, keyed by its hyperparameter tuple.
Results = {}

# Running record of the best configuration seen so far. 'Best Obj' and
# 'Parameter' hold placeholders until a model beats the all-zero scores.
best = {
    'Accuracy': 0,
    'Precision': 0,
    'Recall': 0,
    'F1 Score': 0,
    'Best Obj': 'obj',
    'Parameter': (),
}

# Trying Different Combinations of Hyperparameters

In [None]:
# Exhaustive grid search over the missing-value threshold, the number of
# principal components, and the exponents of the learning rate and error
# tolerance. A configuration is "better" when the sum of its accuracy,
# precision and recall on the cv split exceeds the current best sum.
for non_na_thresh in np.arange(0.50, 0.96, 0.05):
    for n_comp in range(2, 537, 19):
        for LR in [5, 6, 7, 8, 9, 10]:
            for ET in [5, 6, 7, 8, 9, 10]:
                # LR and ET are exponents: the model must receive 10**(-LR)
                # and 10**(-ET), matching the labels stored in the result key.
                obj = Scania_APS_Failure(training_data, testing_data, n_comp, non_na_thresh,
                                         LR=10**(-LR), ET=10**(-ET))
                obj.fit(obj.training.iloc[:, :-1], obj.training['class'])
                obj.predict(obj.cv.iloc[:, :-1])

                parameter = (non_na_thresh, n_comp, f'10**(-{LR})', f'10**(-{ET})')
                scores = evaluate(obj.predicted_labels, obj.cv['class'])
                Results[parameter] = scores

                accuracy, precision, recall, f1_score = scores
                if (accuracy + precision + recall) > (best['Accuracy'] + best['Precision'] + best['Recall']):
                    best['Accuracy'] = accuracy
                    best['Precision'] = precision
                    best['Recall'] = recall
                    best['F1 Score'] = f1_score
                    best['Best Obj'] = obj
                    best['Parameter'] = parameter
                else:
                    # Release the rejected model before the next one is built.
                    del obj

In [None]:
# Display the scores, model object and hyperparameter tuple recorded in `best`.
best

# From 10,440 hyperparameter combinations, we select our best model

In [None]:
# Keep a direct handle on the model stored under best['Best Obj'].
best_obj = best['Best Obj']

In [None]:
# Predict on the model's testing split; every column except the last is passed as a feature.
best_obj.predict(best_obj.testing.iloc[:,:-1])

In [None]:
# Report (accuracy, precision, recall, f1_score) of the selected model on its testing split.
evaluate(best_obj.predicted_labels, best_obj.testing['class'])