<a href="https://colab.research.google.com/github/HungYangChang/ECSE551/blob/master/Miniproject1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import relevant modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

# Read Data Sets (Bankruptcy and Hepatitis)

In [None]:
# Load the bankruptcy data set from the course repository into a DataFrame.
# The "bankrupcy" spelling is part of the remote file path and must stay as is.
url = "https://raw.githubusercontent.com/jonarsenault/ecse551data/master/bankrupcy.csv"
bank_data = pd.read_csv(url)

# Display the first rows as a quick sanity check of the load
print(bank_data.head())

# Show the (rows, columns) size of the data; as the last expression of the
# cell its value is what the notebook displays
bank_data.shape

In [None]:
# Load the hepatitis data set from the course repository into a DataFrame
url = "https://raw.githubusercontent.com/jonarsenault/ecse551data/master/hepatitis.csv"
hep_data = pd.read_csv(url)

# Display the first rows as a quick sanity check of the load
print(hep_data.head())

# Show the (rows, columns) size of the data
hep_data.shape

# Column positions of the categorical and numerical features. These are "raw"
# positions, i.e. counted before a bias column is inserted in front of the
# features; the experiment cells add 1 to them once the bias column exists.
index_cat_columns_raw = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18])
index_num_columns_raw = np.array([0, 13, 14, 15, 16, 17])

# Define utility functions for logistic regression
- shuffle_data
- splitdata
- standardization
- sigmoid
- log_transform


In [None]:
def shuffle_data(X, y, random_seed=None):
  """Return a row-shuffled copy of the features and labels, keeping rows paired.

  X and y are joined column-wise so that a single shuffle moves every sample
  together with its label. The inputs themselves are left untouched. When
  ``random_seed`` is given, NumPy's global generator is re-seeded with it
  first, which makes the resulting order reproducible.

  Returns ``(X_shuffle, y_shuffle)``; the labels come back as a single column
  (the last column of the joined array).
  """

  reseed = random_seed is not None
  if reseed:
    np.random.seed(random_seed)

  # Work on copies and glue the labels onto the right-hand side of the features
  stacked = np.concatenate((X.copy(), y.copy()), axis=1)

  # Row-wise, in-place shuffle of the joined array
  np.random.shuffle(stacked)

  # Everything but the last column is features; the last column is the label
  features_part = stacked[:, :-1]
  labels_part = stacked[:, [-1]]

  return features_part, labels_part

def splitdata(X, y, perc_training, random_flag=True):
  """Cut the rows into a leading training part and a trailing test part.

  The first ``int(rows * perc_training)`` rows form the training set and the
  remaining rows form the test set. No shuffling happens here, and
  ``random_flag`` is accepted but not used.

  Returns ``(X_train, y_train, X_test, y_test)``.
  """

  # Row position at which the training part ends and the test part begins
  cut = int(X.shape[0] * perc_training)

  head = slice(None, cut)
  tail = slice(cut, None)

  return X[head, :], y[head], X[tail, :], y[tail]

def standardization(data, training_data):
  """Standardize each column of ``data`` with statistics of ``training_data``.

  Every column has the training mean subtracted and is divided by the
  training (population) standard deviation, so validation/test data can be
  scaled with the statistics of the training set.

  Columns whose training standard deviation is zero (constant columns, such
  as a bias column of ones) are only centred: dividing them by zero would
  fill the column with NaN/inf and break any model fitted on it.

  Parameters:
    data: array to standardize.
    training_data: array whose column means and standard deviations are used.

  Returns:
    The standardized copy of ``data``.
  """

  column_mean = training_data.mean(axis=0)
  column_std = training_data.std(axis=0)

  # Use a divisor of 1 where the spread is zero to avoid 0/0 and x/0
  safe_std = np.where(column_std == 0, 1.0, column_std)

  data_standardized = (data - column_mean) / safe_std

  return data_standardized

def sigmoid(x):
  """Apply the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

  The value is computed from ``exp(-|x|)``, which always lies in (0, 1], so
  large-magnitude inputs cannot overflow the exponential (the direct formula
  overflows and raises RuntimeWarnings for large negative ``x``).

  Parameters:
    x: scalar or array-like of real numbers.

  Returns:
    A NumPy scalar for scalar input, otherwise an array shaped like ``x``.
  """

  x = np.asarray(x)

  # exp(-|x|) never exceeds 1, whatever the magnitude of x
  decay = np.exp(-np.abs(x))

  # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function
  result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

  # Indexing with () turns a 0-d array back into a scalar and leaves arrays as is
  return result[()]


def log_transform(X_data, feature, replace=True, bank=False):
  """Log-transform the listed columns that look skewed.

  A column counts as skewed when ``|mean - median| / mean`` exceeds 0.1.
  With ``bank=True`` the column is first shifted by ``|min| + 0.5`` so that
  every value is positive before the logarithm is taken.

  With ``replace=True`` the column is overwritten in place in ``X_data``.
  With ``replace=False`` the logarithm is inserted as a new column just
  before the last column of the array (and, when ``bank=True``, the source
  column still receives the shift in place).

  Returns the resulting array (the same object as ``X_data`` unless a column
  was inserted).
  """

  for col in feature:
    # Relative gap between mean and median, used as a cheap skewness check
    # NOTE(review): the divisor is the signed mean, so a column with a
    # negative mean can never pass the 0.1 threshold -- confirm this is intended.
    col_mean = np.mean(X_data[:, col])
    col_median = np.median(X_data[:, col])
    relative_gap = np.absolute(col_mean - col_median) / col_mean

    if not (relative_gap > 0.1):
      continue

    if bank:
      # Bankruptcy data may be non-positive: shift so the minimum becomes positive
      X_data[:, col] = X_data[:, col] + np.absolute(np.min(X_data[:, col])) + 0.5

    logged = np.log(X_data[:, col])

    if replace:
      X_data[:, col] = logged
    else:
      X_data = np.insert(X_data, X_data.shape[1] - 1, logged, axis=1)

  return X_data

# Logistic Classifier Class

In [None]:
class LogisticClassifier():
  """Binary logistic-regression classifier trained with full-batch gradient steps.

  The current weights are kept in ``self._w``. The notebook passes them as a
  column vector with one entry per feature column (bias column included).
  """

  def __init__(self, weights):
    """Store the initial weight vector."""

    self._w = weights

  def fit(self, x_train, y_train, max_iters, tolerance=1e-2, print_results=True, store_w_iteration= False, learning_rate="dependent"):
    """Fit the model to the training data.

    Weights are updated until the squared norm of the last weight change is
    no larger than ``tolerance`` or the iteration counter reaches
    ``max_iters``.

    Parameters:
      x_train: feature matrix, one row per sample.
      y_train: column vector of 0/1 labels.
      max_iters: upper bound for the iteration counter (which starts at 1).
      tolerance: threshold on the squared norm of the weight change.
      print_results: print a convergence message and the training accuracy.
      store_w_iteration: when true, record the weights after every update;
        otherwise only the final weights are recorded.
      learning_rate: ``"dependent"`` uses ``1 / (1 + iteration)``,
        ``"small"`` uses 0.1 and any other value uses 0.5.

    Returns:
      ``(accuracy, iteration, weight_store_fit)``: the training accuracy, the
      final value of the iteration counter, and the list of recorded
      (flattened) weight vectors.
    """

    # Weights recorded either at every iteration or only at the end
    weight_store_fit = []

    iteration = 1
    delta_weights = 1e6

    while delta_weights > tolerance and iteration < max_iters:

      # Step size for this iteration
      if learning_rate == "dependent":
        alpha = 1/(1 + iteration)
      elif learning_rate == "small":
        alpha = 0.1
      else:
        alpha = 0.5

      # Keep the weights from before the update to measure the change
      weights_previous = self._w

      # sum_i x_i * (y_i - sigmoid(x_i . w)): the gradient of the
      # log-likelihood, i.e. the descent direction of the cross-entropy loss
      gradient = np.sum(
      x_train * (y_train - sigmoid(np.dot(x_train, weights_previous))), axis=0
      ).reshape(-1, 1)

      # Builds a new array, so weights_previous is left untouched
      self._w = weights_previous + alpha * gradient

      # Squared norm of the weight change, compared against the tolerance
      delta_weights = np.linalg.norm(self._w - weights_previous) ** 2

      if store_w_iteration:
        weight_store_fit.append(self._w.flatten())

      iteration += 1

    if not store_w_iteration:
      # Only the final weights are wanted
      weight_store_fit.append(self._w.flatten())

    # Training accuracy with the fitted weights
    y_pred_train = self.predict(x_train)

    accuracy = self.accu_eval(y_train, y_pred_train)

    if print_results:
      # Convergence is decided by the tolerance, not by the counter, so a run
      # that meets the tolerance on its last allowed step is still a success
      # and a NaN weight change is reported as a failure
      converged = delta_weights <= tolerance
      if converged:
        print (f"Model converged in {iteration} iterations")
      else:
        print (f"Failed to converge in {max_iters} iterations")
      print(f"Training accuracy: {100*accuracy:.2f}")

    return accuracy, iteration, weight_store_fit



  def predict(self, X):
    """Predict 0/1 class labels for the rows of ``X``.

    A sample is labelled 1 when its predicted probability is at least 0.5.
    """

    # Decision boundary
    decision_boundary = 0.5

    # Probability of class 1 for each sample
    y_pred_prob = sigmoid(np.dot(X, self._w))

    # Assign class labels based on the decision boundary
    y_pred = np.where(y_pred_prob < decision_boundary, 0, 1)

    return y_pred


  def accu_eval(self, y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are flattened before the comparison so that a ``(n,)`` array
    compared with a ``(n, 1)`` array is matched element by element instead of
    being broadcast into an n-by-n comparison.
    """

    true_flat = np.ravel(y_true)
    pred_flat = np.ravel(y_pred)

    accuracy = np.count_nonzero(true_flat == pred_flat) / true_flat.size

    return accuracy

  def cross_entropy_loss(self, x_test, y_test):
    """Return the summed cross-entropy loss of the model on the given samples."""

    y_pred_prob = sigmoid(np.dot(x_test,self._w))
    y_pred_prob_m1 = 1 - y_pred_prob

    # Floor both probabilities at 1e-5 so that log(0) never occurs
    y_pred_prob = np.where(y_pred_prob < 1e-5, 1e-5, y_pred_prob)
    y_pred_prob_m1 = np.where(y_pred_prob_m1 < 1e-5, 1e-5, y_pred_prob_m1)

    # Contribution of the positive (y = 1) and negative (y = 0) samples
    loss_positive = y_test * np.log(y_pred_prob)
    loss_negative = (1-y_test) * np.log(y_pred_prob_m1)

    loss = -np.sum(loss_positive + loss_negative)

    return loss


# K-fold Cross Validation

In [None]:
def kfold_cross_validation(model, X_train, y_train, k=10, tolerance = 1e-2, standardize_idx=None, printresult=True, learning_rate="dependent", max_iters=15000):
  """Run k-fold cross validation of ``model`` on the given samples.

  The rows are cut into ``k`` consecutive folds of ``len(X_train) // k`` rows
  (the last fold also takes any leftover rows). Each fold is held out once
  for validation while the model, reset to the weights it had on entry, is
  fitted on the remaining rows.

  Every fold works on its own copy of the data, so standardizing a fold
  never alters ``X_train`` and cannot leak into the folds that follow.

  Parameters:
    model: object exposing ``_w``, ``fit``, ``predict``, ``accu_eval`` and
      ``cross_entropy_loss`` (e.g. ``LogisticClassifier``).
    X_train, y_train: 2-D feature and label arrays with matching row counts.
    k: number of folds.
    tolerance: stopping tolerance forwarded to ``model.fit``.
    standardize_idx: ``None`` for no standardization, the string ``"all"``
      to standardize every column except column 0 (the bias column in this
      notebook), or a list/array of column indices to standardize. The
      statistics always come from the training part of the fold.
    printresult: print per-fold and summary results when true.
    learning_rate: learning-rate mode forwarded to ``model.fit``.
    max_iters: iteration cap forwarded to ``model.fit``.

  Returns:
    ``(accuracy_train_store, accuracy_test_store, iteration_store,
    weight_store_fold)`` with one entry per fold in each list.
  """

  # Number of samples in each fold
  fold_size = len(X_train) // k
  if printresult:
    print ("Now doing k-fold cross validation, fold size= {}, x train shape = {}, y train shape = {}".format(fold_size, X_train.shape, y_train.shape))

  accuracy_train_store = []
  accuracy_test_store = []
  iteration_store = []
  weight_store_fold = []

  # Every fold starts from the same initial weights
  initial_weights = model._w

  # Decide once which columns get standardized. The isinstance check keeps an
  # index array from being compared element-wise against the string "all".
  if isinstance(standardize_idx, str) and standardize_idx == "all":
    columns = slice(1, None)
  elif standardize_idx is not None:
    columns = standardize_idx
  else:
    columns = None

  for fold_number in range(k):

    # Reset weights
    model._w = initial_weights

    # Rows held out for validation in this fold
    index_start = fold_size*fold_number
    if fold_number == (k-1):
      # The final fold runs to the end and absorbs the leftover rows
      validation_rows = slice(index_start, None)
    else:
      validation_rows = slice(index_start, fold_size*(fold_number+1))

    training_mask = np.ones(len(X_train), dtype=bool)
    training_mask[validation_rows] = False

    # Boolean indexing returns copies; the validation slices are copied
    # explicitly because a plain slice would be a view into the input
    X_train_fold = X_train[training_mask, :]
    y_train_fold = y_train[training_mask, :]
    X_validation_fold = X_train[validation_rows, :].copy()
    y_validation_fold = y_train[validation_rows, :].copy()

    # Standardize if required, using statistics of the training part only
    if columns is not None:
      reference = X_train_fold[:, columns].copy()
      X_train_fold[:, columns] = standardization(X_train_fold[:, columns], reference)
      X_validation_fold[:, columns] = standardization(X_validation_fold[:, columns], reference)

    # Fit the model
    t_start = time.time()
    accuracy_train, iteration, weight_store = model.fit(X_train_fold, y_train_fold, max_iters = max_iters, tolerance=tolerance, print_results=False, learning_rate=learning_rate)
    t_end = time.time()

    # Accuracy and cross-entropy loss on the held-out fold
    y_pred_test = model.predict(X_validation_fold)
    accuracy_test = model.accu_eval(y_validation_fold, y_pred_test)
    cross_entropy = model.cross_entropy_loss(X_validation_fold,y_validation_fold)

    # Store values
    accuracy_train_store.append(accuracy_train)
    accuracy_test_store.append(accuracy_test)
    iteration_store.append(iteration)
    weight_store_fold.append(weight_store)

    if printresult:
      print(f"### Fold number {fold_number+1} ###")
      print(f"Execution time: {t_end-t_start:.3f}s")
      print(f"Training Accuracy: {100*accuracy_train:.2f} %")
      print(f'Testing Accuracy: {100*accuracy_test:.2f} %')
      print(f'Cross-entropy loss CE: {cross_entropy}')

  cross_accuracy = np.mean(accuracy_test_store)
  if printresult:
    print("###########################")
    print (f"Mean testing accuracy is {cross_accuracy*100:.2f} %" )

  return accuracy_train_store, accuracy_test_store, iteration_store, weight_store_fold


# Define a function to easily run experiments

In [None]:
def test_classifier(model, X, y, num_folds=None, num_loops=1, standardize_idx=None, tolerance=1e-2, max_iters=1000, random_seed=None, print_results=False, learning_rate="dependent"):
  """Evaluate ``model`` with a single hold-out split or repeated k-fold CV.

  Parameters:
    model: classifier exposing ``fit``, ``predict`` and ``accu_eval``.
    X, y: feature matrix and label column vector.
    num_folds: ``None`` for one 80/20 hold-out split, otherwise the number
      of folds for k-fold cross validation.
    num_loops: how many times the k-fold cross validation is repeated.
    standardize_idx: ``None`` for no standardization, the string ``"all"``
      for every column except column 0 (the bias column in this notebook,
      matching the k-fold path), or a list/array of column indices.
    tolerance: stopping tolerance for the fit.
    max_iters: iteration cap for the hold-out fit (the k-fold path uses the
      cap of ``kfold_cross_validation``).
    random_seed: when given, the data is shuffled with this seed before the
      hold-out split, or with ``random_seed * (loop + 1)`` before each
      k-fold loop.
    print_results: forwarded to the fit / cross-validation printing.
    learning_rate: learning-rate mode used for fitting in both paths.

  Returns:
    ``(accuracy_train, accuracy_test, iterations, weight_store)``. These are
    single values for the hold-out split, and lists with one per-fold list
    for every loop in the k-fold case.
  """

  if num_folds is None:
    # Single hold-out evaluation, no k-fold cv

    # Shuffle data
    if random_seed is not None:
      X, y = shuffle_data(X, y, random_seed)

    X_train, y_train, X_test, y_test = splitdata(X, y, 0.8)

    # The isinstance check keeps an index array from being compared
    # element-wise against the string "all"
    if isinstance(standardize_idx, str) and standardize_idx == "all":
      # Skip column 0: a constant bias column has zero spread
      columns = slice(1, None)
    elif standardize_idx is not None:
      columns = standardize_idx
    else:
      columns = None

    if columns is not None:
      # The split returns views of X; copy before writing so the caller's
      # data is not standardized in place
      X_train = X_train.copy()
      X_test = X_test.copy()
      reference = X_train[:, columns].copy()
      X_train[:, columns] = standardization(X_train[:, columns], reference)
      X_test[:, columns] = standardization(X_test[:, columns], reference)

    accuracy_train, iterations, weight_store = model.fit(X_train, y_train, max_iters, tolerance, print_results, learning_rate=learning_rate)
    accuracy_test = model.accu_eval(y_test, model.predict(X_test))
  else:
    # Repeated k-fold cross-validation
    iterations = []
    accuracy_train = []
    accuracy_test = []
    weight_store = []
    for i in range(num_loops):

      # Each loop reshuffles the output of the previous loop
      if random_seed is not None:
        X, y = shuffle_data(X, y, random_seed*(i+1))

      accuracy_train_iter, accuracy_test_iter,  iterations_iter, weight_store_iter = \
      kfold_cross_validation(model, X, y, num_folds, tolerance, standardize_idx=standardize_idx, printresult=print_results, learning_rate=learning_rate)

      accuracy_train.append(accuracy_train_iter)
      accuracy_test.append(accuracy_test_iter)
      iterations.append(iterations_iter)
      weight_store.append(weight_store_iter)

  return accuracy_train, accuracy_test, iterations, weight_store

# Set baseline accuracy and iterations for hepatitis data
- Initial weights: zeros
- Learning rate = 1/(1+k), where k is the iteration number
- Stopping criteria: epsilon = 1e-2
- No standardization

In [None]:
# The baseline is the average over repeated, shuffled 10-fold cross validation
# runs (shuffling driven by the random seed) without any standardization.

# Random seed, number of folds and number of repeated CV loops
seed, fold, loops = 0, 10, 10

# Features: every column but the last, plus a leading column of ones for the bias
X_hep_original_no_bias = hep_data.iloc[:, :-1].to_numpy()
X_hep_original = np.insert(
    X_hep_original_no_bias,
    0,
    np.ones(X_hep_original_no_bias.shape[0]),
    axis=1,
)

# Labels: the last column, kept as a column vector
y_hep_original = hep_data.iloc[:, -1].to_numpy().reshape(-1, 1)

# Shift the raw column indices by one to make room for the bias column
index_num_columns = index_num_columns_raw + 1
index_cat_columns = index_cat_columns_raw + 1

# Model starting from all-zero weights
initial_weights = np.zeros((X_hep_original.shape[1], 1))
model = LogisticClassifier(initial_weights)

# Repeated k-fold cross validation
accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_hep_original,
    y_hep_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=None,
    random_seed=seed,
    learning_rate="dependent",
)

# Averages over every fold of every loop are the reference for later tests
baseline_accuracy_hep = np.mean(accuracy_test)
baseline_iterations_hep = np.mean(iterations)

print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*baseline_accuracy_hep:.2f} %")
print(f"Number of iterations: {baseline_iterations_hep}")

# Test accuracy of each fold in the first CV loop
plt.title('k-fold accuracy')
plt.xlabel('fold number')
plt.ylabel('accuracy')
plt.plot(accuracy_test[0])
plt.show()

# Iteration count of each fold in the first CV loop
plt.title('k-fold iteration')
plt.xlabel('fold number')
plt.ylabel('iteration')
plt.plot(iterations[0])
plt.show()

# Hepatitis Test 1: Standardization

In [None]:
# Model starting from all-zero weights
initial_weights = np.zeros((X_hep_original.shape[1], 1))
model = LogisticClassifier(initial_weights)

# Repeated k-fold cross validation with the numerical columns standardized
accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_hep_original,
    y_hep_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=index_num_columns,
    random_seed=seed,
)

# Reference values from the baseline run
print(f"Baseline testing accuracy: {100*baseline_accuracy_hep:.2f} %")
print(f"Baseline iterations: {baseline_iterations_hep}")

# Results with standardization
print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
print(f"Number of iterations: {np.mean(iterations)}")


# Hepatitis Test 2: Initial Weights

In [None]:
# --- Group 1: initial weights drawn uniformly from [0, 1) ---
initial_weights = np.random.rand(X_hep_original.shape[1],1)
model = LogisticClassifier(initial_weights)

accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_hep_original,
    y_hep_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=None,
    random_seed=seed,
)

# Reference values from the baseline run
print(f"Baseline testing accuracy: {100*baseline_accuracy_hep:.2f} %")
print(f"Baseline iterations: {baseline_iterations_hep}")

print("#### Test group 1: Initial weight random 0 to 1 ####")
print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
print(f"Number of iterations: {np.mean(iterations)}")

# --- Group 2: initial weights drawn uniformly from [0, 100) ---
initial_weights = np.random.rand(X_hep_original.shape[1],1) * 100
model = LogisticClassifier(initial_weights)

accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_hep_original,
    y_hep_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=None,
    random_seed=seed,
)

print("#### Test group 2: Initial weight random 0 to 100 ####")
print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
print(f"Number of iterations: {np.mean(iterations)}")

# --- Group 3: initial weights drawn from a standard normal distribution ---
initial_weights = np.random.randn(X_hep_original.shape[1],1)
model = LogisticClassifier(initial_weights)

accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_hep_original,
    y_hep_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=None,
    random_seed=seed,
)

print("#### Test group 3: Initial weight drawn from standard normal ####")
print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
print(f"Number of iterations: {np.mean(iterations)}")




# Hepatitis Test 3: Stopping Condition

In [None]:
# --- Group 1: zero initial weights, tolerance tightened to 1e-6 ---
initial_weights =  np.zeros((X_hep_original.shape[1], 1))
model = LogisticClassifier(initial_weights)

accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_hep_original,
    y_hep_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=None,
    random_seed=seed,
    tolerance=1e-6,
)

# Reference values from the baseline run
print(f"Baseline testing accuracy: {100*baseline_accuracy_hep:.2f} %")
print(f"Baseline iterations: {baseline_iterations_hep}")

print("#### Test group 1: Tolerance 1e-6 ####")
print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
print(f"Number of iterations: {np.mean(iterations)}")

# --- Group 2: zero initial weights, tolerance tightened to 1e-9 ---
initial_weights =  np.zeros((X_hep_original.shape[1], 1))
model = LogisticClassifier(initial_weights)

accuracy_train, accuracy_test, iterations, weight_store  = test_classifier(
    model,
    X_hep_original,
    y_hep_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=None,
    random_seed=seed,
    tolerance=1e-9,
)

# Reference values from the baseline run
print(f"Baseline testing accuracy: {100*baseline_accuracy_hep:.2f} %")
print(f"Baseline iterations: {baseline_iterations_hep}")

print("#### Test group 2: Tolerance 1e-9 ####")
print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
print(f"Number of iterations: {np.mean(iterations)}")


# Hepatitis Test 4: Removing independent features

In [None]:
# Column indices (in the bias-augmented feature matrix) of the independent
# features that are candidates for removal
features_to_remove = [2, 3, 4, 8, 9, 16]

# Features whose individual removal beat the baseline testing accuracy
Improve_list_removing = []

# The baseline was already computed in the baseline cell; re-running it here
# would only repeat a full cross validation whose results are never read.
print(f"Baseline testing accuracy: {100*baseline_accuracy_hep:.2f} %")
print(f"Baseline iterations: {baseline_iterations_hep}")

# --- Remove one candidate feature at a time ---
for i in features_to_remove:
  # Column i of the augmented matrix is column i-1 of the data frame (bias offset)
  print(f"Removing {hep_data.columns[i-1]}...")
  X_hep_remove = np.delete(X_hep_original, i, axis = 1)

  initial_weights = np.zeros((X_hep_remove.shape[1], 1))
  model = LogisticClassifier(initial_weights)

  accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
      model,
      X_hep_remove,
      y_hep_original,
      num_folds=fold,
      num_loops=loops,
      standardize_idx=None,
      random_seed=seed,
  )
  print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
  print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
  print(f"Number of iterations: {np.mean(iterations)}")
  if (np.mean(accuracy_test)>baseline_accuracy_hep):
    Improve_list_removing.append(i)

# --- Remove all candidate features at once ---
print("Removing all independent features...")
X_hep_remove = np.delete(X_hep_original, features_to_remove, axis=1)

initial_weights = np.zeros((X_hep_remove.shape[1], 1))
model = LogisticClassifier(initial_weights)

accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_hep_remove,
    y_hep_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=None,
    random_seed=seed,
)
print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
print(f"Number of iterations: {np.mean(iterations)}")

# --- Remove only the features whose individual removal improved accuracy ---
print("Removing accuracy-improving independent features...")
print ("List of removing independent feature which accuracy improve",Improve_list_removing)
X_hep_remove = np.delete(X_hep_original, Improve_list_removing, axis=1)

initial_weights = np.zeros((X_hep_remove.shape[1], 1))
model = LogisticClassifier(initial_weights)

accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_hep_remove,
    y_hep_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=None,
    random_seed=seed,
)
print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
print(f"Number of iterations: {np.mean(iterations)}")
# The difference is taken between testing accuracies, so label it as such
print(f"Testing accuracy improve: {100*(np.mean(accuracy_test)-baseline_accuracy_hep):.2f} %")

# Hepatitis Test 5: Log transform

In [None]:
# Candidate columns for the log transform: the numerical features
feature_to_test = index_num_columns

print(f"Baseline testing accuracy: {100*baseline_accuracy_hep:.2f} %")
print(f"Baseline iterations: {baseline_iterations_hep}")

# Every test starts from a fresh copy because log_transform can write into
# the array it is given. The improvement line compares testing accuracies,
# so it is labelled as testing accuracy.

# Test 1: replace selected features with log, no standardization
print("----------------------------------------------")
print("Test 1: replace selected features with log, w/o standardization")
X_hep_log = X_hep_original.copy()
X_hep_log_1 = log_transform(X_hep_log, feature_to_test)
initial_weights = np.zeros((X_hep_log_1.shape[1], 1))
model = LogisticClassifier(initial_weights)

accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_hep_log_1,
    y_hep_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=None,
    random_seed=seed,
)

print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
print(f"Number of iterations: {np.mean(iterations)}")
print(f"Testing accuracy improve: {100*(np.mean(accuracy_test)-baseline_accuracy_hep):.2f} %")

# Test 2: replace selected features with log, with standardization
print("----------------------------------------------")
print("Test 2: replace selected features with log, w/ standardization")
X_hep_log = X_hep_original.copy()
X_hep_log_2 = log_transform(X_hep_log, feature_to_test)
initial_weights = np.zeros((X_hep_log_2.shape[1], 1))
model = LogisticClassifier(initial_weights)

accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_hep_log_2,
    y_hep_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=index_num_columns,
    random_seed=seed,
)

print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
print(f"Number of iterations: {np.mean(iterations)}")
print(f"Testing accuracy improve: {100*(np.mean(accuracy_test)-baseline_accuracy_hep):.2f} %")

# Test 3: append log transform of selected features, no standardization
print("----------------------------------------------")
print("Test 3: append log transform of selected features, w/o standardization")
X_hep_log = X_hep_original.copy()
X_hep_log_3 = log_transform(X_hep_log, feature_to_test, replace=False)
initial_weights = np.zeros((X_hep_log_3.shape[1], 1))
model = LogisticClassifier(initial_weights)

accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_hep_log_3,
    y_hep_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=None,
    random_seed=seed,
)

print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
print(f"Number of iterations: {np.mean(iterations)}")
print(f"Testing accuracy improve: {100*(np.mean(accuracy_test)-baseline_accuracy_hep):.2f} %")

# Test 4: append log transform of selected features, with standardization
print("----------------------------------------------")
print("Test 4: append log transform of selected features, w standardization")
X_hep_log = X_hep_original.copy()
X_hep_log_4 = log_transform(X_hep_log, feature_to_test, replace=False)
initial_weights = np.zeros((X_hep_log_4.shape[1], 1))
model = LogisticClassifier(initial_weights)

# TODO Need to add new features to standardization: only the original
# numerical columns are standardized here, not the appended log columns
accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_hep_log_4,
    y_hep_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=index_num_columns,
    random_seed=seed,
)

print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
print(f"Number of iterations: {np.mean(iterations)}")
print(f"Testing accuracy improve: {100*(np.mean(accuracy_test)-baseline_accuracy_hep):.2f} %")




# Hepatitis Test 6: Final results

Test (Anne 10/11)
1. Log append only, init 0-1, +1% (Best??)
2. Delete only, init 0-1, +0.5%
3. Delete+log append, init 0-1, +0.7%
4. Log append only, init 0s, +0.5%
5. Delete only, init 0s, +0.51%
6. Delete+log append, init 0s, +0.48%

In [None]:
# Final configuration for the hepatitis data: drop three features, append the
# log transform of the skewed numerical features, random initial weights.

print(f"Baseline testing accuracy: {100*baseline_accuracy_hep:.2f} %")
print(f"Baseline iterations: {baseline_iterations_hep}")

X_hep_final = X_hep_original.copy()

# Remove features (indices refer to the bias-augmented matrix)
X_hep_final = np.delete(X_hep_final, [2, 3, 9], axis=1)

# New locations of the numerical features: the three removed columns all sit
# before the numerical block, which therefore moves three positions to the left
index_num_columns_new = np.array([1, 11, 12, 13, 14, 15])

# Append (rather than replace with) the log transform of the skewed numerical features
feature_to_test = index_num_columns_new.copy()
X_hep_final = log_transform(X_hep_final, feature_to_test, replace=False)

# Initial weights drawn uniformly from [0, 1); swap in np.zeros for the
# zero-initialized variant listed in the notes of this section
initial_weights = np.random.rand(X_hep_final.shape[1],1)
model = LogisticClassifier(initial_weights)

accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_hep_final,
    y_hep_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=None,
    random_seed=seed,
    tolerance=1e-2,
)

print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
print(f"Number of iterations: {np.mean(iterations)}")
# The difference is taken between testing accuracies, so label it as such
print(f"Testing accuracy improve: {100*(np.mean(accuracy_test)-baseline_accuracy_hep):.2f} %")

# Set baseline accuracy and iterations for bankruptcy data
- Initial weights: zeros
- Learning rate = 1/(1+k)
- Stopping criteria: epsilon = 1e-2
- No standardization

In [None]:
# The bankruptcy baseline is the outcome of a single shuffled (random_seed)
# 10-fold cross validation on the non-standardized features.

# Cross-validation settings: random seed, number of folds, number of loops
seed, fold, loops = 10, 10, 1

# Features are every column except the last one, preceded by a column of ones;
# the last column is kept as an (n, 1) label array.
X_bank_original_no_bias = bank_data.iloc[:, :-1].to_numpy()
X_bank_original = np.insert(X_bank_original_no_bias, 0, 1, axis=1)
y_bank_original = bank_data.iloc[:, -1].to_numpy()[:, np.newaxis]

# Model starting from all-zero weights, one weight per feature column
num_weights = X_bank_original.shape[1]
initial_weights = np.zeros((num_weights, 1))
model = LogisticClassifier(initial_weights)

# One k-fold cross validation without standardization
cv_results = test_classifier(
    model,
    X_bank_original,
    y_bank_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=None,
    random_seed=seed,
)
accuracy_train, accuracy_test, iterations, weight_store = cv_results

# These two means are the reference values the later bankruptcy cells print
baseline_accuracy_bank = np.mean(accuracy_test)
baseline_iterations_bank = np.mean(iterations)

print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*baseline_accuracy_bank:.2f} %")
print(f"Number of iterations: {baseline_iterations_bank}")

# One figure per metric, each showing the first entry of the returned results
for plot_title, y_label, per_fold_values in (
    ('k-fold accuracy', 'accuracy', accuracy_test[0]),
    ('k-fold iteration', 'iteration', iterations[0]),
):
  plt.plot(per_fold_values)
  plt.title(plot_title)
  plt.xlabel('fold number')
  plt.ylabel(y_label)
  plt.show()


# Bankruptcy Test 1: Standardization

In [None]:
# Standardization experiment: zero initial weights, standardize_idx="all".
num_weights = X_bank_original.shape[1]
initial_weights = np.zeros((num_weights, 1))
model = LogisticClassifier(initial_weights)

# One k-fold cross validation
cv_results = test_classifier(
    model,
    X_bank_original,
    y_bank_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx="all",
    random_seed=seed,
)
accuracy_train, accuracy_test, iterations, weight_store = cv_results

# Baseline figures first, then this run's means for comparison
print(f"Baseline testing accuracy: {100*baseline_accuracy_bank:.2f} %")
print(f"Baseline iterations: {baseline_iterations_bank}")

mean_train_accuracy = np.mean(accuracy_train)
mean_test_accuracy = np.mean(accuracy_test)
mean_iterations = np.mean(iterations)
print(f"Training accuracy: {100*mean_train_accuracy:.2f} %")
print(f"Testing accuracy: {100*mean_test_accuracy:.2f} %")
print(f"Number of iterations: {mean_iterations}")

# Bankruptcy Test 2: Initial Weights


In [None]:
# Compare three random weight initializations against the baseline.
# All weights are drawn from one generator seeded with `seed`, so the numbers
# printed below are reproducible from run to run; a dedicated RandomState is
# used so that the global NumPy random state is not modified.
weight_rng = np.random.RandomState(seed)
num_weights = X_bank_original.shape[1]

# (description printed in the group header, function drawing the weights)
weight_initializers = (
    ("Initial weight random 0 to 1",
     lambda: weight_rng.rand(num_weights, 1)),
    ("Initial weight random 0 to 100",
     lambda: weight_rng.rand(num_weights, 1) * 100),
    ("Initial weight drawn from standard normal",
     lambda: weight_rng.randn(num_weights, 1)),
)

print(f"Baseline testing accuracy: {100*baseline_accuracy_bank:.2f} %")
print(f"Baseline iterations: {baseline_iterations_bank}")

for group_number, (description, draw_weights) in enumerate(weight_initializers, start=1):
  # Create an instance of the model object with this group's initial weights
  initial_weights = draw_weights()
  model = LogisticClassifier(initial_weights)

  # Run one k-fold cross validation
  accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
      model,
      X_bank_original,
      y_bank_original,
      num_folds=fold,
      num_loops=loops,
      standardize_idx=None,
      random_seed=seed,
  )

  print(f"#### Test group {group_number}: {description} ####")
  print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
  print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
  print(f"Number of iterations: {np.mean(iterations)}")



# Bankruptcy Test 3: Stopping Condition

In [None]:
# Stopping-condition experiment: the same zero-weight model is cross-validated
# once per tolerance value. Each entry pairs the text shown in the group
# header with the value handed to test_classifier.
tolerance_settings = (("1e-6", 1e-6), ("1e-9", 1e-9))

for group_number, (tolerance_label, tolerance_value) in enumerate(tolerance_settings, start=1):
  # Fresh model with all-zero initial weights for every run
  initial_weights = np.zeros((X_bank_original.shape[1], 1))
  model = LogisticClassifier(initial_weights)

  # One k-fold cross validation with this tolerance
  cv_results = test_classifier(
      model,
      X_bank_original,
      y_bank_original,
      num_folds=fold,
      num_loops=loops,
      standardize_idx=None,
      random_seed=seed,
      tolerance=tolerance_value,
  )
  accuracy_train, accuracy_test, iterations, weight_store = cv_results

  print(f"Baseline testing accuracy: {100*baseline_accuracy_bank:.2f} %")
  print(f"Baseline iterations: {baseline_iterations_bank}")

  print(f"#### Test group {group_number}: Tolerance {tolerance_label} ####")
  print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
  print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
  print(f"Number of iterations: {np.mean(iterations)}")


# Bankruptcy Test 4: Removing independent features

In [None]:
# Generate indices of independent features to remove: features whose absolute
# correlation with the label (last column of bank_data) is below Threshold.
Threshold = 0.15
correlation = bank_data.corr()
correlation_arr = np.abs(correlation.to_numpy())

# Row j of the correlation matrix describes bank_data column j, which is
# column j + 1 of X_bank_original because a column of ones was inserted at
# index 0. Store the X_bank_original index so the column that gets deleted is
# the low-correlation feature itself, not its left neighbour (and never the
# ones column). The last row is the label itself and is skipped.
# Assumes every bank_data column is numeric, so corr() keeps all of them.
features_to_remove = [
    column + 1
    for column, label_correlation in enumerate(correlation_arr[:-1, -1])
    if label_correlation < Threshold
]

initial_weights = np.zeros((X_bank_original.shape[1], 1))
model = LogisticClassifier(initial_weights)
Improve_list_removing = []

# Run one k-fold cross validation on the full feature set.
# NOTE(review): the values returned here are overwritten below before being
# read; the baseline printed next comes from the stored baseline_* variables.
accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_bank_original,
    y_bank_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=None,
    random_seed=seed,
)

print(f"Baseline testing accuracy: {100*baseline_accuracy_bank:.2f} %")
print(f"Baseline iterations: {baseline_iterations_bank}")

# Remove one candidate at a time and record those that beat the baseline
for i in features_to_remove:
  # i indexes X_bank_original, so the matching bank_data column is i - 1
  print(f"Removing {bank_data.columns[i-1]}...")
  X_bank_remove = np.delete(X_bank_original, i, axis=1)

  initial_weights = np.zeros((X_bank_remove.shape[1], 1))
  model = LogisticClassifier(initial_weights)

  accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
      model,
      X_bank_remove,
      y_bank_original,
      num_folds=fold,
      num_loops=loops,
      standardize_idx=None,
      random_seed=seed,
  )
  print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
  print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
  print(f"Number of iterations: {np.mean(iterations)}")
  if np.mean(accuracy_test) > baseline_accuracy_bank:
    Improve_list_removing.append(i)

# Remove all independent features
print("Removing all independent features...")
X_bank_remove = np.delete(X_bank_original, features_to_remove, axis=1)

initial_weights = np.zeros((X_bank_remove.shape[1], 1))
model = LogisticClassifier(initial_weights)

accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_bank_remove,
    y_bank_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=None,
    random_seed=seed,
)
print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
print(f"Number of iterations: {np.mean(iterations)}")

# Remove only the features whose individual removal improved the accuracy
print ("List of removing independent feature which accuracy improve",Improve_list_removing)
print (len(Improve_list_removing))
X_bank_remove = np.delete(X_bank_original, Improve_list_removing, axis=1)

initial_weights = np.zeros((X_bank_remove.shape[1], 1))
model = LogisticClassifier(initial_weights)

accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_bank_remove,
    y_bank_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=None,
    random_seed=seed,
)
print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
print(f"Number of iterations: {np.mean(iterations)}")
# The improvement is measured on the testing accuracy, so label it as such.
print(f"Testing accuracy improvement: {100*(np.mean(accuracy_test)-baseline_accuracy_bank):.2f} %")

# Bankruptcy Test 5: Log transform

In [None]:
##########Test block############
# Scratch cell: inspect what log_transform produces for the bankruptcy data.

print(X_bank_original)
print(X_bank_original.shape)
# Size the column range from X_bank_original: X_bank_log is only created
# further down in this cell, so reading it here fails on a fresh kernel.
# NOTE(review): the range stops at shape[1] - 2, so the last feature column is
# left out -- confirm this is intended.
feature_to_test = range(1, X_bank_original.shape[1] - 1)
print(feature_to_test)

# shift all values up to remove negative values
X_bank_norm = X_bank_original.copy()
for i in feature_to_test:
  X_bank_norm[:,i] = X_bank_norm[:,i] + np.absolute(np.min(X_bank_norm[:,i]))

# Test 1 on the shifted data, no standardization
# NOTE(review): the printed label says "replace" but replace=False is passed
# to log_transform -- confirm which one is meant.
print("----------------------------------------------")
print("Test 1: replace selected features with log, w/o standardization")
X_bank_log = X_bank_norm.copy()
X_bank_log_1 = log_transform(X_bank_log, feature_to_test, replace=False, bank=True)

# Inspect the resulting shape and one of the columns
print(X_bank_log_1.shape)
print(X_bank_log_1[:, 80])



In [None]:
# Set random seed, num_fold, num_loops
seed = 10
fold = 10
loops = 1

# Create original set of features (column of ones first, label kept separately)
X_bank_original_no_bias = bank_data.iloc[:, :-1].to_numpy()
X_bank_original = np.insert(
    X_bank_original_no_bias, 0, np.ones(X_bank_original_no_bias.shape[0]), axis=1
)
y_bank_original = bank_data.iloc[:, -1].to_numpy().reshape(-1, 1)

# Columns handed to log_transform.
# NOTE(review): the range stops at shape[1] - 2, so the last feature column is
# not transformed -- confirm this is intended.
feature_to_test = range(1, X_bank_original.shape[1] - 1)

# Baseline result
print(f"Baseline testing accuracy: {100*baseline_accuracy_bank:.2f} %")
print(f"Baseline iterations: {baseline_iterations_bank}")

# Each experiment: (printed description, extra log_transform arguments,
# extra test_classifier arguments).
# Tests 1 and 2 rely on log_transform's default `replace` behaviour; tests 3
# and 4 pass replace=False.
# NOTE(review): only Test 1 overrides `tolerance`; the other runs use
# test_classifier's default -- confirm the comparison is meant that way.
log_experiments = (
    ("Test 1: replace selected features with log, w/o standardization",
     {},
     {"standardize_idx": None, "tolerance": 1e-6}),
    ("Test 2: replace selected features with log, w/ standardization",
     {},
     {"standardize_idx": "all"}),
    ("Test 3: append log transform of selected features, w/o standardization",
     {"replace": False},
     {"standardize_idx": None}),
    ("Test 4: append log transform of selected features, w standardization",
     {"replace": False},
     {"standardize_idx": "all"}),
)

bank_log_variants = []
for description, transform_options, run_options in log_experiments:
  print("----------------------------------------------")
  print(description)

  # Always transform a fresh copy so X_bank_original is never modified
  X_bank_log = X_bank_original.copy()
  X_bank_log_variant = log_transform(
      X_bank_log, feature_to_test, bank=True, **transform_options
  )
  bank_log_variants.append(X_bank_log_variant)

  # All-zero initial weights, one per column of the transformed matrix
  initial_weights = np.zeros((X_bank_log_variant.shape[1], 1))
  model = LogisticClassifier(initial_weights)

  accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
      model,
      X_bank_log_variant,
      y_bank_original,
      num_folds=fold,
      num_loops=loops,
      random_seed=seed,
      **run_options,
  )

  print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
  print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
  print(f"Number of iterations: {np.mean(iterations)}")
  # The improvement is measured on the testing accuracy, so label it as such.
  print(f"Testing accuracy improvement: {100*(np.mean(accuracy_test)-baseline_accuracy_bank):.2f} %")

# Keep the per-test matrices available under their original names
X_bank_log_1, X_bank_log_2, X_bank_log_3, X_bank_log_4 = bank_log_variants




# Bankruptcy Test 6: Final Results

Test: (Anne 10/11)
1. delete + log append, init 0s, +3.74% (Best??)
2. delete only, init 0s, +3.32%
3. log append only, init 0s, +1.11%
4. delete + log replace, init 0s, +3.71%

In [None]:
# Columns of X_bank_original to drop for the final model.
# NOTE(review): presumably the list produced by the feature-removal
# experiment -- confirm the indices refer to X_bank_original columns.
features_to_remove = np.array([4, 12, 19, 20, 22, 26, 33, 35, 36, 43, 48, 51, 53, 59, 60, 61, 63])

# Set random seed, num_fold, num_loops
seed = 10
fold = 10
loops = 1

# Baseline result
print(f"Baseline testing accuracy: {100*baseline_accuracy_bank:.2f} %")
print(f"Baseline iterations: {baseline_iterations_bank}")

# Remove features. np.delete returns a new array, so X_bank_original is left
# untouched and no explicit copy is needed.
X_bank_final = np.delete(X_bank_original, features_to_remove, axis=1)

# Call log_transform with replace=False on the remaining columns.
# NOTE(review): the range stops at shape[1] - 2, so the last remaining column
# is not transformed -- confirm this is intended.
feature_to_test = range(1, X_bank_final.shape[1] - 1)
X_bank_final = log_transform(X_bank_final, feature_to_test, replace=False, bank=True)

# All-zero initial weights, one per column of the final feature matrix
initial_weights = np.zeros((X_bank_final.shape[1], 1))
model = LogisticClassifier(initial_weights)

tolerance_final = 1e-6

accuracy_train, accuracy_test, iterations, weight_store = test_classifier(
    model,
    X_bank_final,
    y_bank_original,
    num_folds=fold,
    num_loops=loops,
    standardize_idx=None,
    tolerance=tolerance_final,
    random_seed=seed,
)

print(f"Training accuracy: {100*np.mean(accuracy_train):.2f} %")
print(f"Testing accuracy: {100*np.mean(accuracy_test):.2f} %")
print(f"Number of iterations: {np.mean(iterations)}")
# The improvement is measured on the testing accuracy, so label it as such.
print(f"Testing accuracy improvement: {100*(np.mean(accuracy_test)-baseline_accuracy_bank):.2f} %")
