## Assignment 1
# Group 13
Mathieu Mailhot - Isabel Lougheed - Frank-Lucas Pantazis

Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os


In [None]:
"""
- CKD: 28 numerical features, 1 target binary classification variable ("Normal" / "battery")
- Battery: 32 real-valued features, 2 classes ("Normal" / "Defective")
"""

# load data sets

# Calculate cross entropy or/ Information Gain for all the data without the threshold

# statistical analysis on the datasets

# - normalize

# models: all features, selective features based on statistical analysis (dropping features)


'\n- CKD: 28 numerical features, 1 target binary classification variable ("Normal" / "CKD")\n- Battery: 32 real-valued features, 2 classes ("Normal" / "Defective")\n'

Statistics

In [5]:
# load data sets
df_CKD = pd.read_csv("CKD.csv")
df_battery = pd.read_csv("Battery_Dataset.csv")

# Convert "CKD" to 1 and "Normal" to 0
df_CKD["label"] = df_CKD["label"].replace({"CKD": 1, "Normal": 0})
# Convert "Defective" to 1 and "Normal" to 0
df_battery["label"] = df_battery["label"].replace({"Defective": 1, "Normal": 0})

# Convert to a numpy array
CKD_data = df_CKD.to_numpy()
battery_data = df_battery.to_numpy()

In [None]:
# Statistical Analysis Block

# Class for the analysis
class Stat_analysis:
    def __init__(self, data, name, save_folder):
        self.data = data
        self.name = name
        self.save_folder = save_folder
        self.feature_distribution()
        self.class_distrubution()

    # Function to create a distribution for each feature
    def feature_distribution(self):
        for i in range(self.data.shape[1] - 2): # remove 1 and last column as we do not need them for the distribution of the features
            feature_num = i + 1

            plt.hist(self.data[:,feature_num], bins=20, edgecolor="black")
            plt.xlabel("Value")
            plt.ylabel("Frequency")
            plt.title(f"{self.name} Distribution of Feature {feature_num}")

            filename = os.path.join(self.save_folder, f"{self.name}_feature{feature_num}_distribution.png")
            plt.savefig(filename, dpi=300, bbox_inches="tight")
            plt.close()
    
    # Function to create a distribution for the class
    def class_distrubution(self):
        my_bins = [-0.5, 0.5, 1.5]
        class_0 = "Normal"
        if self.name == "CKD":
            class_1 = "CKD"
        else:
            class_1 = "Defective"

        plt.hist(self.data[:,self.data.shape[1] - 1], bins=my_bins, edgecolor="black", align="mid", rwidth=0.6)
        plt.xticks([0, 1], [class_0, class_1])
        plt.xlabel("Category")
        plt.ylabel("Frequency")
        plt.title(f"{self.name} Distribution of Class")

        filename = os.path.join(self.save_folder, f"{self.name}_class_distribution.png")
        plt.savefig(filename, dpi=300, bbox_inches="tight")
        plt.close()

# Perform the stastical analysis
CKD_stat = Stat_analysis(CKD_data, "CKD", "CKD_distribution")
battery_stat = Stat_analysis(battery_data, "Battery", "Battery_distribution")




In [None]:
class Model:
    def __init__(self,  dataframe, data_array, description:str = None):
        
        # Hyperparameter variables for Adam Gradient Descent Algorithm
        self.max_iteration = 10000
        self.tolerance = 10**-4
        self.step_size = 0.001
        self.b_1 = 0.99
        
        # Data variables
        self.dataframe = dataframe
        self.data_array = data_array
        self.description = description
        
        # Training and Validation variables
        self.accuracy_arr = []
        self.avg_accuracy = 0
        
        self.weigth_arr = []
        self.avg_weigth = 0
    
    def normalize(self, normalize_by_max:bool, standardize:bool): 
        # Normalize the dataset
        # I think you should only normalize by max OR standardize, and I think standardizing would produce better results

        # Separate features and target
        df = self.dataframe # make sure data is a data frame
        features = df.drop(['ID', 'label'], axis=1)
        target = df['label']

        df_norm = df # if normalize_by_max = false and standardize = false, will return original df

        if normalize_by_max:
            # normalizing by extremas, scales to [0,1]
            # ensures data is well-conditioned
            features_normalized = (features - features.min())/(features.max() - features.min())
            df_norm = pd.concat([df[['ID']], features_normalized, df[['label']]], axis=1)

        if standardize:
            # z score normalization, good for gaussian distributions
            # forces std 1 and mean 0
            features_standardized = (features - features.mean())/features.std()
            df_norm = pd.concat([df[['ID']], features_standardized, df[['label']]], axis=1)

        # returns a pandas dataframe
        return df_norm
    
    def crossValidation(self, folds:int): 
        # Split dataset into folds
        # I think that self.data should only include non test data
        data = self.data_array[:,1:] # removing first column (ID)
        fold_size = len(data) // folds
        validation_experiments = []
        train_experiments = []

        for i in range(folds):
            if i==(folds-1):
                # how should i deal with uneven split ??? is it okay for the last fold to be smaller?
                validation_fold = data[(i*fold_size):]  #df.iloc[(i*fold_size):(len(df))]
                train_fold = data[:(i*fold_size)] #pd.concat([df.iloc[:(i*fold_size)], df.iloc[(len(df)):]])

            else:
                validation_fold = data[(i*fold_size):(i*fold_size + fold_size)] #df.iloc[(i*fold_size):(i*fold_size + fold_size)]
                train_fold = np.vstack([data[:(i*fold_size)], data[(i*fold_size + fold_size):]]) #pd.concat([df.iloc[:(i*fold_size)], df.iloc[(i*fold_size + fold_size):]])
            validation_experiments.append(validation_fold)
            train_experiments.append(train_fold)

        # Train
        avg_error1 = 0
        avg_error2 = 0
        np.array
        for i in range(folds):
            # train each training set with fit() to get weights
            train_experiment = train_experiments[i]
            w = self.fit(train_experiment)
            self.weigth_arr.append(w.tolist())
            # get errors 
            validation_experiment = validation_experiments[i]
            error1,error2 = self.Accu_eval(w, validation_experiment) 
            
            avg_error1 += error1
            avg_error2 += error2
        
        self.avg_weigth = np.mean(np.array(self.weigth_arr),axis = 0)
        
  
        avg_error1 = avg_error1/folds
        avg_error2 = avg_error2/folds
        return (avg_error1,avg_error2)

    def fit(self,train_data):
        
        # Trains using gradient descent: Lecture 5 slide 55-58

        w_prev = np.ones(train_data.shape[1])*0.1 # (number of columns of train_data - 1 to remove label, + 1 to account for bias term)
        m = 0
        for epoch in range(self.max_iteration):
            delta = np.zeros(len(w_prev))

            for row_i in train_data:
                y_i = row_i[-1] # extracting value
                x_i = row_i.copy() # extracting features
                x_i[-1] = 1 # because of bias term
                delta += x_i*(y_i-self.predict(w_prev,x_i))

            m = self.b_1 * m + (1-self.b_1) * delta
            #w_new = w_prev + self.step_size/(1+0.5*epoch) * delta
            w_new = w_prev + self.step_size * m  # to be able to reach better precision need to devide step size by num iteration
       
            w_diff = np.linalg.norm(w_new - w_prev)
            
            w_prev = w_new
            
            if (w_diff**2<self.tolerance):
                return w_new
           
            
        # Means there was a problem
        print("Not converged", w_new , w_diff)
        return w_new

    
    def predict(self,w,x):
        # Predicts output: function at bottom Lecture 5 slide 44
        a = w.T @ x
        return 1/(1+np.exp(-np.clip(a, -500, 500)))

    def Accu_eval(self,w,validate_data): # Used MSE
        # Validation
        error = 0
        correct_prediction = 0
        for row_i in validate_data:
            y_i = row_i[-1] # extracting value
            x_i = row_i.copy() # extracting features
            x_i[-1] = 1
            
            error += (y_i-self.predict(w,x_i))**2 
            if ((y_i-self.predict(w,x_i))<=0.5):
                correct_prediction+=1
            
        return (error/len(validate_data), 1-correct_prediction/len(validate_data))



In [511]:
# testing whole model for CKD data with cross validation


ckd_model = Model(df_CKD, CKD_data, "This is the model for the CKD dataset")
features_ckd = df_CKD.drop(['ID', 'label'], axis=1)

def condition_number(features):
    A = features.to_numpy()
    U, S, V = np.linalg.svd(A, full_matrices=False)
    condition_num = np.max(S) / np.min(S[np.nonzero(S)])
    return condition_num

# normalize data
df_norm = ckd_model.normalize(True, False)
features_ckd_norm = df_norm.drop(['ID', 'label'], axis=1)
ckd_model_norm = Model(df_norm, df_norm.to_numpy())

# standardize data
df_stand = ckd_model.normalize(False, True)
features_ckd_stand = df_stand.drop(['ID', 'label'], axis=1)
ckd_model_stand = Model(df_stand, df_stand.to_numpy())

#print("Condition number of original dataset : ", condition_number(features_ckd))
print("Condition number of normalized dataset : ", condition_number(features_ckd_norm))
print("Condition number of standardized dataset : ", condition_number(features_ckd_stand))
print("---------------------------------------------------------")
#print("Error after training original dataset : ", ckd_model.crossValidation(10))
print("Error after training normalized dataset : ", ckd_model_norm.crossValidation(10), ckd_model_norm.weigth_arr)

print("Error after training standardized dataset : ", ckd_model_stand.crossValidation(10),ckd_model_stand.avg_weigth)


Condition number of normalized dataset :  16.272627194785652
Condition number of standardized dataset :  1.6816260377340286
---------------------------------------------------------
Error after training normalized dataset :  (0.21288291093027514, 0.16666666666666666) [[-1.4184910875135412, 5.941666800891609, -0.7497868537664139, -0.3891041293756842, -0.20168489809480525, -0.09618833493293504, -0.22916765879222198, -1.3086308676863982, -0.9116986771652039, -0.3429581726790658, 0.25574888369639, 0.8108475023017744, -0.0316494839741422, -0.41351744818039926, -0.6456948109709769, -0.08787931528347197, 0.32234676544124596, 0.299845276026127, 0.5469430252355622, 0.4719236142378602, 0.23380917791387984, -1.118943148246078, -0.5111627979214928, 0.48197642163212057, -0.39961051953744564, 0.18682503386410923, 0.6030978220210198, 0.37598101179493065, -0.5566734155775546], [-1.5642248078796603, 5.230198869739613, -0.19805928497409928, -0.13156802285680028, -0.10866795420122584, -0.0446718078217331

In [512]:
battery_model = Model(df_battery, battery_data, "This is the model for the battery dataset")
features_battery = df_battery.drop(['ID', 'label'], axis=1)

def condition_number(features):
    A = features.to_numpy()
    U, S, V = np.linalg.svd(A, full_matrices=False)
    condition_num = np.max(S) / np.min(S[np.nonzero(S)])
    return condition_num

# normalize data
df_norm = battery_model.normalize(True, False)
features_battery_norm = df_norm.drop(['ID', 'label'], axis=1)
battery_model_norm = Model(df_norm, df_norm.to_numpy())

# standardize data
df_stand = battery_model.normalize(False, True)
features_battery_stand = df_stand.drop(['ID', 'label'], axis=1)
battery_model_stand = Model(df_stand, df_stand.to_numpy())

#print("Condition number of original dataset : ", condition_number(features_battery))
print("Condition number of normalized dataset : ", condition_number(features_battery_norm))
print("Condition number of standardized dataset : ", condition_number(features_battery_stand))
print("---------------------------------------------------------")
#print("Error after training original dataset : ", battery_model.crossValidation(10))
print("Error after training normalized dataset : ", battery_model_norm.crossValidation(10), battery_model_norm.avg_weigth)
print("Error after training standardized dataset : ", battery_model_stand.crossValidation(10),battery_model_stand.avg_weigth)

Condition number of normalized dataset :  17.632795820381226
Condition number of standardized dataset :  1.6891992219130847
---------------------------------------------------------


KeyboardInterrupt: 