## Assignment 1
# Group 13
Mathieu Mailhot - Isabel Lougheed - Frank-Lucas Pantazis

Imports

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

In [8]:
"""
- CKD: 28 numerical features, 1 target binary classification variable ("Normal" / "CKD")
- Battery: 32 real-valued features, 2 classes ("Normal" / "Defective")
"""

# load data sets

# Calculate cross entropy or/ Information Gain for all the data without the threshold

# statistical analysis on the datasets

# - normalize

# models: all features, selective features based on statistical analysis (dropping features)


'\n- CKD: 28 numerical features, 1 target binary classification variable ("Normal" / "CKD")\n- Battery: 32 real-valued features, 2 classes ("Normal" / "Defective")\n'

Statistics

In [9]:
# load data sets
df_CKD = pd.read_csv("CKD.csv")
df_battery = pd.read_csv("Battery_Dataset.csv")

# Convert "CKD" to 1 and "Normal" to 0
df_CKD["label"] = df_CKD["label"].replace({"CKD": 1, "Normal": 0})
# Convert "Defective" to 1 and "Normal" to 0
df_battery["label"] = df_battery["label"].replace({"Defective": 1, "Normal": 0})

# Convert to a numpy array
CKD_data = df_CKD.to_numpy()
battery_data = df_battery.to_numpy()

In [5]:
# Statistical Analysis Block

# Class for the analysis
class Stat_analysis:
    def __init__(self, data, name, save_folder):
        self.data = data
        self.name = name
        self.save_folder = save_folder
        self.feature_distribution()
        self.class_distrubution()

    # Function to create a distribution for each feature
    def feature_distribution(self):
        for i in range(self.data.shape[1] - 2): # remove 1 and last column as we do not need them for the distribution of the features
            feature_num = i + 1

            plt.hist(self.data[:,feature_num], bins=20, edgecolor="black")
            plt.xlabel("Value")
            plt.ylabel("Frequency")
            plt.title(f"{self.name} Distribution of Feature {feature_num}")

            filename = os.path.join(self.save_folder, f"{self.name}_feature{feature_num}_distribution.png")
            plt.savefig(filename, dpi=300, bbox_inches="tight")
            plt.close()
    
    # Function to create a distribution for the class
    def class_distrubution(self):
        my_bins = [-0.5, 0.5, 1.5]
        class_0 = "Normal"
        if self.name == "CKD":
            class_1 = "CKD"
        else:
            class_1 = "Defective"

        plt.hist(self.data[:,self.data.shape[1] - 1], bins=my_bins, edgecolor="black", align="mid", rwidth=0.6)
        plt.xticks([0, 1], [class_0, class_1])
        plt.xlabel("Category")
        plt.ylabel("Frequency")
        plt.title(f"{self.name} Distribution of Class")

        filename = os.path.join(self.save_folder, f"{self.name}_class_distribution.png")
        plt.savefig(filename, dpi=300, bbox_inches="tight")
        plt.close()

# Perform the stastical analysis
CKD_stat = Stat_analysis(CKD_data, "CKD", "CKD_distribution")
battery_stat = Stat_analysis(battery_data, "Battery", "Battery_distribution")




In [None]:
class Model:
    def __init__(self,  dataframe, data_array, description:str = None):
        self.max_iteration = 1000
        self.tolerance = 1**-3
        self.step_size = 1

        self.dataframe = dataframe
        self.data_array = data_array
        self.description = description
        
        self.accuracy_arr = 0
        self.avg_accuracy = 0
        
        self.weigth_arr = 0
        self.avg_weigth = 0
    
    def normalize(self, normalize_by_max:bool, standardize:bool): 
        # Normalize the dataset
        # I think you should only normalize by max OR standardize, and I think standardizing would produce better results

        # Separate features and target
        df = self.dataframe # make sure data is a data frame
        features = df.drop(['ID', 'label'], axis=1)
        target = df['label']

        df_norm = df # if normalize_by_max = false and standardize = false, will return original df

        if normalize_by_max:
            # normalizing by extremas, scales to [0,1]
            # ensures data is well-conditioned
            features_normalized = (features - features.min())/(features.max() - features.min())
            df_norm = pd.concat([df[['ID']], features_normalized, df[['label']]], axis=1)

        if standardize:
            # z score normalization, good for gaussian distributions
            # forces std 1 and mean 0
            features_standardized = (features - features.mean())/features.std()
            df_norm = pd.concat([df[['ID']], features_standardized, df[['label']]], axis=1)

        # returns a pandas dataframe
        return df_norm
    
    def crossValidation(self, folds:int): 
        # Split dataset into folds
        # I think that self.data should only include non test data
        data = self.data_array[:,1:] # removing first column (ID)
        fold_size = len(data) // folds
        validation_experiments = []
        train_experiments = []

        for i in range(folds):
            if i==(folds-1):
                # how should i deal with uneven split ??? is it okay for the last fold to be smaller?
                validation_fold = data[(i*fold_size):]  #df.iloc[(i*fold_size):(len(df))]
                train_fold = data[:(i*fold_size)] #pd.concat([df.iloc[:(i*fold_size)], df.iloc[(len(df)):]])

            else:
                validation_fold = data[(i*fold_size):(i*fold_size + fold_size)] #df.iloc[(i*fold_size):(i*fold_size + fold_size)]
                train_fold = np.vstack([data[:(i*fold_size)], data[(i*fold_size + fold_size):]]) #pd.concat([df.iloc[:(i*fold_size)], df.iloc[(i*fold_size + fold_size):]])
            validation_experiments.append(validation_fold)
            train_experiments.append(train_fold)

        # Train
        avg_error = 0

        for i in range(folds):
            # train each training set with fit() to get weights
            train_experiment = train_experiments[i]
            w = self.fit(train_experiment)   

            # get errors 
            validation_experiment = validation_experiments[i]
            error = self.Accu_eval(w, validation_experiment)
            avg_error += error

        avg_error = avg_error/folds

        return avg_error

    def fit(self,train_data):
        
        # Trains using gradient descent: Lecture 5 slide 55-58

        w_prev = np.zeros(train_data.shape[1]) # (number of columns of train_data - 1 to remove label, + 1 to account for bias term)
        w_prev[-1]=1 # bias term
        w_new = np.zeros(len(w_prev))
        w_diff = 1

        delta = np.zeros(len(w_prev))

        for k in range(self.max_iteration):
            
            for row_i in train_data:
                x_i = row_i[:-1] # extracting features
                x_i = np.append(x_i, 1) # because of bias term
                y_i = row_i[-1] # extracting value
                
                delta += (y_i-self.predict(w_prev,x_i))*x_i

            w_new = w_prev - self.step_size*delta
            w_diff = np.linalg.norm(w_prev-w_new)
            
            if (w_diff**2<self.tolerance):
                return w_new
        # Means there was a problem
        return -1

    
    def predict(self,w,x):
        # Predicts output: function at bottom Lecture 5 slide 44
        a = np.dot(w,x)
        return 1/(1+np.exp(-a))

    def Accu_eval(self,w,validate_data): # Used MSE
        # Validation
        error = 0
        for row_i in validate_data:
            x_i = row_i[:-1] # extracting features
            x_i = np.append(x_i, 1) # because of bias term
            y_i = row_i[-1] # extracting value

            error += abs(y_i-self.predict(w,x_i))**2 
        return error/len(validate_data)



In [7]:
# testing whole model for CKD data with cross validation


ckd_model = Model(df_CKD, CKD_data, "This is the model for the CKD dataset")
features_ckd = df_CKD.drop(['ID', 'label'], axis=1)

def condition_number(features):
    A = features.to_numpy()
    U, S, V = np.linalg.svd(A, full_matrices=False)
    condition_num = np.max(S) / np.min(S[np.nonzero(S)])
    return condition_num

# normalize data
df_norm = ckd_model.normalize(True, False)
features_ckd_norm = df_norm.drop(['ID', 'label'], axis=1)
ckd_model_norm = Model(df_norm, df_norm.to_numpy())

# standardize data
df_stand = ckd_model.normalize(False, True)
features_ckd_stand = df_stand.drop(['ID', 'label'], axis=1)
ckd_model_stand = Model(df_stand, df_stand.to_numpy())

print("Condition number of original dataset : ", condition_number(features_ckd))
print("Condition number of normalized dataset : ", condition_number(features_ckd_norm))
print("Condition number of standardized dataset : ", condition_number(features_ckd_stand))
print("---------------------------------------------------------")
print("Error after training original dataset : ", ckd_model.crossValidation(10))
print("Error after training normalized dataset : ", ckd_model_norm.crossValidation(10))
print("Error after training standardized dataset : ", ckd_model_stand.crossValidation(10))


Condition number of original dataset :  1178.126833757448
Condition number of normalized dataset :  16.272627194785652
Condition number of standardized dataset :  1.6816260377340286
---------------------------------------------------------
Error after training original dataset :  [0.3593876  0.33580925 0.36373738 0.36223034 0.36767075 0.35798222
 0.36939785 0.37156406 0.37481658 0.36474501 0.36398977 0.36019274
 0.36937529 0.36753336 0.3627901  0.36210817 0.36464621 0.36861166
 0.3566093  0.36199498 0.47000481 0.49618697 0.48537032 0.49111759
 0.49453358 0.49131355 0.5        0.5        0.30338807]
Error after training normalized dataset :  [0.38319449 0.33351565 0.36394263 0.36236054 0.36750006 0.35858971
 0.36993136 0.3720782  0.3751317  0.36522074 0.3639623  0.36025588
 0.3698658  0.36807489 0.36298735 0.36243639 0.36473984 0.3684353
 0.35724273 0.36406999 0.36278499 0.37029041 0.37244596 0.37130714
 0.36152882 0.36043204 0.35121047 0.35321716 0.30338807]
Error after training standa