<a href="https://colab.research.google.com/github/Helda05/Models/blob/main/ML_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
abcd

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef, roc_auc_score
import numpy as np
from abc import ABC, abstractmethod
import pandas as pd
from sklearn.preprocessing import LabelEncoder

class Preprocessor:
    """Clean a feature table before it is passed to the classifiers.

    Numerical columns are median-imputed, clipped to their 1st/99th
    percentiles and z-score scaled. Boolean columns are imputed with the
    most frequent value of each column.

    Args:
        df: DataFrame holding the feature columns. It is modified in place
            and also returned by :meth:`preprocess`.
        numerical_features: Names of the numerical columns. Defaults to
            ``F1`` .. ``F17``.
        boolean_features: Names of the boolean columns. Defaults to
            ``F18`` .. ``F77``.
    """

    def __init__(self, df, numerical_features=None, boolean_features=None):
        self.df = df
        if numerical_features is None:
            numerical_features = ['F{}'.format(i) for i in range(1, 18)]
        if boolean_features is None:
            boolean_features = ['F{}'.format(i) for i in range(18, 78)]
        self.numerical_features = list(numerical_features)
        self.boolean_features = list(boolean_features)

    def preprocess(self):
        """Run the numerical and categorical steps and return the DataFrame."""
        self.df = self._preprocess_numerical(self.df)
        self.df = self._preprocess_categorical(self.df)
        return self.df

    def _preprocess_numerical(self, df):
        """Impute, clip and standardise every numerical column of ``df``."""
        # Impute missing values with the median of each column.
        medians = df[self.numerical_features].median()
        df[self.numerical_features] = df[self.numerical_features].fillna(medians)

        for feature in self.numerical_features:
            # Clip extreme values to the 1st and 99th percentiles.
            lower, upper = df[feature].quantile([0.01, 0.99]).values
            clipped = df[feature].clip(lower, upper)

            mean = clipped.mean()
            std_dev = clipped.std()

            if std_dev > 0:
                # Z-score scaling.
                df[feature] = (clipped - mean) / std_dev
            else:
                # A constant column (std == 0) or one whose std is undefined
                # (NaN, e.g. a single row) carries no information; the
                # comparison above is False for NaN, so both land here.
                df[feature] = 0.0

        return df

    def _preprocess_categorical(self, df):
        """Fill missing boolean values with the mode of their own column."""
        modes = df[self.boolean_features].mode()
        if not modes.empty:
            # ``mode()`` can return several rows when there are ties; the
            # first row holds one mode per column. Passing that Series to
            # ``fillna`` fills each column with its own mode and leaves the
            # observed values untouched.
            df[self.boolean_features] = df[self.boolean_features].fillna(modes.iloc[0])
        return df

    def _preprocess_ordinal(self, df):
        """Placeholder for ordinal features; returns ``df`` unchanged."""
        return df

# Base classifier class
class Classifier(ABC):
    """Abstract interface shared by the classifiers that subclass it."""

    @abstractmethod
    def fit(self, X, y):
        """Fit the model with features ``X`` and target ``y``."""

    @abstractmethod
    def predict(self, X):
        """Make predictions on the dataset ``X``."""

# K-Nearest Neighbors Classifier
class KNearestNeighbors(Classifier):
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    Args:
        k: Number of neighbours that vote on each prediction.
    """

    def __init__(self, k=3):
        self.k = k
        self.X_train = None
        self.y_train = None
        # Sorted unique training labels; fixes the column order of
        # ``predict_proba``. Set by ``fit``.
        self.classes_ = None

    def fit(self, X, y):
        """Store the training data; KNN has no other training step.

        Args:
            X: Training features, one row per sample (DataFrame or 2-D array).
            y: Training labels, one per row of ``X``.
        """
        self.X_train = X
        self.y_train = y
        # Plain arrays make neighbour lookup positional, so the result does
        # not depend on the index labels of ``X`` / ``y``.
        self._train_points = np.asarray(X, dtype=float)
        self._train_labels = np.asarray(y)
        self.classes_ = np.unique(self._train_labels)

    def predict(self, X):
        """Return the majority label among the k nearest neighbours of each row.

        Ties are resolved in favour of the smallest label, because
        ``argmax`` returns the first maximum and classes are sorted.

        Returns:
            1-D numpy array with one label per row of ``X``.
        """
        shares = self._vote_shares(X)
        return self.classes_[shares.argmax(axis=1)]

    def predict_proba(self, X):
        """Return the fraction of the k nearest neighbours in each class.

        Returns:
            Array of shape ``(n_rows, n_classes)``; column ``j`` always
            refers to ``self.classes_[j]``.
        """
        return self._vote_shares(X)

    def _vote_shares(self, X):
        """Compute per-class neighbour fractions for every row of ``X``."""
        if self.classes_ is None:
            raise RuntimeError("KNearestNeighbors must be fitted before predicting")

        # Never ask for more neighbours than there are training samples.
        k = min(self.k, len(self._train_labels))
        if k < 1:
            raise ValueError("k must be at least 1 and the training set must not be empty")

        queries = np.asarray(X, dtype=float)
        shares = np.zeros((len(queries), len(self.classes_)))
        for row, point in enumerate(queries):
            # Squared distances rank neighbours exactly like Euclidean
            # distances, so the square root is not needed.
            squared = ((self._train_points - point) ** 2).sum(axis=1)
            nearest = np.argsort(squared, kind="stable")[:k]
            neighbour_labels = self._train_labels[nearest]
            shares[row] = (neighbour_labels[:, None] == self.classes_[None, :]).mean(axis=0)
        return shares

# Multilayer Perceptron Classifier
class MultilayerPerceptron():
    """Binary classifier with one ReLU hidden layer and a sigmoid output.

    Trained with full-batch gradient descent on the binary cross-entropy
    loss (gradients are summed over the batch, not averaged).

    Args:
        hidden_layers_sizes: ``[n_inputs, n_hidden, n_outputs]``. Only the
            first three entries are used. Defaults to ``[77, 10, 1]``.
        learning_rate: Step size of each gradient update.
        epochs: Number of full passes over the training data in ``fit``.
    """

    def __init__(self, hidden_layers_sizes=None, learning_rate=0.001, epochs=100):
        # ``None`` sentinel instead of a list default, so instances never
        # share (and accidentally mutate) the same list object.
        if hidden_layers_sizes is None:
            hidden_layers_sizes = [77, 10, 1]
        self.hidden_layers_sizes = list(hidden_layers_sizes)
        self.learning_rate = learning_rate
        self.epochs = epochs
        # w*/b* are the parameters; a0/o1/a1/o2 cache the last forward pass
        # for use in backpropagation.
        self.variables = {'w0': None, 'b0': None, 'w1': None, 'b1': None,
                          'o1': None, 'o2': None, 'a1': None, 'a0': None}
        self.X = None
        self.y = None

    def initialise_weights(self):
        """Draw the initial weights and biases from a fixed-seed generator.

        A private ``RandomState(1)`` yields the same numbers as seeding the
        global generator with 1, without disturbing NumPy's global random
        state for the rest of the program.
        """
        rng = np.random.RandomState(1)
        input_size = np.shape(self.X)[1]
        n_in = self.hidden_layers_sizes[0]
        n_hidden = self.hidden_layers_sizes[1]
        n_out = self.hidden_layers_sizes[2]

        # Scaled-normal initialisation for the hidden layer.
        self.variables["w0"] = rng.randn(n_in, n_hidden) * np.sqrt(2 / (input_size + n_in))
        self.variables["b0"] = rng.randn(n_hidden)

        # Scaled-normal initialisation for the output layer.
        self.variables["w1"] = rng.randn(n_hidden, n_out) * np.sqrt(2 / (n_in + n_hidden))
        self.variables["b1"] = rng.randn(n_out)

    def fit(self, X, y):
        """Train the network on features ``X`` and binary labels ``y``.

        Raises:
            ValueError: If ``X`` is not 2-D or its column count differs from
                ``hidden_layers_sizes[0]``.
        """
        self.X = X
        self.y = y

        features = np.asarray(X, dtype=float)
        expected = self.hidden_layers_sizes[0]
        if features.ndim != 2 or features.shape[1] != expected:
            raise ValueError(
                "X must be 2-D with {} columns (hidden_layers_sizes[0]); got shape {}".format(
                    expected, features.shape))

        self.initialise_weights()

        # One forward and one backward pass over the whole batch per epoch.
        for _ in range(self.epochs):
            self._backward_propagation(self._forward_propagation(features))

    def predict(self, X):
        """Return a list of 0/1 predictions, one per row of ``X``."""
        proba = np.asarray(self.predict_proba(X))
        return [int(np.round(p)) for p in proba[:, 0]]

    def predict_proba(self, X):
        """Return the sigmoid output for each row of ``X``.

        Returns:
            An ``(n_rows, 1)`` table of probabilities: a DataFrame indexed
            like ``X`` when ``X`` is a DataFrame, otherwise a numpy array.
        """
        features = np.asarray(X, dtype=float)
        a1 = self.relu_activation(features @ self.variables["w0"] + self.variables["b0"])
        proba = self._sigmoid_func(a1 @ self.variables["w1"] + self.variables["b1"])
        if isinstance(X, pd.DataFrame):
            return pd.DataFrame(proba, index=X.index)
        return proba

    def _forward_propagation(self, X):
        """Run a forward pass on ``X`` and cache the intermediate values."""
        inputs = np.asarray(X, dtype=float)

        o1 = inputs @ self.variables["w0"] + self.variables["b0"]  # hidden pre-activation
        a1 = self.relu_activation(o1)
        o2 = a1 @ self.variables["w1"] + self.variables["b1"]  # output pre-activation
        output = self._sigmoid_func(o2)

        # Stored for use in backpropagation.
        self.variables["a0"] = inputs
        self.variables["o1"] = o1
        self.variables["a1"] = a1
        self.variables["o2"] = o2

        return output

    def _backward_propagation(self, predicted_output):
        """Apply one gradient-descent update using the cached forward pass."""
        predicted = np.asarray(predicted_output, dtype=float)
        true_labels = np.asarray(self.y, dtype=float).reshape(predicted.shape)

        inputs = self.variables.get("a0")
        if inputs is None:
            inputs = np.asarray(self.X, dtype=float)
        a1 = np.asarray(self.variables["a1"], dtype=float)
        o1 = np.asarray(self.variables["o1"], dtype=float)

        # For a sigmoid output with cross-entropy loss, the product
        # dLoss/dOutput * sigmoid'(o2) simplifies to (prediction - label).
        # Using the closed form avoids dividing by probabilities near 0 or 1.
        output_delta = predicted - true_labels

        # Back-propagate through w1 *before* w1 is updated.
        hidden_gradient = output_delta @ self.variables["w1"].T
        weight2_gradient = a1.T @ output_delta
        bias2_gradient = np.sum(output_delta, axis=0)

        self.variables['w1'] -= weight2_gradient * self.learning_rate
        self.variables['b1'] -= bias2_gradient * self.learning_rate

        hidden_delta = self.relu_derivative(o1) * hidden_gradient
        weight1_gradient = inputs.T @ hidden_delta
        bias1_gradient = np.sum(hidden_delta, axis=0)

        self.variables['w0'] -= weight1_gradient * self.learning_rate
        self.variables['b0'] -= bias1_gradient * self.learning_rate

    def relu_activation(self, weighted_sum):
        """Element-wise ``max(0, x)``."""
        return np.maximum(0, weighted_sum)

    def relu_derivative(self, x):
        """Element-wise derivative of ReLU: 1 where ``x > 0``, else 0."""
        return np.where(x > 0, 1, 0)

    def _sigmoid_func(self, x):
        """Logistic sigmoid; the input is clipped so ``exp`` cannot overflow."""
        return 1 / (np.exp(-np.clip(x, -500, 500)) + 1)


class NaiveBayesClassifier(Classifier):
    """Naive Bayes with Gaussian numerical and Bernoulli binary features.

    Args:
        numerical_features: Column names modelled as Gaussians. Defaults to
            ``F1`` .. ``F17``.
        binary_features: Column names modelled as Bernoulli variables.
            Defaults to ``F18`` .. ``F77``.
        alpha: Laplace smoothing parameter for the class priors and the
            Bernoulli probabilities.
    """

    # Lower bound for a per-class standard deviation. It keeps the Gaussian
    # log-density finite when a feature is constant within a class (std 0)
    # or the class has a single sample (std undefined).
    _MIN_STD = 1e-9

    def __init__(self, numerical_features=None, binary_features=None, alpha=1):
        if numerical_features is None:
            numerical_features = ['F{}'.format(i) for i in range(1, 18)]
        if binary_features is None:
            binary_features = ['F{}'.format(i) for i in range(18, 78)]
        self.numerical_features = list(numerical_features)
        self.binary_features = list(binary_features)
        self.alpha = alpha  # Laplace smoothing parameter
        self.gaussian_params = {}  # (class, feature) -> (mean, std)
        self.bernoulli_params = {}  # (class, feature) -> P(feature == 1 | class)
        self.class_probs = {}  # class -> smoothed prior probability

    def fit(self, X, y):
        """Estimate class priors and per-class feature parameters.

        Args:
            X: DataFrame containing the numerical and binary feature columns.
            y: Class labels, one per row of ``X`` (matched by position).
        """
        labels = np.asarray(y)
        classes = sorted(set(labels.tolist()))
        num_samples = len(labels)
        num_classes = len(classes)

        # Start from a clean state so a second ``fit`` cannot keep classes
        # or parameters learned from earlier data.
        self.class_probs = {}
        self.gaussian_params = {}
        self.bernoulli_params = {}

        for label in classes:
            mask = labels == label
            count = int(mask.sum())
            # Laplace smoothing to avoid zero probabilities.
            self.class_probs[label] = (count + self.alpha) / (num_samples + num_classes * self.alpha)

            class_rows = X.loc[mask]

            # Gaussian parameters of the numerical features.
            means = class_rows[self.numerical_features].mean()
            stds = class_rows[self.numerical_features].std()
            for feature in self.numerical_features:
                std = stds[feature]
                if not np.isfinite(std) or std < self._MIN_STD:
                    std = self._MIN_STD
                self.gaussian_params[(label, feature)] = (means[feature], std)

            # Smoothed Bernoulli parameters of the binary features.
            positives = class_rows[self.binary_features].sum()
            for feature in self.binary_features:
                prob_true = (positives[feature] + self.alpha) / (count + 2 * self.alpha)
                self.bernoulli_params[(label, feature)] = prob_true

    def predict(self, X):
        """Return a list with the most probable class for every row of ``X``."""
        classes, scores = self._joint_log_likelihood(X)
        return [classes[best] for best in scores.argmax(axis=1)]

    def predict_proba(self, X):
        """Return posterior class probabilities for every row of ``X``.

        Returns:
            Array of shape ``(n_rows, n_classes)`` whose columns follow the
            sorted class labels seen in ``fit``.
        """
        _, scores = self._joint_log_likelihood(X)
        # Subtract the per-row maximum before exponentiating. Summed
        # log-likelihoods can be so negative that ``exp`` underflows to 0
        # for every class, which would turn the normalisation into 0/0.
        shifted = scores - scores.max(axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / exp_scores.sum(axis=1, keepdims=True)

    def _joint_log_likelihood(self, X):
        """Return ``(classes, scores)`` with ``scores[i, j] = log P(class_j, x_i)``."""
        classes = list(self.class_probs)
        numeric = X[self.numerical_features].to_numpy(dtype=float)
        binary = X[self.binary_features].to_numpy(dtype=float)

        scores = np.empty((len(X), len(classes)))
        for column, label in enumerate(classes):
            means = np.array([self.gaussian_params[(label, f)][0] for f in self.numerical_features], dtype=float)
            stds = np.array([self.gaussian_params[(label, f)][1] for f in self.numerical_features], dtype=float)
            probs = np.array([self.bernoulli_params[(label, f)] for f in self.binary_features], dtype=float)

            gaussian_log_prob = self.gaussian_log_probability(numeric, means, stds).sum(axis=1)
            bernoulli_log_prob = self.bernoulli_log_probability(binary, probs).sum(axis=1)
            scores[:, column] = np.log(self.class_probs[label]) + gaussian_log_prob + bernoulli_log_prob
        return classes, scores

    def gaussian_log_probability(self, x, mean, std):
        """Log-density of ``x`` under a normal distribution ``N(mean, std**2)``."""
        variance = np.maximum(std, self._MIN_STD) ** 2
        exponent = -((x - mean) ** 2) / (2 * variance)
        return exponent + -0.5 * np.log(2 * np.pi * variance)

    def bernoulli_log_probability(self, x, prob_true):
        """Log-probability of a binary ``x`` given ``P(x == 1) = prob_true``."""
        return (1 - x) * np.log(1 - prob_true) + x * np.log(prob_true)



# Function to evaluate the performance of the model
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on a labelled validation set.

    Args:
        model: Object with ``predict(X)`` and, optionally, ``predict_proba(X)``.
        X_test: Features to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision``, ``recall``,
        ``mcc`` and ``auc``. ``auc`` is NaN when the model has no
        ``predict_proba`` or when ``y_test`` contains fewer than two classes.
    """
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    precision = precision_score(y_test, predictions, zero_division=1)
    recall = recall_score(y_test, predictions)
    mcc = matthews_corrcoef(y_test, predictions)

    # Default so the returned dict is always complete. NaN rather than None
    # keeps the column numeric when the results are aggregated.
    auc = float('nan')

    if hasattr(model, 'predict_proba'):
        # Models may return a DataFrame or an array; normalise to an array
        # so the positional indexing below works for both.
        proba = np.asarray(model.predict_proba(X_test))
        n_classes = len(np.unique(y_test))

        if n_classes == 2:  # Binary classification
            if proba.ndim > 1 and proba.shape[1] > 1:
                # One column per class: column 1 is taken as the score of
                # the positive class.
                scores = proba[:, 1]
            else:
                # A single column (or a 1-D array) already holds the score.
                scores = proba.ravel()
            auc = roc_auc_score(y_test, scores)
        elif n_classes > 2:  # Multiclass classification
            auc = roc_auc_score(y_test, proba, multi_class='ovo')

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'mcc': mcc,
        'auc': auc
    }

# Main function to execute the pipeline
def main(train_path='/content/drive/My Drive/ML project/trainWithLabel.csv',
         test_path='/content/drive/My Drive/ML project/testWithoutLabel.csv'):
    """Cross-validate every model, then predict the unlabelled test set.

    Writes the per-fold and averaged metrics to ``cv_results.xlsx`` and the
    test predictions to ``test_results.csv`` in the working directory.

    Args:
        train_path: CSV file with the features and an ``Outcome`` column.
        test_path: CSV file with the features only.
    """
    # Load and preprocess the labelled training data.
    df = pd.read_csv(train_path)
    df_processed = Preprocessor(df).preprocess()

    # Define the models for classification.
    models = {'Naive Bayes': NaiveBayesClassifier(),
              'KNN': KNearestNeighbors(),
              'MLP': MultilayerPerceptron()
    }

    # Split into features and target. Both get a fresh 0..n-1 index so the
    # rows of X and y stay aligned however a model looks them up.
    X_train = df_processed.drop('Outcome', axis=1).reset_index(drop=True)
    y_train = df_processed['Outcome'].reset_index(drop=True)

    # Perform K-Fold cross-validation.
    kf = KFold(n_splits=10, random_state=42, shuffle=True)
    cv_results = []

    for model_name, model in models.items():
        for fold_idx, (train_index, val_index) in enumerate(kf.split(X_train), start=1):
            X_train_fold = X_train.iloc[train_index].reset_index(drop=True)
            X_val_fold = X_train.iloc[val_index].reset_index(drop=True)
            y_train_fold = y_train.iloc[train_index].reset_index(drop=True)
            y_val_fold = y_train.iloc[val_index].reset_index(drop=True)

            model.fit(X_train_fold, y_train_fold)
            fold_result = evaluate_model(model, X_val_fold, y_val_fold)
            fold_result['model'] = model_name
            fold_result['fold'] = fold_idx
            cv_results.append(fold_result)

    # Convert CV results to a DataFrame and calculate per-model averages.
    cv_results_df = pd.DataFrame(cv_results)
    avg_results = cv_results_df.groupby('model').mean(numeric_only=True).reset_index()
    avg_results['model'] += ' Average'
    all_results_df = pd.concat([cv_results_df, avg_results], ignore_index=True)

    # Adjust column order and display results.
    all_results_df = all_results_df[['model', 'accuracy', 'f1', 'precision', 'recall', 'mcc', 'auc']]

    print("Cross-validation results:")
    print(all_results_df)

    # Save results to an Excel file.
    all_results_df.to_excel('cv_results.xlsx', index=False)
    print("Cross-validation results with averages saved to cv_results.xlsx")

    # Load and preprocess the unlabelled test set.
    # NOTE(review): the test set is scaled and imputed with its own
    # statistics, not the training set's — confirm this is intended.
    df_ = pd.read_csv(test_path)
    X_test = Preprocessor(df_).preprocess()

    predictions = []

    for name, model in models.items():
        # After cross-validation each model holds the parameters of the last
        # fold only; refit on all labelled data before predicting.
        model.fit(X_train, y_train)
        model_predictions = model.predict(X_test)
        predictions.append({
            'model': name,
            # Plain lists are written to the CSV in full; a numpy array
            # would be written as its (possibly abbreviated) string form.
            'predictions': np.asarray(model_predictions).tolist()
        })

    # Convert the list of predictions into a DataFrame.
    predictions_df = pd.DataFrame(predictions)

    print("Model predictions:")
    print(predictions_df)

    # Save the predictions to a CSV file.
    predictions_df.to_csv('test_results.csv', index=False)
    print("Model predictions saved to test_results.csv")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


Cross-validation results:
                  model  accuracy        f1  precision    recall       mcc  \
0           Naive Bayes  0.805556  0.533333   0.500000  0.571429  0.412677   
1           Naive Bayes  0.805556  0.533333   0.500000  0.571429  0.412677   
2           Naive Bayes  0.694444  0.592593   0.500000  0.727273  0.377552   
3           Naive Bayes  0.777778  0.636364   0.700000  0.583333  0.482382   
4           Naive Bayes  0.611111  0.300000   0.333333  0.272727  0.034816   
5           Naive Bayes  0.805556  0.666667   0.636364  0.700000  0.531050   
6           Naive Bayes  0.805556  0.666667   0.875000  0.538462  0.571876   
7           Naive Bayes  0.666667  0.454545   0.454545  0.454545  0.214545   
8           Naive Bayes  0.885714  0.750000   0.666667  0.857143  0.686406   
9           Naive Bayes  0.742857  0.640000   0.800000  0.533333  0.474693   
10                  KNN  0.833333  0.250000   1.000000  0.142857  0.344046   
11                  KNN  0.750000  0.4