# Final Project: Perceptron Algorithm

## Name: <span style="color:blue"> *Josiah Hemphill* </span>

## Utils

In [4]:
from typing import List, Dict, Tuple, Callable
import os
import gc
import traceback
import warnings
from pdb import set_trace

import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
import pandas as pd

# Load "vgsales.csv" (path is relative to the current working directory)
# into a DataFrame. Presumably a video-game sales dataset -- confirm.
vgsales_df = pd.read_csv("vgsales.csv")
# Column labels of the loaded frame (a pandas Index, not a plain list).
feature_names = vgsales_df.columns

In [7]:
def binarize_classes(
    X: np.ndarray,
    y: np.ndarray,
    pos_class: List[int],
    neg_class: List[int]
) -> Tuple[np.ndarray, np.ndarray]:
    """ Converts data into a one-vs-all or one-vs-one problem
        according to labels passed as pos or neg.

        Samples whose label appears in neither list are dropped. The
        inputs are never modified.

        Args:
            X: Input data given as matrix

            y: Labels corresponding to input data given as 1D vector

            pos_class: list of labels that will be used for
                the positive class.

            neg_class: list of labels that will be used for
                the negative class.

        Returns:
            Two arrays where the 1st corresponds to the data
            given as a matrix (positive samples first, then negative
            samples) and the 2nd corresponds to the new labels
            (+1 / -1) given as a 1D vector.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # Boolean masks selecting the rows that belong to each side.
    pos_locs = np.isin(y, pos_class)
    neg_locs = np.isin(y, neg_class)

    # Build the +1/-1 labels as fresh arrays rather than overwriting a
    # slice of y: writing -1 into a string or unsigned array would
    # truncate/stringify or wrap the value. Signed numeric dtypes are kept
    # so numeric callers see the same dtype they passed in.
    if (np.issubdtype(y.dtype, np.signedinteger)
            or np.issubdtype(y.dtype, np.floating)):
        label_dtype = y.dtype
    else:
        label_dtype = int

    y_pos = np.full(np.count_nonzero(pos_locs), 1, dtype=label_dtype)
    y_neg = np.full(np.count_nonzero(neg_locs), -1, dtype=label_dtype)

    # Boolean-mask indexing already returns copies, so no explicit
    # copy of X is needed to protect the caller's array.
    return np.vstack([X[pos_locs], X[neg_locs]]), np.hstack([y_pos, y_neg])

In [8]:
from sklearn.model_selection import train_test_split
def get_train_valid_test_data(
    X: np.ndarray, 
    y: np.ndarray, 
):
    """ Shuffles the data and carves it into train, validation, and test sets.

        The split is done in two stages with the same settings: first
        80% / 20% into (train + validation) / test, then the 80% part is
        split 80% / 20% again into train / validation.

        Args:
            X: Data given as a 2D matrix

            y: Labels given as a vector

        Returns:
            The six arrays in the order
            X_trn, y_trn, X_vld, y_vld, X_tst, y_tst.
    """
    # Both stages keep 80% on the "train" side and use a fixed seed so the
    # partition is reproducible between runs.
    split_opts = {"train_size": .8, "random_state": 42}

    X_fit, X_tst, y_fit, y_tst = train_test_split(X, y, **split_opts)
    X_trn, X_vld, y_trn, y_vld = train_test_split(X_fit, y_fit, **split_opts)

    return X_trn, y_trn, X_vld, y_vld, X_tst, y_tst

In [9]:
from sklearn.preprocessing import StandardScaler
# Module-level scaler shared across calls; get_preprocessed_data refits it
# on the training split each time it runs.
scaler = StandardScaler()
def _prepend_bias(X: np.ndarray) -> np.ndarray:
    """ Returns X with a leading column of ones (the bias feature). """
    return np.hstack([np.ones((len(X), 1)), X])


def get_preprocessed_data(pos_class: List[int], neg_class: List[int]) -> Tuple[np.ndarray, ...]:
    """ Gets preprocessed data for training, validation, and testing

        Steps: binarize the labels, split into train/validation/test,
        standardize every split with statistics fitted on the training
        split only, and prepend a bias column of ones to each split.

        Side effect: refits the module-level `scaler`.

        Args:
            pos_class: list of labels that will be used for 
                the positive class.

            neg_class: list of labels that will be used for 
                the negative class.

        Return:
            A tuple of NumPy arrays where indices 0-1 
            contain the training data/targets, indices 2-3
            contain the validation data/targets, and 4-5
            contain the testing data/targets. Targets are 2D
            column vectors.
    """
    # NOTE(review): `iris` is not defined in the visible part of this file;
    # it must exist at module level (with pandas-like `.data` / `.target`)
    # before this function is called -- confirm.
    X, y = binarize_classes(iris.data.values, iris.target.values, pos_class, neg_class)

    X_trn, y_trn, X_vld, y_vld, X_tst, y_tst = get_train_valid_test_data(X, y)

    # Fit on the training split only so that no validation/test statistics
    # leak into the scaling.
    scaler.fit(X_trn)
    X_trn, X_vld, X_tst = (
        _prepend_bias(scaler.transform(split))
        for split in (X_trn, X_vld, X_tst)
    )

    # Reshape targets to be 2D column vectors
    return X_trn, y_trn.reshape(-1, 1), X_vld, y_vld.reshape(-1, 1), X_tst, y_tst.reshape(-1, 1)

In [10]:
def accuracy(y: np.ndarray, y_hat: np.ndarray) -> float:
    """ Computes the accuracy between two label vectors

        Args:
            y: Ground truth labels. Any array-like is accepted; it is
                flattened to a 1D vector before comparison.

            y_hat: Predicted labels. Any array-like is accepted; it is
                flattened to a 1D vector before comparison.

        Return:
            A float corresponding to the accuracy, i.e. the fraction of
            positions where y_hat equals y. NaN when the inputs are empty.

        Raises:
            ValueError: If y and y_hat do not contain the same number
                of elements.
    """
    # Flatten so that column vectors and 1D vectors compare element-wise
    # instead of broadcasting against each other.
    y = np.asarray(y).ravel()
    y_hat = np.asarray(y_hat).ravel()

    if y.shape != y_hat.shape:
        raise ValueError(
            f"y and y_hat must have the same number of elements, "
            f"got {y.size} and {y_hat.size}"
        )
    if y.size == 0:
        # Accuracy is undefined without samples; avoid a 0/0 warning.
        return np.float64(np.nan)

    total_correct = np.count_nonzero(y_hat == y)
    return np.float64(total_correct) / y.size

In [11]:
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(
    y: np.ndarray, 
    y_hat: np.ndarray, 
    class_name_key: Dict[int, str] = None
) -> pd.DataFrame:
    """ Plots a pretty and labeled version of Sklearn's confusion matrix

        Args:
            y: Ground truth labels given as a 1D vector

            y_hat: Predicted labels given as a 1D vector

            class_name_key: A dictionary where each key corresponds to 
                a label and the values corresponding to string name 
                for said label. This name will be displayed when plotting.
                Labels missing from the dictionary are shown as-is.

        Returns:
            A confusion matrix casted as a DataFrame (rows are true
            labels, columns are predicted labels).
    """
    y = y.flatten() # reshape to make 1D vector for consistency
    y_hat = y_hat.flatten() # reshape to make 1D vector for consistency

    # The matrix covers every label seen in y OR y_hat, so the axis labels
    # must too. Start from the labels in y and add any label that only
    # shows up in the predictions; otherwise the DataFrame construction
    # below would fail on a shape mismatch.
    labels = np.unique(y)
    unseen = np.setdiff1d(y_hat, labels)
    if unseen.size:
        labels = np.union1d(labels, unseen)

    # Passing labels explicitly pins the row/column order to `labels`.
    cfm = confusion_matrix(y, y_hat, labels=labels)

    if class_name_key is not None:
        names = [class_name_key.get(label, label) for label in labels]
    else:
        names = labels

    cfm_df = pd.DataFrame(cfm, index=names, columns=names)
    sns.heatmap(cfm_df, annot=True)

    return cfm_df

In [12]:
def ppv(y: np.ndarray, y_hat: np.ndarray) -> float:
    """ Compute the PPV (positive predictive value), a.k.a. precision

        PPV = TP / (TP + FP)

        Args:
            y: Ground truth labels given as a 1D vector

            y_hat: Predicted labels given as a 1D vector

        Returns:
            A float corresponding to the PPV value.
    """
    # Flatten both inputs so column vectors are treated as 1D vectors.
    truth = y.flatten()
    predicted = y_hat.flatten()

    # For a binary problem the flattened 2x2 matrix reads (tn, fp, fn, tp);
    # the unpacking requires exactly four entries.
    _, false_pos, _, true_pos = confusion_matrix(truth, predicted).ravel()
    return true_pos / (true_pos + false_pos)

In [13]:
def tpr(y: np.ndarray, y_hat: np.ndarray) -> float:
    """ Compute the TPR (true positive rate), a.k.a. recall

        TPR = TP / (TP + FN)

        Args:
            y: Ground truth labels given as a 1D vector

            y_hat: Predicted labels given as a 1D vector

        Returns:
            A float corresponding to the TPR value.
    """
    # For a binary problem the flattened 2x2 matrix reads (tn, fp, fn, tp);
    # the unpacking requires exactly four entries.
    _, _, false_neg, true_pos = confusion_matrix(y, y_hat).ravel()
    return true_pos / (true_pos + false_neg)

In [14]:
def tnr(y: np.ndarray, y_hat: np.ndarray) -> float:
    """ Compute the TNR (true negative rate), a.k.a. specificity

        TNR = TN / (TN + FP)

        Args:
            y: Ground truth labels given as a 1D vector

            y_hat: Predicted labels given as a 1D vector

        Returns:
            A float corresponding to the TNR value.
    """
    # For a binary problem the flattened 2x2 matrix reads (tn, fp, fn, tp);
    # the unpacking requires exactly four entries.
    true_neg, false_pos, _, _ = confusion_matrix(y, y_hat).ravel()
    return true_neg / (true_neg + false_pos)

In [15]:
class Perceptron():
    """ Performs binary classification using Rosenblatt's perceptron

        Labels are expected to be +1 / -1.

        Attributes:

            alpha: learning rate or step size

            epochs: Maximum number of passes over the training data.
                Weights are updated one sample at a time, and training
                stops early after a pass with no misclassified samples.

            seed: Seed passed to np.random.seed() before the weights
                are initialized.

            w: Vector of weights (None until fit is called)

            trn_acc: List that stores training accuracy for each epoch.

            vld_acc: List that stores validation accuracy for each epoch
                (stays empty when no validation data is given to fit).
    """
    def __init__(
        self, 
        alpha: float,
        seed: int = 0,
        epochs: int = 1,
    ):
        self.alpha = alpha
        self.epochs = epochs
        self.seed = seed
        self.w = None
        self.trn_acc = None
        self.vld_acc = None

    def fit(
         self, X: np.ndarray, 
         y: np.ndarray, 
         X_vld: np.ndarray=None, 
         y_vld: np.ndarray=None
     ) -> "Perceptron":
        """ Train the perceptron to find optimal weights

            Args:
                X: Training data given as a 2D matrix

                y: Training labels (+1 / -1) given as a 2D column
                    vector or a 1D vector

                X_vld: Optional validation data given as a 2D matrix

                y_vld: Optional validation labels. Validation accuracy
                    is tracked only when both X_vld and y_vld are given.

            Returns:
                The class's own object reference.

            Raises:
                ValueError: If X and y disagree on the number of samples.
        """
        np.random.seed(self.seed) # Set seed for reproducibility
        self.trn_acc = []
        self.vld_acc = []

        # Work with scalar labels inside the loop regardless of whether
        # y came in as a column vector or a 1D vector.
        y_flat = np.asarray(y).ravel()
        if y_flat.shape[0] != X.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} samples but y has {y_flat.shape[0]} labels"
            )
        track_vld = X_vld is not None and y_vld is not None

        self.w = np.random.rand(X.shape[1])
        for _ in range(self.epochs):
            misclassified = 0
            for x_m, y_m in zip(X, y_flat):
                # np.sign is deliberately kept here: an activation of
                # exactly zero matches neither label, so a sample sitting
                # on the boundary always triggers an update.
                if np.sign(x_m @ self.w) != y_m:
                    misclassified += 1
                    self.w = self.w + self.alpha * y_m * x_m

            self.trn_acc.append(accuracy(y, self.predict(X)))
            if track_vld:
                self.vld_acc.append(accuracy(y_vld, self.predict(X_vld)))

            # A full pass without mistakes means the weights no longer change.
            if misclassified == 0:
                break
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """ Make predictions using learned weights

            Args:
                X: Testing data given as a 2D matrix

            Returns:
                A 1D vector with one prediction (+1.0 or -1.0) for each
                data sample in X
        """
        z = X @ self.w
        # Ties (z == 0) are assigned to the positive class so the output is
        # always a valid label; np.sign would return 0 there.
        return np.where(z >= 0, 1.0, -1.0)