## Machine Learning Project 2

In [1]:
# scikit-learn pereptron and adaline implementations
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from itertools import combinations
from typing import Optional

In [2]:
data = pd.read_csv('/Users/jaxen1/Shared/C/Machine Learning/project2/project_adult.csv')
validation = pd.read_csv('/Users/jaxen1/Shared/C/Machine Learning/project2/project_validation_inputs.csv')

## Data Preprocessing

In [3]:
# Function to preprocess data
def preprocess_data(df):
    # Take out educational since is ordinal, ordinal var is already in dataset
    df = df.drop(columns = ['Unnamed: 0', 'education'])
    # Handle missing values
    df = df.dropna()
    # Replace all values unknown with most common ('Private')
    df['workclass'] = df['workclass'].apply(lambda x: df['workclass'].value_counts().index[0] if x == '?' else x)

    # TRY EXCEPT SO VALIDATION SET CAN PASS
    # Binarize the target variable
    try:
        df['income'] = df['income'].apply(lambda x: 1 if x == '>50K' else 0)
    except:
        pass

    # keep track of rows indexes to connect X and y
    df = df.reset_index()

    # Deal with Categorical Values

    # initialize encoder
    encoder = OneHotEncoder(sparse_output=False)
    # pull categorical cols
    categorical_cols = df.select_dtypes(include='object').columns
    # create encoded array w one hot columns
    encoded_array = encoder.fit_transform(df[categorical_cols])
    # Get new column names
    encoded_cols = encoder.get_feature_names_out(categorical_cols)
    # Creae new df of encoded columns
    encoded_df = pd.DataFrame(encoded_array, columns=encoded_cols, index=df.index)
    # Drop the original cat cols, add the one hot encoded one
    df = df.drop(categorical_cols, axis=1)
    df = pd.concat([df, encoded_df], axis=1)
    df = df.drop('index', axis = 1)

    # Ensure all columns are numeric after one-hot encoding
    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # TRY EXCEPTS SO THE VALIDATION SET CAN PASS WITH NO Y VARIABLE
    # Separate features and target
    try:
        X = df.drop(columns=['income'], axis=1)
    except:
        X = df
    try:
        y = df['income']
    except:
        y = None

    # Standardize numerical features
    numeric_cols = ['age','fnlwgt','capital-gain','capital-loss', 'hours-per-week']
    scaler = StandardScaler()
    # Ensure numeric columns exist before scaling
    numeric_cols_exist = [col for col in numeric_cols if col in X.columns]
    if numeric_cols_exist:
        X[numeric_cols_exist] = scaler.fit_transform(X[numeric_cols_exist])


    return X, y

In [4]:
# Run function on both datasets
X, y = preprocess_data(data)
X_validation, _ = preprocess_data(validation)
# Ensure validation set has same columns as training set and in right order
missing_cols = [
    'native-country_Holand-Netherlands',
    'native-country_Outlying-US(Guam-USVI-etc)',
    'workclass_Never-worked'
]
for col in missing_cols:
    X_validation[col] = 0
X_validation = X_validation.reindex(columns=X.columns, fill_value=0)
# Split the data 80-20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)


## MLP

This will need to be edited

In [5]:
class SimpleMLP:

    def __init__(self, num_features, num_hidden, num_classes, weight_h: Optional[np.ndarray] = None, bias_h: Optional[np.ndarray] = None, weight_out: Optional[np.ndarray] = None, bias_out: Optional[np.ndarray] = None, random_seed=17):
        super().__init__()
        """
        Simple MLP Classifier with 1 hidden layer

        Parameters
        ------------
        num_features : int
          number of features in data
        num_hidden : int
          number of nodes in the hidden layer
        num_classes : int
          number of classes in response variable
        weight_h : array
          array of weights for hidden layer weights
        bias_h : array
          array of weights for hidden layer bias
        weight_out : array
          array of weights for output layer weights
        bias_out : array
          array of weights for output layer bias
        random_seed : int
          seed to use for random number generation for weights    
        """
        self.num_classes = num_classes
        
        # hidden
        if weight_h is None:
            print("creating random weights for weight_h")
            rng = np.random.RandomState(random_seed)
            
            self.weight_h = rng.normal(
                loc=0.0, scale=0.1, size=(num_hidden, num_features))
            self.bias_h = np.zeros(num_hidden)
        else: 
            self.weight_h = weight_h

        if bias_h is None:
            print("creating random weights for bias_h")
            self.bias_h = np.zeros(num_hidden)
        else: 
            self.bias_h = bias_h
        
        # output
        if weight_out is None:
            print("creating random weights for weight_out")
            self.weight_out = rng.normal(
                loc=0.0, scale=0.1, size=(num_classes, num_hidden))
        else: 
            self.weight_out = weight_out

        if bias_out is None:
            print("creating random weights for bias_out")
            self.bias_out = np.zeros(num_classes)
        else: 
            self.bias_out = bias_out
    
    """
    USE DIFFERENT ACTIVATION FUNCTIONS

    COMPARE:
    IDENTITY
    SIGMOID
    SOFTMAX
    RELU
    TANH
    """
    
    def activation(self, X):
        """Compute linear activation"""
        return X
        
    def forward(self, x):
        # Hidden layer
        # input dim: [n_examples, n_features] dot [n_hidden, n_features].T
        # output dim: [n_examples, n_hidden]
        z_h = np.dot(x, self.weight_h.T) + self.bias_h
        #print(f"z_h = {x} dot {self.weight_h.T} + {self.bias_h}")
        a_h = self.activation(z_h)
        #print(f"a_h: {a_h}")

        # Output layer
        # input dim: [n_examples, n_hidden] dot [n_classes, n_hidden].T
        # output dim: [n_examples, n_classes]
        z_out = np.dot(a_h, self.weight_out.T) + self.bias_out
        #print(f"z_out = {a_h} dot {self.weight_out.T} + {self.bias_out}")
        a_out = self.activation(z_out)
        #print(f"a_out: {a_out}")
        return a_h, a_out

    def backward(self, x, a_h, a_out, y_onehot):  

        # assuming onehot for y
  
        # Part 1: dLoss/dOutWeights
        ## = dLoss/dOutAct * dOutAct/dOutNet * dOutNet/dOutWeight
        ## where DeltaOut = dLoss/dOutAct * dOutAct/dOutNet
        ## for convenient re-use
        
        # input/output dim: [n_examples, n_classes]
        d_loss__d_a_out = (a_out - y_onehot)

        # input/output dim: [n_examples, n_classes]
        d_a_out__d_z_out = 1

        # [n_examples, n_hidden]
        d_z_out__dw_out = a_h
        
        # gradient for output weights
        # want output dim: [n_classes, n_hidden]
        # current have [n_examples, n_classes] * [n_examples, n_classes] * [n_examples, n_hidden], which isn't possible. 
        # combine matrix with same [n_examples, n_classes]
        # output dim: [n_examples, n_classes]
        delta_out = d_loss__d_a_out * d_a_out__d_z_out # "delta (rule) placeholder"

        # input dim: [n_classes, n_examples] dot [n_examples, n_hidden]
        # output dim: [n_classes, n_hidden]
        d_loss__dw_out = np.dot(delta_out.T, d_z_out__dw_out)

        # gradient for output bias
        d_loss__db_out = np.sum(delta_out, axis=0)
        

        #################################        
        # Part 2: dLoss/dHiddenWeights
        ## = DeltaOut * dOutNet/dHiddenAct * dHiddenAct/dHiddenNet * dHiddenNet/dWeight
        
        # [n_classes, n_hidden]
        d_z_out__a_h = self.weight_out
        
        # output dim: [n_examples, n_hidden]
        d_loss__a_h = np.dot(delta_out, d_z_out__a_h)
        
        # [n_examples, n_hidden]
        d_a_h__d_z_h = 1
        
        # [n_examples, n_features]
        d_z_h__d_w_h = x
        
        # output dim: [n_hidden, n_features]
        d_loss__d_w_h = np.dot((d_loss__a_h * d_a_h__d_z_h).T, d_z_h__d_w_h)
        d_loss__d_b_h = np.sum((d_loss__a_h * d_a_h__d_z_h), axis=0)

        return (d_loss__dw_out, d_loss__db_out, d_loss__d_w_h, d_loss__d_b_h)

In [6]:
def mse_loss(onehot_targets, probas):
    #print(f"onehot_targets = {onehot_targets}")
    #print(f"probas = {probas}")
    mse = np.mean((onehot_targets - probas)**2)
    return mse

In [9]:
y_test

11858    0
4818     0
19478    1
5539     0
20738    0
        ..
3746     0
14618    0
17947    0
11690    0
21290    1
Name: income, Length: 5210, dtype: int64

In [10]:
encoder = OneHotEncoder(sparse_output=False)
y_hot_train = encoder.fit_transform(y_train.values.reshape(-1, 1))
y_hot_test = encoder.transform(y_test.values.reshape(-1, 1))
x = np.array([[1, 2]])
bias_h = np.array([[0.1, 0.1]])
bias_out = np.array([[0.2, 0.3]])

In [18]:
def train(model, X_train, X_test, y_onehot_train, y_onehot_test, num_epochs, num_classes, learning_rate=0.01):

    """
    Training Simple MLP modle using entire data set (no train/test split)
    """
  
    epoch_loss_train = []
    epoch_loss_test = []
    
    for e in range(num_epochs):
        print(f"****Epoch: {e}****")

        #### Compute outputs ####
        a_h, a_out = model.forward(X_train)

        #### Compute gradients ####
        d_loss__d_w_out, d_loss__d_b_out, d_loss__d_w_h, d_loss__d_b_h = model.backward(X_train, a_h, a_out, y_onehot_train)

        #### Update weights ####
        model.weight_h -= learning_rate * d_loss__d_w_h
        model.bias_h -= learning_rate * d_loss__d_b_h
        model.weight_out -= learning_rate * d_loss__d_w_out
        model.bias_out -= learning_rate * d_loss__d_b_out

        ### Look at test dataset ###
        _, a_out_test = model.forward(X_test)
    
        #### Epoch Logging ####        
        mseTrain = mse_loss(y_onehot_train, a_out)
        mseTest = mse_loss(y_onehot_test, a_out_test)
        epoch_loss_train.append(mseTrain)
        epoch_loss_test.append(mseTest)
        print(f'MSE Train: {mseTrain:.1f}')
        print(f'MSE Test: {mseTest:.1f}')

    return epoch_loss_train, epoch_loss_test



In [19]:
model = SimpleMLP(num_features=len(X_train.columns), num_hidden=2, num_classes=2)

creating random weights for weight_h
creating random weights for bias_h
creating random weights for weight_out
creating random weights for bias_out


In [20]:
epoch_loss_train, epoch_loss_test = train(model, X_train, X_test, y_hot_train, y_hot_test, num_epochs=3, learning_rate=0.01, num_classes = 2)

****Epoch: 0****
MSE Train: 0.4
MSE Test: 10237658.5
****Epoch: 1****
MSE Train: 10307049.6
MSE Test: 100110489674944197927835611879702528.0
****Epoch: 2****
MSE Train: 99505957628026404988086907855437824.0
MSE Test: 94217598234864560123428139251271320708185420661012760012049102121676318939118083897014709384849676620960792948168982528.0


## Reflection and Conceptual Questions

### Why did you choose the specific architecture (e.g., number of layers, activation functions) for each model?

### How did you monitor and mitigate overfitting in your models?

### What ethical concerns might arise from deploying models trained on these datasets?

### Why are activation functions necessary in neural networks?