In [None]:
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.neural_network import MLPClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

from keras.models import Sequential
from keras.layers import Dense, Layer
from tensorflow import keras

from sklearn.base import BaseEstimator, TransformerMixin
from statistics import mean

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as snsb
import collections, functools, operator
import tensorflow as tf

In [None]:
# Mount Google Drive in the Colab runtime so the dataset file below is reachable.
from google.colab import drive
drive.mount('/content/drive')
# Read the wine dataset into a DataFrame.
# NOTE(review): read_csv treats the first line of the file as a header by default.
# If wine.data has no header row, the first sample is consumed as column labels --
# confirm against the file, and pass header=None if that is the case.
data = pd.read_csv('/content/drive/MyDrive/ECE_449/2/wine.data')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Making a Pipeline
# Deprecated
class LoadData(BaseEstimator, TransformerMixin):
  """
    Class to mount Google Drive and read the dataset file into a DataFrame.

    Parameters
    ``````````
    path (str) : CSV file to read. Defaults to the wine dataset location used
                 by this notebook.
    mount_point (str) : Directory at which Google Drive is mounted.

    ...
    Methods
    ```````
    fit : returns self, mainly used for fit_transform
    transform : mounts the drive and returns the contents of `path`
  """
  def __init__(self, path = '/content/drive/MyDrive/ECE_449/2/wine.data',
               mount_point = '/content/drive'):
    # scikit-learn convention: store constructor arguments unchanged under the same name.
    self.path = path
    self.mount_point = mount_point

  def fit(self, X, y=None):
    return self

  def transform(self, X):
    """ Mounts Google Drive and loads the dataset.

      Parameters
      ``````````
      X : Ignored. Only present so the class fits the transformer interface.

      Returns
      ```````
      (DataFrame) : Contents of the file at `self.path`
    """
    # Imported here so the class does not rely on another notebook cell having
    # already imported `drive`.
    from google.colab import drive
    drive.mount(self.mount_point)
    # NOTE(review): read_csv treats the first line as a header by default; if the
    # file has no header row the first sample is lost -- confirm against the file.
    return pd.read_csv(self.path)

class MeanImputer(BaseEstimator, TransformerMixin):
  """
    Class to impute column means into cells that have no value ("nan").

    ...
    Methods
    ```````
    fit : learns the per-column means and returns self
    transform : fills missing values with the learned means
  """
  def fit(self, X, y=None):
    """ Learns the mean of every numeric column.

      Parameters
      ``````````
      X (DataFrame) : Dataset the means are computed from
      y : Ignored.

      Returns
      ```````
      self
    """
    # numeric_only=True keeps this working when X also holds non-numeric
    # columns, for which a mean is undefined.
    self.means_ = X.mean(numeric_only=True)
    return self

  def transform(self, X):
    """ Fills missing values with column means

      Parameters
      ``````````
      X (DataFrame) : Dataset to be data imputed

      Returns
      ```````
      X (DataFrame) : data imputed Dataset (a new DataFrame; the input is not modified)
    """
    means = getattr(self, "means_", None)
    if means is None:
      # Not fitted yet: fall back to the means of the data being transformed,
      # which is what this class did before fit learned anything.
      means = X.mean(numeric_only=True)
    return X.fillna(means)

class FormatClass(BaseEstimator, TransformerMixin):
  """
    Class to give the columns of the dataset names

    ...
    Methods
    ```````
    fit :  returns self, mainly used for fit_transform
    transform : will assign names to columns.
  """
  # Target column first, followed by the feature columns, in file order.
  COLUMN_NAMES = ["Class", "Alcohol", "Malic Acid", "Ash", "Alcalinity of Ash", "Magnesium", "Total Phenols",
                  "Flavanoids", "Nonflavanoid Phenols", "Proanthocyanins", "Color Intensity", "Hue",
                  "OD280/OD315 of diluted wines", "Proline"]

  def fit(self, X, y=None):
    return self

  def transform(self, X):
    """ Assigns names to the columns

      Parameters
      ``````````
      X (DataFrame) : Dataset with columns to be renamed. Must have exactly as
                      many columns as COLUMN_NAMES has entries; otherwise pandas
                      raises a ValueError.

      Returns
      ```````
      X (DataFrame) : Copy of the dataset with renamed columns
    """
    # Work on a copy so the caller's DataFrame keeps its original labels and
    # re-running the cell gives the same result.
    X = X.copy()
    X.columns = list(self.COLUMN_NAMES)
    return X

class Scaler(BaseEstimator, TransformerMixin):
  """
    Class to MinMax Scale the dataset except for the target values

    The feature columns are the contiguous range from "Alcohol" to "Proline"
    (inclusive); every other column is passed through untouched.

    ...
    Methods
    ```````
    fit : learns the min/max of each feature column and returns self
    transform : will MinMax Scale the feature columns in dataset.
  """
  def fit(self, X, y=None):
    """ Learns the scaling range of the feature columns.

      Parameters
      ``````````
      X (DataFrame) : Dataset containing the columns "Alcohol" .. "Proline"
      y : Ignored.

      Returns
      ```````
      self
    """
    # Label slice (not a two-item list) so that every column between the two
    # names is included, not only "Alcohol" and "Proline" themselves.
    self.scaler_ = MinMaxScaler().fit(X.loc[:, "Alcohol":"Proline"])
    return self

  def transform(self, X):
    """ MinMax scales the dataset except for the target columns

      Parameters
      ``````````
      X (DataFrame) : Dataset to be scaled

      Returns
      ```````
      X (DataFrame) : Scaled Dataset (a new DataFrame; the input is not modified)
    """
    features = X.loc[:, "Alcohol":"Proline"]
    scaler = getattr(self, "scaler_", None)
    if scaler is None:
      # Not fitted yet: fall back to scaling by the range of the data being
      # transformed, which is what this class did before fit learned anything.
      scaler = MinMaxScaler().fit(features)

    X = X.copy()
    # Whole-column assignment replaces the columns, so integer-typed features
    # become float instead of having floats written into integer storage.
    X[list(features.columns)] = scaler.transform(features)
    return X

class FeatureEncoder(BaseEstimator, TransformerMixin):
  """
    Class to One-hot-encode the dataset

    ...
    Methods
    ```````
    fit : returns self, mainly used for fit_transform
    transform : will encode "Class" column of the dataset

  """
  def fit(self, X, y=None):
    return self

  def transform(self, X):
    """ Creates a OneHotEncoder object, separates the "Class" column from the
        rest of the dataset, one-hot encodes it into one 'Class_<value>' column
        per distinct class value (e.g. 'Class_1', 'Class_2', 'Class_3'), then
        joins the encoded columns back onto the remaining columns.

    Parameters
    ``````````
    X (DataFrame) : Dataset to be one-hot encoded. Must contain a "Class" column.

    Returns
    ```````
    X (DataFrame) : One-hot encoded dataset (a new DataFrame; the input is not
                    modified). The "Class" column is replaced by the encoded
                    columns, appended after the other columns.
    """
    encoder = OneHotEncoder(sparse_output = False)
    # NOTE(review): the encoder is fitted on whatever data is passed in, so the
    # set of output columns depends on which class values are present in X.
    encoded_cat = encoder.fit_transform(X[["Class"]])

    # One output column per category, in the order the encoder emits them.
    column_name = ["Class_{}".format(category) for category in encoder.categories_[0]]

    # Reuse X's index: join aligns on index labels, so a default 0..n-1 index
    # would misalign (and produce NaN) whenever X is not indexed 0..n-1.
    one_hot_features = pd.DataFrame(encoded_cat, columns = column_name, index = X.index)
    return X.drop(columns = ["Class"]).join(one_hot_features)

class argmax_layer(Layer):
  """
    Class to create Argmax_layer on the MLP

    Returns, for each input row, the index of the largest value along `axis`.

    Parameters
    ``````````
    axis (int), default = 1 : Axis along which the argmax is taken.
    **kwargs : Forwarded to the base Layer constructor.
  """
  def __init__(self, axis = 1, **kwargs):
    # Forward the base-class keyword arguments so the layer can be constructed
    # with them, including when it is rebuilt from its config.
    super().__init__(**kwargs)
    self.axis = axis

  def call(self, inputs):
    return tf.math.argmax(inputs, axis=self.axis)

  def get_config(self):
    """ Returns the layer configuration, including `axis`. """
    config = super().get_config()
    config.update({"axis": self.axis})
    return config

In [None]:
class Classifier(BaseEstimator, TransformerMixin):
  """
  Creates and trains classifier on the Stratified K Fold

  After `fit`, the trained network is available as `model_`, the winning
  hyper-parameters as `best_params_` and the preferred number of folds as
  `best_k_`.

  ...
  Methods
  ```````
  make_model:
    makes desired Sequential model
  evaluate:
    Evaluates model
  optimal:
    Searches the hyper-parameter grid and returns the best (untrained) model
  fit:
    To fit the dataset with model and to find the optimal model
  transform:
    returns its input unchanged
  """

  # Training settings shared by cross-validation and the final fit.
  EPOCHS = 150
  BATCH_SIZE = 10
  # Values of k (number of folds) tried for StratifiedKFold.
  K_FOLD_RANGE = range(2, 10)

  def _prepare(self, data):
    """ One-hot encodes the target, scales the features and splits them apart.

    Parameters
    ``````````
    data (DataFrame): Non-one hot encoded dataset with a "Class" column and
                      feature columns "Alcohol" .. "Proline".

    Returns
    ```````
    X (ndarray) : feature matrix, one row per sample
    y (ndarray) : one-hot encoded targets taken from 'Class_1' .. 'Class_3'
    """
    pipe = Pipeline([
      ("encoder", FeatureEncoder()),
      ("scale", Scaler())
    ])
    # Hand the pipeline a copy so the caller's DataFrame is never modified.
    prepared = pipe.fit_transform(data.copy())
    # Plain arrays so that fold indices select rows (indexing a DataFrame with
    # an integer array selects columns instead).
    X = prepared.loc[:, 'Alcohol' : 'Proline'].to_numpy(dtype = "float32")
    y = prepared.loc[:, 'Class_1' : 'Class_3'].to_numpy(dtype = "float32")
    return X, y

  # neuron must hold at least `layer` values; the first index is the first hidden layer.
  def make_model(self, layer = 1, neuron = None, lr = 0.0, n_features = 13):
    """ Makes a model with the given values of layers, neurons, and learning rate.
        The last layer will have 3 neurons with softmax activation.

        No argmax layer is appended: argmax has no gradient and turns the
        (n, 3) output into shape (n,), so a model ending in it cannot be
        trained against one-hot targets with categorical cross-entropy. Take
        the argmax of the model's predictions to get a single class index.

    Parameters
    ``````````
    layer : int, default = 1
             decides how many hidden layers the model will have

    neuron : int list, default = None (treated as an empty list)
            list of neuron values. The first index is the first hidden layer,
            the next index the next layer, and so on. Only the first `layer`
            values are used.

    lr : float64, default = 0.0
          Value for the learning rate of the model. With the default of 0.0
          the weights do not change during training, so pass a positive value.

    n_features : int, default = 13
          Number of input columns (the feature columns "Alcohol" .. "Proline").

    Returns
    ```````
    model (Sequential) : Compiled model made with the parameters given, with a
                          softmax output layer.

    Raises
    ``````
    ValueError : if `neuron` has fewer entries than `layer`.
    """
    hidden_units = list(neuron or [])[:max(layer, 0)]
    if len(hidden_units) < layer:
      raise ValueError("neuron must provide at least {} values, got {}".format(layer, len(hidden_units)))

    model = Sequential()
    # Only the first layer added to the model declares the input size.
    input_kwargs = {"input_dim": n_features}
    print("{} layer and we are adding neurons {}".format(len(hidden_units), hidden_units))
    for units in hidden_units:
      model.add(Dense(units, activation = 'relu', **input_kwargs))
      input_kwargs = {}

    print("Making output layer and compile")
    model.add(Dense(3, activation = "softmax", **input_kwargs))
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr),loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

  def evaluate(self, model, data):
    """ Will evaluate the performance of MLP model and the stratified K Fold in
        a range of k values.

    Parameters
    ``````````
    model (Sequential): compiled MLP model we are evaluating. Its weights are
                        reset to their incoming values before every fold.
    data (DataFrame): Non-one hot encoded dataset.

    Returns
    ````````
    mean_results (float) : mean of the per-k scores in results_kfold_mean
    results_kfold_mean (dict) : maps str(k) to the mean test accuracy (in
                          percent) over the k folds
    """
    X, y = self._prepare(data)
    # StratifiedKFold needs one class label per sample; a one-hot matrix is
    # rejected as 'multilabel-indicator'. Recover the label index per row.
    labels = y.argmax(axis = 1)

    # Snapshot so every fold starts from the same untrained weights; otherwise
    # a fold would be tested on samples the model already trained on.
    # NOTE(review): the optimizer's internal state is not reset between folds.
    initial_weights = model.get_weights()

    results_kfold_mean = {}
    for k in self.K_FOLD_RANGE:
      skf = StratifiedKFold(n_splits = k)
      fold_scores = []
      for train, test in skf.split(X, labels):
        model.set_weights(initial_weights)
        model.fit(X[train], y[train], epochs = self.EPOCHS, batch_size = self.BATCH_SIZE, verbose = 0)
        # evaluate returns [loss, accuracy]; keep the accuracy as a percentage.
        scores = model.evaluate(X[test], y[test], verbose = 0)
        fold_scores.append(scores[1] * 100)
      results_kfold_mean[str(k)] = mean(fold_scores)

    mean_results = mean(list(results_kfold_mean.values()))
    return mean_results, results_kfold_mean

  def optimal(self, X):
    """
      Finds optimal model and K value of stratified K Fold

      Every combination of layer count, neuron widths and learning rate is
      cross-validated with `evaluate`; the combination with the highest mean
      score wins. The per-k scores of all combinations are summed, and the k
      with the largest total is stored as `best_k_`.

      Parameters
      ``````````
      X (DataFrame) : dataset to train the model with (not one-hot encoded)

      Returns
      ```````
      ideal_model (Sequential) : freshly built, untrained model using the
                                 winning combination
    """
    n_features = X.loc[:, 'Alcohol' : 'Proline'].shape[1]

    num_layers = [1,2,3]
    neurons_val = [32,64,128]
    learning_rate_arr = [0.001, 0.01, 0.1]

    # (mean score, [layer, neuron, rate]) for every combination tried. A list
    # is used so that two combinations with the same score do not overwrite
    # each other.
    candidates = []

    # Sum of the per-k mean scores over all combinations.
    kfold_totals = collections.Counter()

    # Parameter Exploration
    for layer in num_layers:
      for i in range(len(neurons_val)):
        # Widths descend from neurons_val[i] down to neurons_val[0]; when that
        # is shorter than the layer count, pad with the largest width.
        neuron = neurons_val[i::-1]
        neuron = neuron + [max(neuron)] * max(0, layer - len(neuron))
        neuron = neuron[:layer]
        for rate in learning_rate_arr:
          print(neuron)
          model = self.make_model(layer = layer, neuron = neuron, lr = rate, n_features = n_features)
          mean_results, kfold_dict = self.evaluate(model, X)
          candidates.append((mean_results, [layer, list(neuron), rate]))
          # Counter.update adds values for keys that already exist.
          kfold_totals.update(kfold_dict)

    print("final dict: ", str(dict(kfold_totals)))
    self.best_k_ = int(max(kfold_totals, key = kfold_totals.get))

    best_score, arr_optimal = max(candidates, key = lambda candidate: candidate[0])
    self.best_params_ = {"layer": arr_optimal[0], "neuron": arr_optimal[1], "lr": arr_optimal[2]}

    ideal_model = self.make_model(layer = arr_optimal[0], neuron = arr_optimal[1], lr = arr_optimal[2],
                                  n_features = n_features)
    return ideal_model

  def fit(self, X, y=None):
    """ Searches for the best model, then trains it on the whole dataset.

    Parameters
    ``````````
    X (DataFrame) : Non-one hot encoded dataset with a "Class" column
    y : Ignored. The targets are taken from the "Class" column of X.

    Returns
    ```````
    self, with the trained network stored in `model_`
    """
    model = self.optimal(X)

    # The incoming frame still holds the raw "Class" column, so it has to be
    # encoded and scaled the same way as during evaluation.
    X_num, y = self._prepare(X)
    model.fit(X_num, y, epochs = self.EPOCHS, batch_size = self.BATCH_SIZE, verbose = 0)
    self.model_ = model
    return self

  def transform(self, X):
    return X

In [None]:
# Load the raw dataset. LoadData.transform does not use its input, so an empty
# DataFrame is passed purely as a placeholder.
data = pd.DataFrame()
load = LoadData()
data = load.fit_transform(data)

# Full workflow: impute missing values, name the columns, then search for and
# train the MLP classifier.
pipe = Pipeline([
    ("imputer", MeanImputer()),
    ("format", FormatClass()),
    ("mlp", Classifier())
])

# Classifier.transform returns its input unchanged, so the value produced here is
# the imputed, renamed dataset rather than predictions.
pipe.fit_transform(data)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[32]
1 layer and we are adding neuron = 32
Making output layer and compile
Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_14 (Dense)            (None, 32)                288       
                                                                 
 dense_15 (Dense)            (None, 3)                 99        
                                                                 
 argmax_layer_7 (argmax_lay  (None,)                   0         
 er)                                                             
                                                                 
Total params: 387 (1.51 KB)
Trainable params: 387 (1.51 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
<class 'pandas.co

ValueError: ignored

On cell 5 line 113

https://colab.research.google.com/drive/1X-6j4YCjPddry6k8yIhXhjycvgVoWlTC#scrollTo=bbIYFC-1bEW-&line=113&uniqifier=1

---
ERROR: ValueError: Supported target types are: ('binary', 'multiclass'). Got 'multilabel-indicator' instead.

I think the issue is that `StratifiedKFold.split` only accepts a one-dimensional vector of class labels as its
target, so it cannot stratify on a one-hot encoded target that has more than one column.
  This issue has halted my progress in finishing this lab.
  
  Steps taken to fix the issue:
  - I searched Google and Stack Overflow for this problem and could not find a solution.
  - I emailed the lab instructor about the issue and got no response.
  - I emailed the lab TA, Amir Noohian, and got a possible solution:
    - Implement argmax in the MLP.
      - I implemented argmax in the MLP, only to find that it was not the issue.
        - I discovered that the issue is k-fold not being able to partition
          my one-hot encoded target values.

I have not reached the end of my lab project, and I can no longer continue from here.
I hope I am able to get partial marks for the effort that I have put into this lab.
  I really tried my best and this is my best.

