Data taken from https://archive.ics.uci.edu/ml/datasets/Yeast

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# !pip install -q git+https://github.com/tensorflow/docs
# import tensorflow_docs as tfdocs
# import tensorflow_docs.modeling
# import tensorflow_docs.plots

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [3]:
df = pd.read_csv('./yeast.csv', header=None)
df = df.drop(0, axis=1)
df.head()

FileNotFoundError: [Errno 2] File ./yeast.csv does not exist: './yeast.csv'

In [None]:
col_headers = ['mcg','gvh', 'alm', 'mit', 'erl', 'pox','vac', 'nuc', 'class']
df.columns = col_headers
df.head()

In [None]:
# Change the class column from type object to int
codes, uniques = pd.factorize(df['class'])
df['class'] = codes

In [None]:
df.shape

In [None]:
df['class'].value_counts().sort_index() / len(df) * 100

Major inequality in class distributions (imbalanced classes)

# EDA

In [None]:
import seaborn as sns

# sns.pairplot(df.iloc[:, :-1])

I don't know if we need to normalize our input features - whether that would help or not. 

# Model

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.framework import ops
# from tf_utils import load_dataset, random_mini_batches, convert_to_one_hot, predict

First let's get some practice with Keras by building a model to predict the MNIST fashion data set classifications

In [None]:
# (x_train, y_train), (x_val, y_val) = keras.datasets.fashion_mnist.load_data()

Well that doesn't work for some reason, so let's use our data!

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [None]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1, random_state=1)

In [None]:
# NOTE: not sure how to set the batch size
BATCH_SIZE = 128
N_TRAIN = len(x_train)
STEPS_PER_EPOCH = N_TRAIN//BATCH_SIZE

In [None]:
def preprocess(x, y):
    x = tf.cast(x, tf.float32)
    y = tf.cast(y, tf.int64)

    return x, y

def create_dataset(xs, ys, n_classes=10):
    ys = tf.one_hot(ys, depth=n_classes)
    return tf.data.Dataset.from_tensor_slices((xs, ys)) \
        .map(preprocess) \
        .shuffle(len(ys)) \
        .batch(BATCH_SIZE)

In [None]:
n_classes = df['class'].nunique()

In [None]:
train_dataset = create_dataset(x_train, y_train, n_classes)
val_dataset = create_dataset(x_val, y_val, n_classes)

In [None]:
n_features = len(col_headers) - 1

In [None]:
from tensorflow.keras import regularizers

In [None]:
size_histories = {}

In [None]:
model = keras.Sequential([
#     keras.layers.Reshape(target_shape=(n_features,), input_shape=(n_features)),
    keras.layers.Dense(units=256, activation='relu'),
    keras.layers.Dense(units=192, activation='relu'),
    keras.layers.Dense(units=128, activation='relu'),
    keras.layers.Dense(units=n_classes, activation='softmax')
])

model.compile(
    optimizer='adam', 
    loss=tf.losses.CategoricalCrossentropy(from_logits=True),
    metrics=[
        tf.losses.CategoricalCrossentropy(from_logits=True, name='categorical_crossentropy'),
        'accuracy'
    ]
)

history = model.fit(
    train_dataset.repeat(), 
    epochs=10, 
    steps_per_epoch=500,
    validation_data=val_dataset.repeat(), 
    validation_steps=2
)

In [None]:
size_histories['Model_0'] = history

In [None]:
# plotter = tfdocs.plots.HistoryPlotter(metric = 'categorical_crossentropy', smoothing_std=100)
plotter = tfdocs.plots.HistoryPlotter(metric = 'categorical_crossentropy')
plotter.plot(size_histories)

The rule based system outlined [here](https://pubmed.ncbi.nlm.nih.gov/1946347) got 83% accuracy. We're getting 51.6% accuracy in our test set, which although is worse, is perhaps not that bad if compared to a naive classifier based on the class distributions. 

Since the train accuracy is so much higher (62.5%) than our test accuracy (51.6%), it seems as if we're suffering from high variance as well. Perhaps we can combat that by:
- adding regularization 
- adding more data (not possible here)
- making the classes more balanced (not sure how we can do that here)

In [None]:
## New model with l2 regularization added to apply a penalty to softmax layer
model = keras.Sequential([
    keras.layers.Dense(units=256, activation='relu'),
    keras.layers.Dense(units=192, activation='relu'),
    keras.layers.Dense(units=128, activation='relu'),
    keras.layers.Dense(units=n_classes, activation='softmax', kernel_regularizer=regularizers.l2(0.0001))
])

model.compile(optimizer='adam', 
              loss=tf.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

history = model.fit(
    train_dataset.repeat(), 
    epochs=10, 
    steps_per_epoch=500,
    validation_data=val_dataset.repeat(), 
    validation_steps=2
)

52.3% - Not really any difference. Maybe I need to 

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
model = RandomForestClassifier(n_estimators=500)

In [None]:
model.fit(x_train, y_train)

In [None]:
model.score(x_val, y_val)

(unoptimized) random forest gets slightly higher accuracy of 55%, but pretty much the same performance. 

# Incorporate various regularization techniques and plot outcomes

In [None]:
# create results dictionary
size_histories = {}

In [None]:
# set up learning rate decay in order to optimize fitting
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    0.001,
    decay_steps=STEPS_PER_EPOCH*1000,
    decay_rate=1,
    staircase=False
)

def get_optimizer():
    return tf.keras.optimizers.Adam(lr_schedule)

# create compile and fit function
def compile_and_fit(model, name, optimizer=None, max_epochs=10):
    
    # get optimizer
    if optimizer is None:
        optimizer = get_optimizer()
    
    # compile
    model.compile(
        optimizer=optimizer,
        loss=tf.losses.CategoricalCrossentropy(from_logits=True),
        metrics=[
          tf.losses.CategoricalCrossentropy(from_logits=True, name='categorical_crossentropy'),
          'accuracy'
        ]
    )

    # fit model
    history = model.fit(
        train_dataset.repeat(), 
        epochs=max_epochs, 
        steps_per_epoch=STEPS_PER_EPOCH,
        validation_data=val_dataset.repeat(), 
#         callbacks=get_callbacks(name),
#         validation_steps=2,
        verbose=0
    )
    
    # print summary
    model.summary()
    
    return history

In [None]:
l2_model = keras.Sequential([
    keras.layers.Dense(units=256, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    keras.layers.Dense(units=192, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    keras.layers.Dense(units=128, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    keras.layers.Dense(units=n_classes, activation='softmax')
])

regularizer_histories['l2'] = compile_and_fit(l2_model, "regularizers/l2")