In [1]:
# CS485 HW1
#Created by: Jack Summers

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras import regularizers
from tensorflow.keras import layers
from tensorflow.keras import regularizers as tf_regularizers # regularizers from the same namespace as the tf.keras layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical # for one-hot encoding labels


# Kaggle input locations; features and labels are stored in separate CSV files.
train_file = "../input/HW1Train/HW1_data_train.csv" # training data stored in kaggle cloud
label_file = "../input/hw1labels2/HW1_labels.csv" # labels in a separate file

# Width of the one-hot label vectors. Pinned explicitly so it always matches the
# 10-unit softmax output layer of the model, instead of being inferred from the
# largest label that happens to appear in the label file.
NUM_CLASSES = 10

# I learned more about pandas DataFrame here:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#:~:text=DataFrame%20is%20a%202%2Ddimensional,most%20commonly%20used%20pandas%20object.
# I learned how to convert a pandas.DataFrame to a numpy.ndarray here: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_numpy.html
# Load the features as float64 so the normalization arithmetic below also works
# when pandas parses every column as an integer.
train_data = pd.read_csv(train_file).to_numpy(dtype=np.float64)
labels = pd.read_csv(label_file).to_numpy()


# Normalize the data (per-column standardization: subtract the mean, divide by the std).
# This is highly recommended since different features have values across a variety of ranges;
# normalizing your features ALMOST ALWAYS leads to a better model, regardless of the task
# (classification, regression, etc.).
# axis=0 -> statistic over all the rows of each column. Found here: https://stackoverflow.com/questions/22149584/what-does-axis-in-pandas-mean#:~:text=So%20a%20mean%20on%20axis,the%20columns%20in%20each%20row.
mean = train_data.mean(axis=0)
train_data = train_data - mean
std = train_data.std(axis=0)
# A constant column has std == 0; dividing by it would fill the column with NaN/inf.
# Use 1 instead so such a column simply stays at 0 after centering.
std[std == 0] = 1.0
train_data = train_data / std
# `mean` and `std` stay available so that any other data fed to the model can be
# scaled with these same training statistics.

# Label vectorization: integer class labels -> one-hot rows of width NUM_CLASSES.
labels = to_categorical(labels, num_classes=NUM_CLASSES)


In [2]:
# define the model
import keras

# Network layout (in order):
#   Dense(11, relu)  - takes the 11 feature values per sample; L2 weight penalty
#                      (lambda = 0.001) to prevent overfitting
#   Dense(16, relu)
#   Dropout(0.1)     - drop out random values to prevent overfitting
#   Dense(10, softmax) - 10 options for classification; softmax for multi-class classification
# The regularizer comes from tensorflow.keras (tf_regularizers) so that every
# object attached to this tf.keras model lives in the same Keras namespace.
model = Sequential([
    Dense(11,
          kernel_regularizer=tf_regularizers.l2(0.001),
          activation='relu',
          input_shape=(11,)),
    Dense(16, activation='relu'),
    layers.Dropout(0.1),
    Dense(10, activation='softmax'),
])
model.summary()


# Define callbacks.
# Stop training once validation accuracy stops improving.
es_callback = EarlyStopping(monitor='val_acc',
                            min_delta=.001,  # after each epoch we want to see the val accuracy improve by 0.001
                            patience=10,  # if min_delta not seen after 10 epochs, stop training
                            verbose=1,
                            restore_best_weights=True)  # restore weights of peak val accuracy of that epoch

# Multiply the learning rate by 0.1 when validation loss plateaus.
# Uses the tf.keras callback class (not the standalone `keras` one): callbacks
# from a different Keras package than the model can fail inside model.fit().
# NOTE(review): patience here equals the early-stopping patience, so the learning
# rate may rarely be reduced before training stops - consider a smaller value.
loss_plat = ReduceLROnPlateau(monitor='val_loss',
                              factor=0.1,
                              patience=10)

callback_list = [es_callback, loss_plat]

In [3]:
# Compile the model: Adam optimizer, categorical cross-entropy loss, and accuracy
# tracked under the metric name 'acc' (the callbacks and the accuracy plot look
# up 'acc' / 'val_acc').
# "categorical crossentropy is almost always the loss function you should use for multi class classification"
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [4]:
# Train the model on the normalized features and one-hot labels.
training_options = dict(
    epochs=1000,               # upper bound; callback_list contains an early-stopping callback
    batch_size=128,
    validation_split=.2,       # fraction of the training data held out for validation
    verbose=1,
    callbacks=callback_list,
)

history = model.fit(train_data, labels, **training_options)

In [5]:
# Plot training vs validation accuracy, one point per epoch.
history_dict = history.history
acc = history_dict['acc']
val_acc = history_dict['val_acc']
epochs = range(1, len(acc) + 1)  # 1-based epoch numbers for the x axis

# A fresh figure per run (replaces clearing the implicit current figure).
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'bo', label='Training acc')
ax.plot(epochs, val_acc, 'b', label='validation acc')
ax.set_xlabel('Epochs')
ax.set_ylabel('accuracy')
ax.set_title('Training vs validation accuracy')
ax.legend()  # without this the curve labels above are never rendered
plt.show()

In [6]:
# Generate predictions for the test set.

test_file = '../input/hw1test/HW1_test_data.csv'  # input test data
# float64 so the scaling below works even if every column is parsed as an integer
test_data = pd.read_csv(test_file).to_numpy(dtype=np.float64)

# Normalize the test data with the TRAINING statistics (`mean` and `std` computed
# in the data-loading cell). The model was fitted on features scaled by those
# values, so the test features must go through exactly the same transformation;
# re-computing mean/std from the test set would scale them differently.
test_data = (test_data - mean) / std

predictions = model.predict(test_data)  # one row of class probabilities per test sample


In [7]:
# Output predictions to csv: one row per sample with its index and predicted class.

# I used https://realpython.com/python-csv/ to re-learn how to create a csv file with python
import csv

# Predicted class = index of the largest value in each prediction row.
predicted_classes = np.argmax(predictions, axis=1)

# newline='' is required by the csv module; without it the writer's own line
# endings get translated again and blank rows appear on some platforms.
with open('./HW1_predictions.csv', mode='w', newline='') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['Id', 'Category'])  # headers
    # Ids are the 0-based positions of the samples in `predictions`.
    writer.writerows(
        (sample_id, int(category))
        for sample_id, category in enumerate(predicted_classes)
    )