In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from pathlib import Path
from tensorflow.keras import layers
from tensorflow import feature_column
# Confirm that TensorFlow imported correctly and report which version is in use
print('tensorflow version', tf.__version__)

In [None]:
# Read in the data and create a dataframe.
# Column names applied to the CSV: an id, the diagnosis label, then the numeric features.
columns = ['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean']
# Every column after 'id' and 'diagnosis' is a numeric feature.
columns_float = columns[2:]
# Later we will need to split the data set into validation, test and training.
# header=0 marks the first row of the file as the original headings, so they are replaced by
# `columns` instead of being read in as a data row (which would turn every column into objects).
df = pd.read_csv('BreastCancerDataMeans.csv', names=columns, header=0, index_col=False)
# Drop the ID column, which we do not need.
df = df.drop(columns='id')
# Make the dtypes explicit: the diagnosis is a string label and every feature is a float.
df = df.astype({'diagnosis': str, **{name: float for name in columns_float}})
# Convert the Malignant ('M') or Benign ('B') result into a binary value.
df['diagnosis'] = df['diagnosis'].map({'M': 0, 'B': 1})
# Last expression of the cell, so the notebook displays a preview of the cleaned frame.
df.head()

In [None]:
# Destinations for the two partitions, written out as tab-separated files.
train_path = Path('train.tsv')
test_path = Path('test.tsv')

# Hold out a quarter of the rows for testing and train on the remaining 75%.
#   test_size    - fraction of the data reserved for the test set
#   random_state - fixed seed for the shuffle, so the same split is reproduced on every call
#   shuffle      - enabled because the data may be organised in a way that could affect results
train_df, test_df = train_test_split(
    df,
    test_size=0.25,
    random_state=42,
    shuffle=True,
)

# Save each partition without its index column.
train_df.to_csv(train_path, sep='\t', index=False)
test_df.to_csv(test_path, sep='\t', index=False)

In [None]:
# Normalize every feature column to a z-score, leaving the target value untouched.
# The 'diagnosis' label has to stay 0/1 for binary classification, so it is excluded.
feature_names = [name for name in train_df.columns if name != 'diagnosis']

# Statistics come from the training set over ALL feature columns. Computing them on a slice
# of the columns would turn every other column into NaN when the frames are subtracted.
train_df_mean = train_df[feature_names].mean()
train_df_std = train_df[feature_names].std()
train_df_norm = train_df.copy()
train_df_norm[feature_names] = (train_df[feature_names] - train_df_mean) / train_df_std

# The test set is scaled with the TRAINING statistics so both sets share one scale and no
# information from the test set leaks into preprocessing. The test set's own statistics are
# still computed under their original names for inspection, but are not used for scaling.
test_df_mean = test_df[feature_names].mean()
test_df_sd = test_df[feature_names].std()
test_df_norm = test_df.copy()
test_df_norm[feature_names] = (test_df[feature_names] - train_df_mean) / train_df_std

In [None]:
# Describe which inputs the model consumes. `tf.feature_column` is a module, so each feature
# is declared with numeric_column(), keyed by the dataframe column name rather than by the
# column's data.
featureColumns = [
    tf.feature_column.numeric_column('radius_mean'),
    tf.feature_column.numeric_column('texture_mean'),
]
# The feature layer is used as the model's first layer and selects these columns from the
# dictionary of features passed to the model.
featureLayer = layers.DenseFeatures(featureColumns)

In [None]:
def createModel(myLearningRate, featureLayer, myMetrics):
    """Build and compile a single-unit sigmoid classifier.

    Args:
        myLearningRate: learning rate handed to the RMSprop optimizer.
        featureLayer: layer added first to the model; it defines which features are used
            and how they are represented.
        myMetrics: list of metrics passed to ``model.compile``.

    Returns:
        The compiled ``tf.keras`` Sequential model.
    """
    # Most simple tf.keras models are sequential.
    model = tf.keras.models.Sequential()
    # Add the list of features and how they are represented.
    model.add(featureLayer)
    # A single sigmoid unit funnels the features into one value between 0 and 1.
    # No input_shape is given: this is not the first layer, so its input size is taken
    # from the output of the feature layer.
    model.add(tf.keras.layers.Dense(units=1, activation=tf.sigmoid))
    # Compile constructs the layers into a model that TensorFlow can execute.
    # RMSprop maintains a moving average of the square of the gradients and divides the
    # gradient by the root of that average.
    model.compile(
        optimizer=tf.keras.optimizers.RMSprop(learning_rate=myLearningRate),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=myMetrics,
    )
    return model

In [None]:
# Feed a dataset into the model to train it.
# An epoch is one run through the whole dataset; with a batch size set, only part of the
# dataset is passed through at a time.
# Epochs and batchSize will be changed to reduce the error rate.
def trainModel(model, dataset, epochs, labelName, batchSize = None, shuffle = True):
    """Fit ``model`` on ``dataset`` and return the per-epoch training history.

    Args:
        model: object exposing a Keras-style ``fit(x, y, batch_size, epochs, shuffle)``.
        dataset: mapping of column name to values (anything with ``.items()``, such as a
            pandas DataFrame) holding both the features and the label column.
        epochs: number of epochs to train for.
        labelName: name of the column in ``dataset`` used as the label.
        batchSize: batch size forwarded to ``fit``.
        shuffle: forwarded to ``fit``.

    Returns:
        A tuple ``(epochs, hist)``: the ``epoch`` attribute of the object returned by
        ``fit``, and a DataFrame built from its ``history`` attribute.

    Raises:
        KeyError: if ``labelName`` is not a column of ``dataset``.
    """
    features = {name: np.array(value) for name, value in dataset.items()}
    # Remove the label from the features so it is not fed to the model as an input.
    label = np.array(features.pop(labelName))
    # The x parameter can be a dict holding an array per feature; the model's feature layer
    # filters out the columns we don't want.
    history = model.fit(x=features, y=label, batch_size=batchSize, epochs=epochs, shuffle=shuffle)
    # The epochs that were run, kept separately from the metrics.
    epochs = history.epoch
    # Isolate the classification metric for each epoch.
    hist = pd.DataFrame(history.history)
    return epochs, hist

In [None]:
# Plot the trained model against random training examples using Matplotlib
def plotModel(trainedWeight, trainedBias, feature, label):
    """Scatter a random sample of the training data and overlay the trained line.

    Reads the module-level ``train_df`` and requires ``matplotlib.pyplot`` to be available
    as ``plt`` in the notebook.

    Args:
        trainedWeight: weight applied to ``feature`` when drawing the line.
        trainedBias: bias (intercept) of the line.
        feature: name of the ``train_df`` column plotted on the x axis.
        label: name of the ``train_df`` column plotted on the y axis.
    """
    plt.xlabel(feature)
    plt.ylabel(label)
    # Cap the sample size so a training set with fewer than 200 rows does not make sample() raise.
    randomExamples = train_df.sample(n=min(200, len(train_df)))
    plt.scatter(randomExamples[feature], randomExamples[label])

    # Draw the line across the range of the sampled feature values, so the axis is not
    # stretched far beyond the data by a fixed end point.
    x0 = randomExamples[feature].min()
    x1 = randomExamples[feature].max()
    y0 = trainedBias + (trainedWeight * x0)
    y1 = trainedBias + (trainedWeight * x1)
    plt.plot([x0, x1], [y0, y1], c="r")
    plt.show()

In [None]:
# Define and plot the loss curve, which shows how the model performs over the epochs
def plotLossCurve(epochs, rmse):
    """Plot a per-epoch metric curve.

    Requires ``matplotlib.pyplot`` to be available as ``plt`` in the notebook.

    Args:
        epochs: x values, one per epoch.
        rmse: per-epoch metric values; must provide ``.min()`` and ``.max()``.
    """
    plt.figure()
    plt.xlabel("Epochs")
    plt.ylabel("RMSE")
    plt.plot(epochs, rmse, label="Loss")
    plt.legend()
    # Start the y axis slightly below the smallest value so the lowest point stays visible.
    plt.ylim([rmse.min()*0.97, rmse.max()])
    plt.show()