In [62]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from pathlib import Path
from tensorflow.keras import layers
from tensorflow import feature_column
from matplotlib import pyplot as plt
# Serves to check that the Tensor Flow version is correct and imported correctly
print('tensorflow version', tf.__version__)
print('pandas', pd.__version__)

tensorflow version 2.3.1
pandas 1.0.5


In [63]:
# Read in data and create a dataframe.
# Create a columns array
columns = ['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean']
columns_float = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean']
# Later we will need to split the data set into validation, test and training
df = pd.read_csv('BreastCancerDataMeans.csv', names=columns, index_col=False)
# Drop the ID column which we do not need, as well as the redundant headings
df = df.drop('id', 1)
df = df.drop(0)
# Need to convert the object values into floats and the diagnosis into a string
df['diagnosis'] = df.apply(lambda x: np.array(x['diagnosis']).astype(str), axis=1)
for i in range(len(columns_float)):
    df[columns_float[i]] = df.apply(lambda x: np.array(x[columns_float[i]]).astype(float), axis=1)
df.head()
# Need to convert the Malignant or Bengin result into a binary value
df['diagnosis'] = df['diagnosis'].map({'M': 0, 'B':1})
df

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
1,0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871
2,0,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667
3,0,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999
4,0,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744
5,0,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883
...,...,...,...,...,...,...,...,...,...,...,...
565,0,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623
566,0,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533
567,0,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648
568,0,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016


In [64]:
# Split the data. Use a 75% training and 25% test split
# test_size is how much of the data will be used for testing
# random_state controls the shuffling to tae place, by defining the random state we can reproduce the same split of the data across multiple function calls
# shuffle, we will enable this as the data may be organised in a certain way which could affect our results
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42, shuffle=True)
# Define path for test and training data to be saved and convert them to CSV
train_path = Path('train.tsv')
test_path = Path('test.tsv')
train_df.to_csv(train_path, sep='\t', index=False)
test_df.to_csv(test_path, sep='\t', index=False)

In [65]:
# Normalize the data over all the values, besides for target value
# Calculate the z score
# All rows, columns between 1 and 5
#Normalising
train_df_norm = train_df.iloc[:,1:5].sub(train_df_mean, axis=1)
train_df_norm = train_df_norm.div(train_df_std, axis=1)

#reassigning to dataframe
#if rerunning, need to reimport data. the drop function permenantly changes dataframe meaning it can only operate once
train_df_temp = train_df
train_df_temp = train_df_temp.drop(columns = ["radius_mean",  "texture_mean",  "perimeter_mean",  "area_mean"]) #colums 1,2,3,4
train_df = pd.concat([train_df_temp, train_df_norm], axis = 1)

In [66]:
print(train_df)
test_df_norm = test_df.iloc[:,1:5].sub(test_df_mean, axis=1)
test_df_norm = test_df_norm.div(test_df_std, axis=1)

#reassigning to dataframe
#if rerunning, need to reimport data. the drop function permenantly changes dataframe meaning it can only operate once
test_df_temp = test_df
test_df_temp = test_df_temp.drop(columns = ["radius_mean",  "texture_mean",  "perimeter_mean",  "area_mean"])
test_df = pd.concat([test_df_temp, test_df_norm], axis = 1)


     diagnosis  smoothness_mean  compactness_mean  concavity_mean  \
288          1          0.06955           0.03729         0.02260   
513          0          0.11060           0.14690         0.14450   
403          1          0.07351           0.07899         0.04057   
447          0          0.09997           0.13140         0.16980   
211          0          0.09090           0.13480         0.16400   
..         ...              ...               ...             ...   
72           1          0.09783           0.15310         0.08606   
107          1          0.11420           0.10170         0.07070   
271          1          0.06429           0.02675         0.00725   
436          0          0.10600           0.11330         0.11260   
103          1          0.08013           0.04038         0.02383   

     concave points_mean  symmetry_mean  fractal_dimension_mean  radius_mean  \
288              0.01171         0.1337                 0.05581    -0.207600   
513        

NameError: name 'test_df_std' is not defined

In [67]:
featureColumns = []
featureColumns.append(tf.feature_column.numeric_column('radius_mean'))
featureColumns.append(tf.feature_column.numeric_column('texture_mean'))
featureLayer = layers.DenseFeatures(featureColumns)

In [68]:
def createModel(myLearningRate, featureLayer, myMetrics):
    # Most simple tf.keras models are sequential.
    model = tf.keras.models.Sequential()
    # Add the list of features and how they are represented
    model.add(featureLayer)
    # Use a sigmoid activation function to funnel
    model.add(tf.keras.layers.Dense(units=1, input_shape=(1,), activation=tf.sigmoid),)
    # Compile method will construct the layers into a model that TensorFlow can execute
    # RMSprop is used to maintain a moving average of the square of the gradients, then divide the gradient by the root of the average
    model.compile(optimizer=tf.keras.optimizers.RMSprop(lr=myLearningRate), loss=tf.keras.losses.binary_crossentropy, metrics=myMetrics)
    return model

In [69]:
# Feed in a dataset into the model to train it
# Epoch is a run through of the whole dataset, if we batch size then we only pass through parts of the dataset at a time
# Epoch and batchSize will be changed to reduce the error rate
def trainModel(model, dataset, epochs, labelName, batchSize = None, shuffle = True):
    features = {name:np.array(value) for name, value in dataset.items()}
    label = np.array(features.pop(labelName))
    # x paramter can be an array of the data for each feature. Feature layer will be filtering out the columns we don't want
    history = model.fit(x=features, y=label, batch_size=batchSize, epochs=epochs, shuffle=shuffle)
    # Can see the output of the model being trained over time
    epochs = history.epochs
    # Isolate the classification metric for each epoch
    hist = pd.DataFrame(history.history)
    return epochs, hist

In [70]:
# Plot the trained model against random training examples using MatPlotLib
def plotModel(trainedWeight, trainedBias, feature, label):
    plt.xlabel(feature)
    plt.ylabel(label)
    randomExamples = training_df.sample(n=200)
    plt.scatter(randomExamples[feature], randomExamples[label])

    x0 = 0
    y0 = trainedBias
    x1 = 10000
    y1 = trainedBias + (trainedWeight * x1)
    plt.plot([x0, x1], [y0, y1], c="r")
    plt.show()

In [71]:
# Define and plot the loss curve, which will show how the model performs over epochs
def plotLossCurve(epochs, rmse):
    plt.figure()
    plt.xlabel("Epochs")
    plt.ylabel("RMSE")
    plt.plot(epochs, rmse, label="Loss")
    plt.legend()
    plt.ylim([rmse.min()*0.97, rmse.max()])
    plt.show()

In [78]:
# Invoke the functions
learningRate = 0.01
epochs = 200
batchSize = 10
labelName = "diagnosis"
classificationThreshold = 0.5

metrics = [tf.keras.metrics.BinaryAccuracy(name="Accuracy", threshold=classificationThreshold)]
myModel = createModel(learningRate, featureLayer, metrics)
epochs, hist = trainModel(myModel, train_df, epochs, 
                           labelName, batchSize)
metricsToPlot = ['accuracy']
#plotLossCurve(epochs, hist, metricsToPlot)

0.6291
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200


AttributeError: 'History' object has no attribute 'epochs'