In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
# Read the abalone measurements from the CSV file in the working directory.
dataset = pd.read_csv(filepath_or_buffer='abalone.csv')

# Summary statistics (count, mean, std, quartiles, min/max) of the numeric
# columns; shown as this cell's output.
dataset.describe()

Unnamed: 0,Length,Diameter,Height,Whole_Weight,Shucked_Weight,Viscera_Weight,Shell_Weight,Rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [3]:
# The summary statistics report a minimum Height of 0, which cannot be a real
# measurement. Show the offending rows before discarding them. An expression in
# the middle of a cell is not rendered, so display() is called explicitly
# (IPython makes display() available in notebooks without an import).
display(dataset[dataset['Height'] == 0])

# Keep only the rows with a positive height. The explicit .copy() makes the
# result an independent DataFrame, so adding columns to it in later cells does
# not raise pandas' SettingWithCopyWarning.
dataset = dataset[dataset['Height'] > 0].copy()

# Re-check the summary statistics after the clean-up.
dataset.describe()

Unnamed: 0,Length,Diameter,Height,Whole_Weight,Shucked_Weight,Viscera_Weight,Shell_Weight,Rings
count,4175.0,4175.0,4175.0,4175.0,4175.0,4175.0,4175.0,4175.0
mean,0.524065,0.40794,0.139583,0.829005,0.359476,0.180653,0.238834,9.93509
std,0.120069,0.09922,0.041725,0.490349,0.221954,0.109605,0.139212,3.224227
min,0.075,0.055,0.01,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.44225,0.18625,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.8,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.1535,0.502,0.253,0.32875,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [4]:

# Transform Sex into three 0/1 indicator columns (one per category: M, F, I).
# DataFrame.assign returns a new frame instead of writing into `dataset` in
# place; `dataset` was produced by a boolean-mask filter, and assigning columns
# onto such a slice raises pandas' SettingWithCopyWarning.
dataset = dataset.assign(
    Male=(dataset['Sex'] == 'M').astype(int),
    Female=(dataset['Sex'] == 'F').astype(int),
    Infant=(dataset['Sex'] == 'I').astype(int),
)

dataset.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_Weight,Shucked_Weight,Viscera_Weight,Shell_Weight,Rings,Male,Female,Infant
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,1,0,0
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,1,0,0
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,0,1,0
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,1,0,0
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,0,1


In [None]:
# Regression target: AGE is rings + 1.5.
y = dataset['Rings'] + 1.5

# In real life we do not know the number of rings until we cut the abalone.
# The purpose of this model is to predict the age without cutting the abalone
# and counting the rings, so Rings must not stay among the input features.
# Sex is dropped as well: it is already represented by the Male / Female /
# Infant indicator columns.
dataset = dataset.drop(columns=['Rings', 'Sex'])

In [None]:
# Preview the first five rows of the feature table after dropping Rings and Sex.
dataset.head(n=5)

Unnamed: 0,Length,Diameter,Height,Whole_Weight,Shucked_Weight,Viscera_Weight,Shell_Weight,Male,Female,Infant
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,1,0,0
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,1,0,0
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,0,1,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,1,0,0
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,0,0,1


In [None]:
# Hold out 33% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    dataset,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Preview the first five training rows (the row labels are the shuffled
# original indices).
X_train.head(n=5)


Unnamed: 0,Length,Diameter,Height,Whole_Weight,Shucked_Weight,Viscera_Weight,Shell_Weight,Male,Female,Infant
1430,0.23,0.165,0.06,0.0515,0.019,0.0145,0.036,0,0,1
2496,0.52,0.4,0.13,0.6245,0.215,0.2065,0.17,0,1,0
472,0.43,0.34,0.12,0.3575,0.151,0.0645,0.1045,0,0,1
2231,0.505,0.4,0.165,0.729,0.2675,0.155,0.25,0,1,0
3352,0.52,0.405,0.145,0.829,0.3535,0.1685,0.205,0,1,0


In [None]:
# Preview the first five training targets (AGE = rings + 1.5).
y_train.head(n=5)

1430     5.5
2496    16.5
472     10.5
2231    10.5
3352    16.5
Name: Rings, dtype: float64

In [None]:
# Fully connected regression network: two hidden ReLU layers (128 and 64 units)
# followed by a single linear output unit that predicts the age.
# The input width is the number of feature columns in X_train.
model = keras.Sequential([
    layers.Dense(128, activation=tf.nn.relu, input_shape=[X_train.shape[1]]),
    layers.Dense(64, activation=tf.nn.relu),
    layers.Dense(1),
])

# Use the Keras optimizer class: tf.train.AdamOptimizer is a TensorFlow 1.x
# API that no longer exists in TensorFlow 2.x, whereas keras.optimizers.Adam
# is available in both.
# Alternative that was tried: keras.optimizers.RMSprop(0.001)
optimizer = keras.optimizers.Adam()

# The metrics are spelled out in full so that the keys recorded in
# history.history ('mean_absolute_error', 'val_mean_absolute_error', ...) are
# the same across TensorFlow versions; the short aliases 'mae'/'mse' yield
# different key names depending on the version.
model.compile(
    optimizer=optimizer,
    loss='mse',
    metrics=['mean_absolute_error', 'mean_squared_error'],
)



In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               1408      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 9,729
Trainable params: 9,729
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 1000 epochs with per-epoch logging turned off (verbose=0).
# validation_split=0.2 makes Keras hold out 20% of the training data for
# validation; the returned History object is kept in `history` for plotting.
history = model.fit(
    X_train,
    y_train,
    epochs=1000,
    validation_split=0.2,
    verbose=0,
)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

loss = history.history['mean_absolute_error']
val_loss = history.history['val_mean_absolute_error']

epochs = range(1, len(loss) + 1)
plt.gca().set_ylim(0,5)
# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training mae')
# r is for "solid red line"
plt.plot(epochs, val_loss, 'r', label='Validation mae')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
# Score the trained model on the held-out test set. evaluate() returns the
# loss followed by the compiled metrics, unpacked here as loss, MAE and MSE.
loss, mae, mse = model.evaluate(X_test, y_test, verbose=1)

# Blank line to separate the report from the evaluate() progress output.
print()
for label, value in (
    ('LOSS:', loss),
    ('MEAN ABSOLUTE ERROR (MAE): ', mae),
    ('MEAN SQUARED ERROR (MSE): ', mse),
):
    print(label, value)

In [None]:
# Predict the age for every test row, then flatten the model output into a
# one-dimensional array of predictions.
raw_predictions = model.predict(X_test)
test_predictions = raw_predictions.flatten()

In [None]:
# Signed prediction error per test row (prediction minus true age): positive
# values mean the model over-estimated the age, negative values that it
# under-estimated it.
error = test_predictions - y_test

In [None]:
# Histogram of the signed prediction errors, drawn on the current axes.
ax = plt.gca()
ax.hist(error, bins=100)
ax.set_xlabel("Prediction Error [AGE]")
# Assign the return value so the Text object's repr is not echoed as output.
_ = ax.set_ylabel("Count")