In [46]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    """Replace column `name` with one indicator column per distinct value.

    Each new column is called "<name>-<value>" and is appended to `df`.
    The original column is then dropped. `df` is modified in place and
    nothing is returned.
    """
    one_hot = pd.get_dummies(df[name])
    for category, indicator in one_hot.items():
        df["{}-{}".format(name, category)] = indicator
    df.drop(columns=name, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    """Add a 0/1 indicator column to `df` for each entry of `target_values`.

    Values are compared as strings, so 1 and "1" are treated as equal.
    Each new column is called "<name>-<target value>". The original
    column is kept. `df` is modified in place and nothing is returned.
    """
    for tv in target_values:
        wanted = str(tv)
        flags = [int(str(value) == wanted) for value in df[name].astype(str)]
        df["{}-{}".format(name, tv)] = flags


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    """Replace column `name` with integer codes from a LabelEncoder.

    `df` is modified in place. Returns the encoder's `classes_`, which can
    be indexed with a code to recover the original label.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardize column `name` in place as (value - mean) / sd.

    `mean` and `sd` default to the column's own mean and standard deviation
    when they are not supplied. Nothing is returned.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    """Fill missing values of column `name` with that column's median, in place."""
    column = df[name]
    df[name] = column.fillna(column.median())


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    """Fill missing values of column `name` with `default_value`, in place."""
    filled = df[name].fillna(default_value)
    df[name] = filled


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    """Split `df` into float32 feature and target arrays.

    Every column except `target` becomes a feature column of `x`.

    * If the target column's dtype is int64 or int32 the task is treated as
      classification and `y` is the one-hot encoding of the target
      (one column per distinct value).
    * Otherwise the task is treated as regression and `y` is the target as
      a single-column 2-D array.

    Returns:
        (x, y): two 2-D numpy arrays of dtype float32.
    """
    feature_columns = [column for column in df.columns if column != target]

    # df[target] is a DataFrame (and .dtypes a Series) when the column name is
    # duplicated; use the first dtype in that case.
    target_type = df[target].dtypes
    if isinstance(target_type, pd.Series):
        target_type = target_type.iloc[0]

    # DataFrame.as_matrix() was removed in pandas 1.0; .values works on both
    # old and new pandas.
    x = df[feature_columns].values.astype(np.float32)

    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return x, dummies.values.astype(np.float32)

    # Regression
    return x, df[[target]].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration given in seconds as "H:MM:SS.ss"."""
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)


# Regression chart.
def chart_regression(pred,y,sort=True):
    """Plot expected values against predictions on one line chart.

    `y` is flattened before plotting. When `sort` is true the rows are
    ordered by the expected value so the two curves are easier to compare.
    The figure is shown with plt.show(); nothing is returned.
    """
    frame = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        frame = frame.sort_values(by=['y'])
    # Expected values first, then predictions, so the legend order is stable.
    for column, label in (('y', 'expected'), ('pred', 'prediction')):
        plt.plot(frame[column].tolist(), label=label)
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    """Drop, in place, rows whose `name` value is `sd` or more standard
    deviations away from the column mean.
    """
    column = df[name]
    deviation = np.abs(column - column.mean())
    threshold = sd * column.std()
    df.drop(df.index[deviation >= threshold], axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Linearly rescale column `name` in place.

    The interval [data_low, data_high] is mapped onto
    [normalized_low, normalized_high].

    `data_low` and `data_high` are defaulted independently to the column's
    minimum and maximum. Supplying only one of them is therefore valid: the
    other is taken from the data. The Series `.min()` / `.max()` methods are
    used because, unlike the builtins, they skip NaN entries.

    Nothing is returned.
    """
    if data_low is None:
        data_low = df[name].min()
    if data_high is None:
        data_high = df[name].max()

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

In [47]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation
import pandas as pd
import io
import requests
import numpy as np
from sklearn import metrics
# Directory holding the course data files, relative to the working directory.
path = "./t81_558_deep_learning/data/"

filename_read = os.path.join(path,"auto-mpg.csv")
df = pd.read_csv(filename_read,na_values=['NA','?'])

# Keep the car names for reporting, then remove the text column from the inputs.
cars = df['name']
# `axis` must be passed by keyword: the positional form was removed in pandas 2.0.
df.drop('name',axis=1,inplace=True)
missing_median(df, 'horsepower')
x,y = to_xy(df,"mpg")
model = Sequential()
model.add(Dense(25, input_dim=x.shape[1], activation='relu')) # Hidden 1
model.add(Dense(10, activation='relu')) # Hidden 2
model.add(Dense(1)) # Output
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x,y,verbose=2,epochs=100)

Epoch 1/100
 - 0s - loss: 338225.8308
Epoch 2/100
 - 0s - loss: 69145.4303
Epoch 3/100
 - 0s - loss: 3187.9811
Epoch 4/100
 - 0s - loss: 2676.6318
Epoch 5/100
 - 0s - loss: 1720.5217
Epoch 6/100
 - 0s - loss: 375.9463
Epoch 7/100
 - 0s - loss: 364.6907
Epoch 8/100
 - 0s - loss: 308.3284
Epoch 9/100
 - 0s - loss: 285.8365
Epoch 10/100
 - 0s - loss: 281.6442
Epoch 11/100
 - 0s - loss: 273.7608
Epoch 12/100
 - 0s - loss: 267.8063
Epoch 13/100
 - 0s - loss: 260.9089
Epoch 14/100
 - 0s - loss: 254.6049
Epoch 15/100
 - 0s - loss: 247.2174
Epoch 16/100
 - 0s - loss: 240.7996
Epoch 17/100
 - 0s - loss: 233.2770
Epoch 18/100
 - 0s - loss: 227.5043
Epoch 19/100
 - 0s - loss: 220.2418
Epoch 20/100
 - 0s - loss: 214.0966
Epoch 21/100
 - 0s - loss: 206.3374
Epoch 22/100
 - 0s - loss: 200.4685
Epoch 23/100
 - 0s - loss: 194.2222
Epoch 24/100
 - 0s - loss: 188.7170
Epoch 25/100
 - 0s - loss: 183.0553
Epoch 26/100
 - 0s - loss: 177.8629
Epoch 27/100
 - 0s - loss: 174.0425
Epoch 28/100
 - 0s - loss: 16

<keras.callbacks.History at 0x1a1cacf0f0>

In [29]:
# Run the network on the inputs x and show the shape and contents of the result.
pred = model.predict(x)
print("Shape: {}".format(pred.shape))
print(pred)

Shape: (398, 1)
[[ 11.133534  ]
 [  7.213552  ]
 [  8.151113  ]
 [ 10.002355  ]
 [  9.830922  ]
 [  3.5613148 ]
 [ -0.0639293 ]
 [  0.7476857 ]
 [  0.87369275]
 [  2.1957448 ]
 [  1.9396412 ]
 [  5.5124564 ]
 [  2.6202137 ]
 [-11.066035  ]
 [ 24.527288  ]
 [ 19.081747  ]
 [ 18.386282  ]
 [ 17.386343  ]
 [ 23.81719   ]
 [ 25.578741  ]
 [ 28.489126  ]
 [ 24.721014  ]
 [ 26.08335   ]
 [ 19.615072  ]
 [ 17.238455  ]
 [ 14.054875  ]
 [ 17.970465  ]
 [ 15.739995  ]
 [ 23.756903  ]
 [ 23.907614  ]
 [ 20.111425  ]
 [ 22.74346   ]
 [ 23.961401  ]
 [ 12.600942  ]
 [ 21.704565  ]
 [ 18.6173    ]
 [ 18.840796  ]
 [ 19.907583  ]
 [ 12.199498  ]
 [  9.191564  ]
 [ 12.966832  ]
 [ 15.306615  ]
 [ 14.940038  ]
 [ 12.194615  ]
 [ 15.514195  ]
 [ 13.041631  ]
 [ 24.157324  ]
 [ 17.903128  ]
 [ 16.797308  ]
 [ 20.64472   ]
 [ 20.95078   ]
 [ 27.937414  ]
 [ 23.846502  ]
 [ 26.661673  ]
 [ 24.398153  ]
 [ 24.233297  ]
 [ 25.588446  ]
 [ 24.164907  ]
 [ 24.15322   ]
 [ 30.95629   ]
 [ 23.846258  ]
 [ 22.26

In [30]:
# Measure RMSE error.  RMSE is common for regression.
score = np.sqrt(metrics.mean_squared_error(pred,y))
print("Final score (RMSE): {}".format(score))

Final score (RMSE): 6.6894354820251465


In [31]:
# Sample predictions for the first ten cars.
# y[i] and pred[i] are one-element rows, so they print inside brackets.
for i in range(10):
    print("{}. Car name: {}, MPG: {}, predicted MPG: {}".format(i+1,cars[i],y[i],pred[i]))

1. Car name: chevrolet chevelle malibu, MPG: [18.], predicted MPG: [11.133534]
2. Car name: buick skylark 320, MPG: [15.], predicted MPG: [7.213552]
3. Car name: plymouth satellite, MPG: [18.], predicted MPG: [8.151113]
4. Car name: amc rebel sst, MPG: [16.], predicted MPG: [10.002355]
5. Car name: ford torino, MPG: [17.], predicted MPG: [9.830922]
6. Car name: ford galaxie 500, MPG: [15.], predicted MPG: [3.5613148]
7. Car name: chevrolet impala, MPG: [14.], predicted MPG: [-0.0639293]
8. Car name: plymouth fury iii, MPG: [14.], predicted MPG: [0.7476857]
9. Car name: pontiac catalina, MPG: [14.], predicted MPG: [0.87369275]
10. Car name: amc ambassador dpl, MPG: [15.], predicted MPG: [2.1957448]


In [48]:
import pandas as pd
import io
import requests
import numpy as np
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping

url="https://raw.githubusercontent.com/jeffheaton/t81_558_deep_learning/master/data/iris.csv"
# Fail loudly on an HTTP error instead of trying to parse an error page as CSV.
response = requests.get(url)
response.raise_for_status()
df=pd.read_csv(io.StringIO(response.content.decode('utf-8')),na_values=['NA','?'])

# Replace the species names with integer codes; `species` keeps the original names.
species = encode_text_index(df,"species")
x,y = to_xy(df,"species")

# Build the classifier. No train/test split is made here: the model is fit on all rows.
model = Sequential()
model.add(Dense(50, input_dim=x.shape[1], activation='relu')) # Hidden 1
model.add(Dense(25, activation='relu')) # Hidden 2
model.add(Dense(y.shape[1],activation='softmax')) # Output

model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(x,y,verbose=2,epochs=100)

Epoch 1/100
 - 0s - loss: 1.0540
Epoch 2/100
 - 0s - loss: 0.9899
Epoch 3/100
 - 0s - loss: 0.9342
Epoch 4/100
 - 0s - loss: 0.8788
Epoch 5/100
 - 0s - loss: 0.8294
Epoch 6/100
 - 0s - loss: 0.7806
Epoch 7/100
 - 0s - loss: 0.7348
Epoch 8/100
 - 0s - loss: 0.6919
Epoch 9/100
 - 0s - loss: 0.6490
Epoch 10/100
 - 0s - loss: 0.6137
Epoch 11/100
 - 0s - loss: 0.5787
Epoch 12/100
 - 0s - loss: 0.5481
Epoch 13/100
 - 0s - loss: 0.5215
Epoch 14/100
 - 0s - loss: 0.4955
Epoch 15/100
 - 0s - loss: 0.4723
Epoch 16/100
 - 0s - loss: 0.4523
Epoch 17/100
 - 0s - loss: 0.4339
Epoch 18/100
 - 0s - loss: 0.4166
Epoch 19/100
 - 0s - loss: 0.4013
Epoch 20/100
 - 0s - loss: 0.3882
Epoch 21/100
 - 0s - loss: 0.3739
Epoch 22/100
 - 0s - loss: 0.3607
Epoch 23/100
 - 0s - loss: 0.3485
Epoch 24/100
 - 0s - loss: 0.3361
Epoch 25/100
 - 0s - loss: 0.3257
Epoch 26/100
 - 0s - loss: 0.3141
Epoch 27/100
 - 0s - loss: 0.3033
Epoch 28/100
 - 0s - loss: 0.2939
Epoch 29/100
 - 0s - loss: 0.2836
Epoch 30/100
 - 0s - lo

<keras.callbacks.History at 0x1a1de5bf98>

In [33]:
# Print the names of the species found (not just how many there are):

print(species)

['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


In [34]:
# Run the classifier on x; each output row holds one value per class.
pred  =  model.predict(x)
print("Shape: {}".format(pred.shape))
print(pred)

Shape: (150, 3)
[[0.9964322  0.00356715 0.00000061]
 [0.9900939  0.00990328 0.00000286]
 [0.9943606  0.00563758 0.0000018 ]
 [0.9894545  0.01054133 0.00000418]
 [0.9969482  0.0030512  0.00000056]
 [0.9960377  0.00396173 0.00000051]
 [0.99425215 0.00574565 0.00000213]
 [0.9945686  0.00543025 0.00000113]
 [0.986798   0.01319464 0.0000073 ]
 [0.9916638  0.00833423 0.00000204]
 [0.99720925 0.00279034 0.0000003 ]
 [0.9929294  0.0070687  0.00000191]
 [0.99162465 0.008373   0.0000024 ]
 [0.9948232  0.00517432 0.00000255]
 [0.9992112  0.00078884 0.00000004]
 [0.99902844 0.00097142 0.00000006]
 [0.9982474  0.00175239 0.00000021]
 [0.99584794 0.00415117 0.0000008 ]
 [0.99617577 0.00382391 0.00000032]
 [0.99706036 0.00293912 0.00000049]
 [0.992315   0.00768399 0.00000104]
 [0.9958859  0.00411322 0.00000082]
 [0.9983392  0.00166032 0.00000044]
 [0.9807695  0.01922409 0.00000644]
 [0.98644954 0.0135465  0.00000404]
 [0.9847936  0.01520221 0.00000411]
 [0.99031293 0.00968434 0.00000268]
 [0.99573   

In [35]:
# If you would like to turn off scientific notation, the following line can be used:
np.set_printoptions(suppress=True)

In [36]:
# The to_xy function represented the expected output in the same way.  Each row has only one 1.0 value because each row
# is only one type of iris.  This is the training data, we KNOW what type of iris it is.  This is called one-hot
# encoding.  Only one value is 1.0 (hot).
print(y[0:10])

[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [37]:
# Usually the column (pred) with the highest prediction is considered to be the prediction of the neural network.  It is easy
# to convert the predictions to the expected iris species.  The argmax function finds the index of the maximum prediction
# for each row.
predict_classes = np.argmax(pred,axis=1)
# y is one-hot encoded, so argmax over each row gives back the class index.
expected_classes = np.argmax(y,axis=1)
print("Predictions: {}".format(predict_classes))
print("Expected: {}".format(expected_classes))

Predictions: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1
 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
Expected: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [38]:
# Of course it is very easy to turn these indexes back into iris species.  We just use the species list that we created earlier.
# Note: the slice 1:10 skips the first prediction and shows rows 1 through 9 (nine names).
print(species[predict_classes[1:10]])

['Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa']


In [39]:
from sklearn.metrics import accuracy_score
# Accuracy might be a more easily understood error metric.  It is essentially a test score.  For all of the iris predictions,
# what fraction were correct?  The downside is it does not consider how confident the neural network was in each prediction.
correct = accuracy_score(expected_classes,predict_classes)
print("Accuracy: {}".format(correct))

Accuracy: 0.98


In [40]:
# Ad hoc prediction for a single flower.  The input is 2-D (one row of four measurements).
sample_flower = np.array( [[5.0,3.0,4.0,2.0]], dtype=float)
pred = model.predict(sample_flower)
print(pred)
# With a single row, argmax over the whole array is the index of the winning class.
pred = np.argmax(pred)
print("Predict that {} is: {}".format(sample_flower,species[pred]))

# Predict two sample flowers at once.
sample_flower = np.array( [[5.0,3.0,4.0,2.0],[5.2,3.5,1.5,0.8]], dtype=float)
pred = model.predict(sample_flower)
print(pred)
# axis=1 gives one class index per row.
pred = np.argmax(pred,axis=1)
print("Predict that {} is: {}".format(sample_flower,species[pred]))

[[0.00151805 0.2870494  0.7114326 ]]
Predict that [[5. 3. 4. 2.]] is: Iris-virginica
[[0.00151805 0.2870494  0.7114326 ]
 [0.9848395  0.01515483 0.00000569]]
Predict that [[5.  3.  4.  2. ]
 [5.2 3.5 1.5 0.8]] is: ['Iris-virginica' 'Iris-setosa']


In [49]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation
import pandas as pd
import io
import requests
import numpy as np
from sklearn import metrics
# Directory holding the course data files, and directory the trained network is written to.
path = "./t81_558_deep_learning/data/"
save_path = "./t81_558_deep_learning/dnn/"
# Create the output directory if needed; the open()/save() calls below fail without it.
os.makedirs(save_path, exist_ok=True)

filename_read = os.path.join(path,"auto-mpg.csv")
df = pd.read_csv(filename_read,na_values=['NA','?'])

# Keep the car names for reporting, then remove the text column from the inputs.
cars = df['name']
# `axis` must be passed by keyword: the positional form was removed in pandas 2.0.
df.drop('name',axis=1,inplace=True)
missing_median(df, 'horsepower')
x,y = to_xy(df,"mpg")
model = Sequential()
model.add(Dense(50, input_dim=x.shape[1], activation='relu')) # Hidden 1
model.add(Dense(25, activation='relu')) # Hidden 2
model.add(Dense(1)) # Output
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x,y,verbose=0,epochs=100)

# Predict
pred = model.predict(x)

# Measure RMSE error.  RMSE is common for regression.
score = np.sqrt(metrics.mean_squared_error(pred,y))
print("Before save score (RMSE): {}".format(score))

# save neural network structure to JSON (no weights)
model_json = model.to_json()
with open(os.path.join(save_path,"network.json"), "w") as json_file:
    json_file.write(model_json)

# save neural network structure to YAML (no weights)
# NOTE(review): to_yaml() is not available in newer Keras/TensorFlow releases -- confirm against the installed version.
model_yaml = model.to_yaml()
with open(os.path.join(save_path,"network.yaml"), "w") as yaml_file:
    yaml_file.write(model_yaml)

# save entire network to HDF5 (save everything, suggested)
model.save(os.path.join(save_path,"network.h5"))

Before save score (RMSE): 4.65507173538208


In [51]:
from keras.models import load_model
# Reload the network from the HDF5 file and score it on the same x and y.
# This cell relies on save_path, x, y and metrics already being defined in the kernel.
model2 = load_model(os.path.join(save_path,"network.h5"))
pred = model2.predict(x)
# Measure RMSE error.  RMSE is common for regression.
score = np.sqrt(metrics.mean_squared_error(pred,y))
print("After load score (RMSE): {}".format(score))

After load score (RMSE): 4.65507173538208
