In [1]:
import os
import numpy as np
import tensorflow as tf

# Check that we have correct TensorFlow version installed
tf_version = tf.__version__
print("TensorFlow version: {}".format(tf_version))
# Compare (major, minor) numerically: a plain string comparison is lexicographic
# and would rank e.g. "1.10" below "1.4", rejecting perfectly valid releases.
tf_major_minor = tuple(int(part) for part in tf_version.split(".")[:2])
assert tf_major_minor >= (1, 4), "TensorFlow r1.4 or later is needed"


TensorFlow version: 1.5.0


In [3]:
# Project root: honour the shell's PWD when it is exported (the original behaviour),
# otherwise fall back to the process working directory so this cell does not raise
# KeyError in environments where PWD is not defined.
PATH = os.environ.get('PWD') or os.getcwd()

# Input CSV files (cleaned training / validation sets)
PATH_DATA = os.path.join(PATH, "data")
FILE_TRAIN = os.path.join(PATH_DATA, "training_cleaned_2.csv")
print(FILE_TRAIN)
FILE_TEST = os.path.join(PATH_DATA, "validation_cleaned_2.csv")
print(FILE_TEST)

# Directory handed to the estimator as model_dir (checkpoints are written there)
PATH_GRAPH = os.path.join(PATH, "model")

# Rows per batch produced by pn_input_fn
BATCH_SIZE = 1000
# Number of passes over the training file requested from pn_input_fn
TRAIN_EPOCHS = 500

# Emit INFO-level TensorFlow log records so training/evaluation progress shows up in the cell output
tf.logging.set_verbosity(tf.logging.INFO)

C:/Wks/GitHub/MeTaNoV/PubNativeTest\data\training_cleaned_2.csv
C:/Wks/GitHub/MeTaNoV/PubNativeTest\data\validation_cleaned_2.csv


In [4]:
# Let's list the features that we will use, in accordance with our cleaned datasets.
# csv_defaults is handed to tf.decode_csv in pn_input_fn: one default per parsed column,
# the last entry ('yes.') belonging to the label column, which is split off from the features.
# Earlier, wider feature set (kept for reference; 'v4', 'v6' and 'v14' are no longer used):
# feature_names = ['v1','v2','v3','v4','v6','v7','v8','v9','v10','v11','v12','v14','v15']
# csv_defaults = [['b'],[0.],[0.],['u'],['c'],['v'],[0.],['t'],['t'],[0],['t'],[0],[0],['yes.']]
feature_names = ['v1','v2','v3','v7','v8','v9','v10','v11','v12','v15']
csv_defaults = [['b'],[0.],[0.],['v'],[0.],['t'],['t'],[0],['t'],[0],['yes.']]

# Our classifiers are using an input_fn to get batch input in their respective training/evaluation/prediction phase
def pn_input_fn(file_path, perform_shuffle=False, repeat_count=1,
                batch_size=None, shuffle_buffer_size=512):
    """Build the batched (features, labels) tensors read from a CSV file.

    Args:
        file_path: Path of the CSV file; its first line is skipped as a header.
        perform_shuffle: Any truthy value enables a buffered shuffle of the rows.
        repeat_count: Number of passes over the file, forwarded to `Dataset.repeat`.
        batch_size: Rows per batch; `None` (default) uses the module-level `BATCH_SIZE`.
        shuffle_buffer_size: Buffer size forwarded to `Dataset.shuffle` (default 512,
            the value that was previously hard-coded).

    Returns:
        A `(batch_features, batch_labels)` tuple obtained from a one-shot iterator:
        a dict mapping each name in `feature_names` to its batch tensor, and the
        batch tensor of labels.
    """
    def decode_csv(line):
        """Convert a CSV row to a dictionary of features and a label"""
        parsed_line = tf.decode_csv(line, csv_defaults)
        # The last parsed column is the label; everything before it is a feature
        features, label = parsed_line[:-1], parsed_line[-1]
        return dict(zip(feature_names, features)), label

    dataset = (tf.data.TextLineDataset(file_path)
               .skip(1)  # Skip header row
               .map(decode_csv))
    # Truthiness test instead of `is True`, so values such as 1 or numpy booleans
    # also enable shuffling rather than being silently ignored.
    if perform_shuffle:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.repeat(repeat_count)  # Repeats dataset this # times
    # BATCH_SIZE is looked up at call time, exactly as before, unless overridden
    dataset = dataset.batch(BATCH_SIZE if batch_size is None else batch_size)

    iterator = dataset.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()

    return batch_features, batch_labels

In [5]:
# Let's now create our features columns as described in the data analysis
# Since a DNN only accept Dense columns, we will wrap our Categorical or Bucketized columns into indicator columns
# If we had buckets with big sizes, we would have used embeddings columns instead
def _vocabulary_indicator(key, vocabulary):
    """Indicator column wrapping a categorical column with a fixed vocabulary list."""
    return tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key=key,
            vocabulary_list=vocabulary))


def _bucketized_indicator(key, boundaries):
    """Indicator column wrapping a numeric column bucketized at `boundaries`."""
    return tf.feature_column.indicator_column(
        tf.feature_column.bucketized_column(
            tf.feature_column.numeric_column(key),
            boundaries))


feature_columns = [
    _vocabulary_indicator("v1", ["a", "b"]),
    tf.feature_column.numeric_column('v2'),
    tf.feature_column.numeric_column('v3'),
    # 'v4' and 'v6' are currently not used; their definitions are kept for reference:
    # _vocabulary_indicator("v4", ["u", "y", "l"]),
    # _vocabulary_indicator(
    #     "v6", ["c", "q", "W", "cc", "x", "aa", "i", "m", "k", "e", "ff", "d", "j", "r"]),
    _vocabulary_indicator("v7", ["v", "h", "bb", "ff", "z", "j", "n", "dd", "o"]),
    tf.feature_column.numeric_column('v8'),
    _vocabulary_indicator("v9", ["f", "t"]),
    _vocabulary_indicator("v10", ["f", "t"]),
    _bucketized_indicator("v11", list(np.linspace(1., 20., 20))),
    _vocabulary_indicator("v12", ["f", "t"]),
    # 'v14' is currently not used either:
    # _bucketized_indicator("v14", list(np.linspace(50., 500., 10))),
    _bucketized_indicator("v15", list(np.linspace(500., 5000., 10))),
]

In [6]:
# Now we can define our classifier(s) and perform the associated training and evaluation
# Note: by default, it performs a binary classification, which fits our need, we will just give the label vocabulary
# since we didn't transform our label column previously

# For the linear classifier, we obtain an accuracy in training of 95% and an evaluation accuracy of 67%
# It is not a surprise that the linear classifier is not able to train properly and evaluate even more poorly,
# but we had to quickly assert it.
# classifier = tf.estimator.LinearClassifier(
#     feature_columns=feature_columns,
#     label_vocabulary=["yes.", "no."],
#     model_dir=PATH_GRAPH)


def _train_input_fn():
    """Shuffled batches from FILE_TRAIN, repeated for TRAIN_EPOCHS passes."""
    return pn_input_fn(FILE_TRAIN, True, TRAIN_EPOCHS)


# To be able to capture the non-linearity of the given dataset, let's then focus on a DNN architecture
# Options that were tried and are currently disabled:
#     dropout=0.5,
#     optimizer=tf.train.ProximalAdagradOptimizer(
#       learning_rate=0.1,
#       l1_regularization_strength=0.001
#     ),
classifier = tf.estimator.DNNClassifier(
    hidden_units=[512, 256, 128],
    feature_columns=feature_columns,
    label_vocabulary=["yes.", "no."],
    model_dir=PATH_GRAPH)

# NOTE(review): the captured log shows parameters being restored from an existing
# checkpoint in PATH_GRAPH, so re-running this cell continues the previous training
# run instead of starting from scratch.
training_results = classifier.train(input_fn=_train_input_fn)
tf.logging.info("Training done!")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:/Wks/GitHub/MeTaNoV/PubNativeTest\\model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000027CDB80F278>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from C:/Wks/GitHub/MeTaNoV/PubNativeTest\model\model.ckpt-17610
INFO:tensorflow:Saving checkpoints for 17611 into C:/Wks/GitHub/MeTaNoV/PubNativeTest\model\model.ckpt.
INFO:tensorflow:loss = 0.047742862, step = 17611
INFO:tensorflow:global_step/sec: 77.6164
INFO:tensorflow:loss = 0.0027925815, step = 17711 (1

INFO:tensorflow:loss = 0.00866994, step = 24811 (1.203 sec)
INFO:tensorflow:global_step/sec: 82.493
INFO:tensorflow:loss = 0.00024839828, step = 24911 (1.212 sec)
INFO:tensorflow:global_step/sec: 82.4329
INFO:tensorflow:loss = 0.0038940876, step = 25011 (1.212 sec)
INFO:tensorflow:global_step/sec: 85.0973
INFO:tensorflow:loss = 0.004941139, step = 25111 (1.175 sec)
INFO:tensorflow:global_step/sec: 81.283
INFO:tensorflow:loss = 0.00695542, step = 25211 (1.230 sec)
INFO:tensorflow:global_step/sec: 82.4929
INFO:tensorflow:loss = 0.00067919167, step = 25311 (1.212 sec)
INFO:tensorflow:global_step/sec: 80.5788
INFO:tensorflow:loss = 0.0063858423, step = 25411 (1.241 sec)
INFO:tensorflow:global_step/sec: 80.6911
INFO:tensorflow:loss = 0.010834886, step = 25511 (1.240 sec)
INFO:tensorflow:global_step/sec: 80.6257
INFO:tensorflow:loss = 0.0007967269, step = 25611 (1.239 sec)
INFO:tensorflow:global_step/sec: 79.4695
INFO:tensorflow:loss = 0.0036963744, step = 25711 (1.260 sec)
INFO:tensorflow:g

INFO:tensorflow:loss = 0.0022189138, step = 32811 (1.494 sec)
INFO:tensorflow:global_step/sec: 66.7117
INFO:tensorflow:loss = 0.0023969088, step = 32911 (1.499 sec)
INFO:tensorflow:global_step/sec: 69.1158
INFO:tensorflow:loss = 0.0027298592, step = 33011 (1.450 sec)
INFO:tensorflow:global_step/sec: 68.0315
INFO:tensorflow:loss = 0.038079202, step = 33111 (1.468 sec)
INFO:tensorflow:global_step/sec: 63.8094
INFO:tensorflow:loss = 0.0003500441, step = 33211 (1.567 sec)
INFO:tensorflow:global_step/sec: 73.6046
INFO:tensorflow:loss = 0.0002462499, step = 33311 (1.358 sec)
INFO:tensorflow:global_step/sec: 75.3279
INFO:tensorflow:loss = 0.0018636029, step = 33411 (1.328 sec)
INFO:tensorflow:global_step/sec: 77.3801
INFO:tensorflow:loss = 0.0003939586, step = 33511 (1.292 sec)
INFO:tensorflow:global_step/sec: 73.673
INFO:tensorflow:loss = 0.00012879932, step = 33611 (1.357 sec)
INFO:tensorflow:global_step/sec: 66.4894
INFO:tensorflow:loss = 0.001975031, step = 33711 (1.505 sec)
INFO:tensorfl

INFO:tensorflow:global_step/sec: 78.597
INFO:tensorflow:loss = 0.00016290527, step = 40811 (1.274 sec)
INFO:tensorflow:global_step/sec: 80.0434
INFO:tensorflow:loss = 0.00037415436, step = 40911 (1.248 sec)
INFO:tensorflow:global_step/sec: 68.5276
INFO:tensorflow:loss = 0.001393319, step = 41011 (1.462 sec)
INFO:tensorflow:global_step/sec: 74.8192
INFO:tensorflow:loss = 0.0015508217, step = 41111 (1.335 sec)
INFO:tensorflow:global_step/sec: 74.2069
INFO:tensorflow:loss = 0.0015945348, step = 41211 (1.350 sec)
INFO:tensorflow:global_step/sec: 68.171
INFO:tensorflow:loss = 0.00067327416, step = 41311 (1.468 sec)
INFO:tensorflow:global_step/sec: 68.311
INFO:tensorflow:loss = 0.00017130563, step = 41411 (1.461 sec)
INFO:tensorflow:global_step/sec: 64.0554
INFO:tensorflow:loss = 0.0002291294, step = 41511 (1.561 sec)
INFO:tensorflow:global_step/sec: 64.0141
INFO:tensorflow:loss = 0.0002150414, step = 41611 (1.562 sec)
INFO:tensorflow:global_step/sec: 58.6669
INFO:tensorflow:loss = 0.0006344

INFO:tensorflow:loss = 0.00025058887, step = 48711 (1.479 sec)
INFO:tensorflow:global_step/sec: 69.3079
INFO:tensorflow:loss = 0.00047316973, step = 48811 (1.444 sec)
INFO:tensorflow:global_step/sec: 74.7632
INFO:tensorflow:loss = 0.0006666596, step = 48911 (1.338 sec)
INFO:tensorflow:global_step/sec: 75.1576
INFO:tensorflow:loss = 0.00060277816, step = 49011 (1.331 sec)
INFO:tensorflow:global_step/sec: 72.1143
INFO:tensorflow:loss = 0.00032566884, step = 49111 (1.386 sec)
INFO:tensorflow:global_step/sec: 71.0357
INFO:tensorflow:loss = 0.0002077094, step = 49211 (1.409 sec)
INFO:tensorflow:global_step/sec: 70.5832
INFO:tensorflow:loss = 0.00036333807, step = 49311 (1.417 sec)
INFO:tensorflow:global_step/sec: 63.1228
INFO:tensorflow:loss = 0.00022138639, step = 49411 (1.583 sec)
INFO:tensorflow:global_step/sec: 64.3861
INFO:tensorflow:loss = 0.00021429663, step = 49511 (1.553 sec)
INFO:tensorflow:global_step/sec: 74.5546
INFO:tensorflow:loss = 0.000854789, step = 49611 (1.341 sec)
INFO:

INFO:tensorflow:global_step/sec: 57.985
INFO:tensorflow:loss = 0.0017274846, step = 56711 (1.727 sec)
INFO:tensorflow:global_step/sec: 61.5263
INFO:tensorflow:loss = 0.00094212114, step = 56811 (1.624 sec)
INFO:tensorflow:global_step/sec: 69.3561
INFO:tensorflow:loss = 0.0003596938, step = 56911 (1.440 sec)
INFO:tensorflow:global_step/sec: 65.4423
INFO:tensorflow:loss = 0.0002470036, step = 57011 (1.529 sec)
INFO:tensorflow:global_step/sec: 62.6865
INFO:tensorflow:loss = 2.1735303e-05, step = 57111 (1.596 sec)
INFO:tensorflow:global_step/sec: 58.495
INFO:tensorflow:loss = 0.00024293768, step = 57211 (1.710 sec)
INFO:tensorflow:global_step/sec: 55.4695
INFO:tensorflow:loss = 0.00059779204, step = 57311 (1.802 sec)
INFO:tensorflow:global_step/sec: 64.1377
INFO:tensorflow:loss = 0.0011286768, step = 57411 (1.560 sec)
INFO:tensorflow:global_step/sec: 54.8457
INFO:tensorflow:loss = 0.00020632135, step = 57511 (1.822 sec)
INFO:tensorflow:global_step/sec: 48.2648
INFO:tensorflow:loss = 0.0001

INFO:tensorflow:loss = 0.00037127885, step = 64511 (1.522 sec)
INFO:tensorflow:global_step/sec: 66.3567
INFO:tensorflow:loss = 0.00040343916, step = 64611 (1.507 sec)
INFO:tensorflow:global_step/sec: 65.9632
INFO:tensorflow:loss = 0.00013698102, step = 64711 (1.516 sec)
INFO:tensorflow:global_step/sec: 58.6671
INFO:tensorflow:loss = 4.8124024e-05, step = 64811 (1.705 sec)
INFO:tensorflow:global_step/sec: 59.9481
INFO:tensorflow:loss = 0.00032889415, step = 64911 (1.668 sec)
INFO:tensorflow:global_step/sec: 65.8235
INFO:tensorflow:loss = 0.00019281718, step = 65011 (1.518 sec)
INFO:tensorflow:global_step/sec: 68.8771
INFO:tensorflow:loss = 0.00022577737, step = 65111 (1.454 sec)
INFO:tensorflow:global_step/sec: 68.0315
INFO:tensorflow:loss = 6.81236e-05, step = 65211 (1.470 sec)
INFO:tensorflow:global_step/sec: 66.6517
INFO:tensorflow:loss = 0.00015093337, step = 65311 (1.499 sec)
INFO:tensorflow:global_step/sec: 62.2949
INFO:tensorflow:loss = 0.00045068812, step = 65411 (1.605 sec)
INF

INFO:tensorflow:global_step/sec: 57.1215
INFO:tensorflow:loss = 0.00057573034, step = 72511 (1.753 sec)
INFO:tensorflow:global_step/sec: 54.5591
INFO:tensorflow:loss = 0.00016538141, step = 72611 (1.831 sec)
INFO:tensorflow:global_step/sec: 77.1937
INFO:tensorflow:loss = 6.315611e-05, step = 72711 (1.294 sec)
INFO:tensorflow:global_step/sec: 77.1936
INFO:tensorflow:loss = 0.00030169342, step = 72811 (1.296 sec)
INFO:tensorflow:global_step/sec: 74.0417
INFO:tensorflow:loss = 0.00017944304, step = 72911 (1.351 sec)
INFO:tensorflow:global_step/sec: 73.7132
INFO:tensorflow:loss = 0.00045882494, step = 73011 (1.359 sec)
INFO:tensorflow:global_step/sec: 66.9357
INFO:tensorflow:loss = 0.00018287, step = 73111 (1.493 sec)
INFO:tensorflow:global_step/sec: 73.1188
INFO:tensorflow:loss = 0.0002695566, step = 73211 (1.368 sec)
INFO:tensorflow:global_step/sec: 70.9346
INFO:tensorflow:loss = 0.00029395093, step = 73311 (1.410 sec)
INFO:tensorflow:global_step/sec: 73.4419
INFO:tensorflow:loss = 0.000

INFO:tensorflow:loss = 8.214471e-05, step = 80411 (1.501 sec)
INFO:tensorflow:global_step/sec: 68.9247
INFO:tensorflow:loss = 0.00019896963, step = 80511 (1.449 sec)
INFO:tensorflow:global_step/sec: 58.3241
INFO:tensorflow:loss = 0.0002550467, step = 80611 (1.721 sec)
INFO:tensorflow:global_step/sec: 52.7414
INFO:tensorflow:loss = 0.00039693623, step = 80711 (1.893 sec)
INFO:tensorflow:global_step/sec: 61.9083
INFO:tensorflow:loss = 0.00010085904, step = 80811 (1.613 sec)
INFO:tensorflow:global_step/sec: 68.0314
INFO:tensorflow:loss = 0.00017875433, step = 80911 (1.470 sec)
INFO:tensorflow:global_step/sec: 69.6955
INFO:tensorflow:loss = 0.00030356477, step = 81011 (1.434 sec)
INFO:tensorflow:global_step/sec: 73.877
INFO:tensorflow:loss = 9.153331e-05, step = 81111 (1.355 sec)
INFO:tensorflow:global_step/sec: 66.4008
INFO:tensorflow:loss = 5.6868597e-05, step = 81211 (1.506 sec)
INFO:tensorflow:global_step/sec: 72.1145
INFO:tensorflow:loss = 0.00014029798, step = 81311 (1.387 sec)
INFO:

In [6]:
def _evaluate_and_log(csv_path, heading):
    """Evaluate `classifier` on one pass over `csv_path`, log every metric, return the metrics dict."""
    metrics = classifier.evaluate(
        input_fn=lambda: pn_input_fn(csv_path, False, 1)
    )
    tf.logging.info(heading)
    for metric_name, metric_value in metrics.items():
        tf.logging.info("   {}, was: {}".format(metric_name, metric_value))
    return metrics


# First measure how well the model fits the data it was trained on
training_results = _evaluate_and_log(FILE_TRAIN, "Training results:")

# Using a DNN Classifier, we were able to train our model properly
# Let's now confirm that our model is valid with the evaluation phase on our validation dataset
evaluate_results = _evaluate_and_log(FILE_TEST, "Evaluation results:")

INFO:tensorflow:Starting evaluation at 2018-02-05-10:44:23
INFO:tensorflow:Restoring parameters from C:/Wks/GitHub/MeTaNoV/PubNativeTest\graphs\model.ckpt-17610
INFO:tensorflow:Finished evaluation at 2018-02-05-10:44:24
INFO:tensorflow:Saving dict for global step 17610: accuracy = 0.99971604, accuracy_baseline = 0.9267462, auc = 1.0, auc_precision_recall = 1.0, average_loss = 0.0005681569, global_step = 17610, label/mean = 0.07325383, loss = 0.055584684, prediction/mean = 0.0730703
INFO:tensorflow:Training results:
INFO:tensorflow:   accuracy, was: 0.99971604347229
INFO:tensorflow:   accuracy_baseline, was: 0.9267461895942688
INFO:tensorflow:   auc, was: 1.0
INFO:tensorflow:   auc_precision_recall, was: 1.0
INFO:tensorflow:   average_loss, was: 0.0005681568873114884
INFO:tensorflow:   label/mean, was: 0.07325383275747299
INFO:tensorflow:   loss, was: 0.055584684014320374
INFO:tensorflow:   prediction/mean, was: 0.07307030260562897
INFO:tensorflow:   global_step, was: 17610
INFO:tensorf

*Conclusion1*: After several rounds of fine-tuning, we managed to reach an accuracy of 82% and an AUC precision/recall of 0.88, which is encouraging: as we mentioned in the data analysis, the model learned from an unbalanced dataset and still managed to perform rather well on the validation set. We will do a deeper analysis of the data in `Deeper_Data_Exploration.ipynb`.

*Conclusion2*: The second analysis led us to ignore some more features, namely `v4`, `v6` and `v14`, which enabled us to reach an accuracy of nearly 85% and an AUC precision/recall of 0.91.

*Final Conclusion*: A better understanding of the data, i.e. knowing what the given features correspond to, would certainly help; we could perhaps derive some better features from them. Moreover, as we mentioned in the first data analysis, the training and validation datasets are not balanced equally, and therefore the model is not able to correct this bias, which was probably introduced in the training phase. Also, having more data would certainly help further.
