In [1]:
import os
import numpy as np
import tensorflow as tf

# Check that a recent enough TensorFlow is installed.
# The comparison is done on the numeric (major, minor) pair: comparing the raw
# version strings would rank e.g. "1.10.0" below "1.4" and reject a valid install.
tf_version = tf.__version__
print("TensorFlow version: {}".format(tf_version))
_tf_major_minor = tuple(int(part) for part in tf_version.split(".")[:2])
assert _tf_major_minor >= (1, 4), "TensorFlow r1.4 or later is needed"


TensorFlow version: 1.5.0


In [2]:
# Project root. The shell's $PWD is used when available (as before); PWD is not
# guaranteed to be set for every kernel, so fall back to the process working
# directory instead of failing with a KeyError.
PATH = os.environ.get('PWD') or os.getcwd()

# Input CSV files (training and validation sets).
PATH_DATA = os.path.join(PATH, "data")
FILE_TRAIN = os.path.join(PATH_DATA, "training_cleaned_2.csv")
print(FILE_TRAIN)
FILE_TEST = os.path.join(PATH_DATA, "validation_cleaned_2.csv")
print(FILE_TEST)

# Directory handed to the estimator as its model_dir.
PATH_GRAPH = os.path.join(PATH, "graphs")

BATCH_SIZE = 100    # rows per batch produced by the input function
TRAIN_EPOCHS = 500  # number of passes over the training file

tf.logging.set_verbosity(tf.logging.INFO)

C:/Wks/GitHub/MeTaNoV/PubNativeTest\data\training_cleaned_2.csv
C:/Wks/GitHub/MeTaNoV/PubNativeTest\data\validation_cleaned_2.csv


In [3]:
# Columns of the cleaned datasets that we feed to the model, in file order.
# Each entry pairs a feature name with the default handed to tf.decode_csv for it.
# Previous (wider) column set, kept for reference:
# feature_names = ['v1','v2','v3','v4','v6','v7','v8','v9','v10','v11','v12','v14','v15']
# csv_defaults = [['b'],[0.],[0.],['u'],['c'],['v'],[0.],['t'],['t'],[0],['t'],[0],[0],['yes.']]
_csv_feature_spec = [
    ('v1', ['b']),
    ('v2', [0.]),
    ('v3', [0.]),
    ('v7', ['v']),
    ('v8', [0.]),
    ('v9', ['t']),
    ('v10', ['t']),
    ('v11', [0]),
    ('v12', ['t']),
    ('v15', [0]),
]
feature_names = [name for name, _ in _csv_feature_spec]
# The label is the last CSV column, so its default comes after all the feature defaults.
csv_defaults = [default for _, default in _csv_feature_spec] + [['yes.']]

# Our classifiers use an input_fn to get batched input in their respective training/evaluation/prediction phases
def pn_input_fn(file_path, perform_shuffle=False, repeat_count=1, batch_size=None):
    """Build the batched (features, labels) tensors read from a CSV file.

    Args:
        file_path: path of the CSV file; its first line is skipped as a header.
        perform_shuffle: when truthy, rows are shuffled through a 512-element buffer.
        repeat_count: value passed to `Dataset.repeat`, i.e. how many times the
            file is iterated.
        batch_size: number of rows per batch; defaults to the module-level
            BATCH_SIZE when left to None.

    Returns:
        A `(batch_features, batch_labels)` pair, where `batch_features` is a dict
        keyed by `feature_names` and `batch_labels` holds the last CSV column.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    def decode_csv(line):
        """Convert a CSV row to a dictionary of features and a label"""
        parsed_line = tf.decode_csv(line, csv_defaults)
        # The last column is the label; everything before it is a feature.
        features, label = parsed_line[:-1], parsed_line[-1]
        return dict(zip(feature_names, features)), label

    dataset = (tf.data.TextLineDataset(file_path)
               .skip(1)  # Skip header row
               .map(decode_csv))
    # Truthiness test (not `is True`) so that flags such as 1 or numpy booleans
    # are honoured instead of being silently ignored.
    if perform_shuffle:
        dataset = dataset.shuffle(buffer_size=512)
    dataset = dataset.repeat(repeat_count)  # Repeats dataset this # times
    dataset = dataset.batch(batch_size)

    iterator = dataset.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()

    return batch_features, batch_labels

In [4]:
# Feature columns, as described in the data analysis.
# A DNN only accepts dense columns, so every categorical or bucketized column is
# wrapped into an indicator column. With large vocabularies / many buckets we
# would have used embedding columns instead.
def _indicator_from_vocabulary(key, vocabulary):
    """Indicator column over the categorical feature `key` with a fixed vocabulary."""
    categorical = tf.feature_column.categorical_column_with_vocabulary_list(
        key=key,
        vocabulary_list=vocabulary)
    return tf.feature_column.indicator_column(categorical)


def _indicator_from_buckets(key, boundaries):
    """Indicator column over the numeric feature `key` bucketized at `boundaries`."""
    bucketized = tf.feature_column.bucketized_column(
        tf.feature_column.numeric_column(key),
        list(boundaries))
    return tf.feature_column.indicator_column(bucketized)


feature_columns = [
    _indicator_from_vocabulary("v1", ["a", "b"]),
    tf.feature_column.numeric_column('v2'),
    tf.feature_column.numeric_column('v3'),
    # Disabled: _indicator_from_vocabulary("v4", ["u", "y", "l"]),
    # Disabled: _indicator_from_vocabulary(
    #     "v6", ["c", "q", "W", "cc", "x", "aa", "i", "m", "k", "e", "ff", "d", "j", "r"]),
    _indicator_from_vocabulary("v7", ["v", "h", "bb", "ff", "z", "j", "n", "dd", "o"]),
    tf.feature_column.numeric_column('v8'),
    _indicator_from_vocabulary("v9", ["f", "t"]),
    _indicator_from_vocabulary("v10", ["f", "t"]),
    _indicator_from_buckets('v11', np.linspace(1., 20., 20)),
    _indicator_from_vocabulary("v12", ["f", "t"]),
    # Disabled: _indicator_from_buckets('v14', np.linspace(50., 500., 10)),
    _indicator_from_buckets('v15', np.linspace(500., 5000., 10)),
]

In [5]:
# Define the classifier and train it.
# The estimator performs binary classification by default, which is what we need;
# we only hand it the label vocabulary because the label column was left as text.

# First attempt, a linear classifier: 95% accuracy in training but only 67% in
# evaluation. Unsurprisingly it neither trains properly nor generalizes, but we
# had to quickly assert it.
# classifier = tf.estimator.LinearClassifier(
#     feature_columns=feature_columns,
#     label_vocabulary=["yes.", "no."],
#     model_dir=PATH_GRAPH)


def _train_input_fn():
    """Shuffled training batches, iterating TRAIN_EPOCHS times over the training file."""
    return pn_input_fn(FILE_TRAIN, perform_shuffle=True, repeat_count=TRAIN_EPOCHS)


# A DNN is used to capture the non-linearity of the dataset.
# Options tried but currently disabled:
#     dropout=0.5,
#     optimizer=tf.train.ProximalAdagradOptimizer(
#       learning_rate=0.1,
#       l1_regularization_strength=0.001
#     ),
classifier = tf.estimator.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[512, 256, 128],
    label_vocabulary=["yes.", "no."],
    model_dir=PATH_GRAPH,
)

# NOTE(review): per the TensorFlow docs `train` returns the estimator itself, not
# metrics, so `training_results` only holds real metrics once `evaluate` is called.
training_results = classifier.train(input_fn=_train_input_fn)
tf.logging.info("Training done!")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:/Wks/GitHub/MeTaNoV/PubNativeTest\\graphs', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001670C892828>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:/Wks/GitHub/MeTaNoV/PubNativeTest\graphs\model.ckpt.
INFO:tensorflow:loss = 70.77728, step = 1
INFO:tensorflow:global_step/sec: 76.896
INFO:tensorflow:loss = 0.40806323, step = 101 (1.302 sec)
INFO:tensorflow:global_step/sec: 81.2828
INFO:tensorflow:loss = 1.2597532, step = 201 (1.230 sec)
INFO:te

INFO:tensorflow:loss = 0.04247649, step = 7501 (1.170 sec)
INFO:tensorflow:global_step/sec: 81.8163
INFO:tensorflow:loss = 0.08897162, step = 7601 (1.222 sec)
INFO:tensorflow:global_step/sec: 83.8102
INFO:tensorflow:loss = 0.014997469, step = 7701 (1.194 sec)
INFO:tensorflow:global_step/sec: 84.2349
INFO:tensorflow:loss = 0.06940451, step = 7801 (1.187 sec)
INFO:tensorflow:global_step/sec: 81.2167
INFO:tensorflow:loss = 0.1349801, step = 7901 (1.231 sec)
INFO:tensorflow:global_step/sec: 81.7492
INFO:tensorflow:loss = 0.1704191, step = 8001 (1.224 sec)
INFO:tensorflow:global_step/sec: 83.3897
INFO:tensorflow:loss = 0.08201565, step = 8101 (1.198 sec)
INFO:tensorflow:global_step/sec: 83.3897
INFO:tensorflow:loss = 0.046640746, step = 8201 (1.199 sec)
INFO:tensorflow:global_step/sec: 80.5606
INFO:tensorflow:loss = 0.027927004, step = 8301 (1.240 sec)
INFO:tensorflow:global_step/sec: 81.8834
INFO:tensorflow:loss = 0.030244626, step = 8401 (1.222 sec)
INFO:tensorflow:global_step/sec: 77.978

INFO:tensorflow:loss = 0.0016612812, step = 15601 (1.234 sec)
INFO:tensorflow:global_step/sec: 80.0434
INFO:tensorflow:loss = 0.0010581437, step = 15701 (1.250 sec)
INFO:tensorflow:global_step/sec: 81.2829
INFO:tensorflow:loss = 0.0015893449, step = 15801 (1.230 sec)
INFO:tensorflow:global_step/sec: 81.8835
INFO:tensorflow:loss = 0.00689167, step = 15901 (1.219 sec)
INFO:tensorflow:global_step/sec: 83.8101
INFO:tensorflow:loss = 0.050842423, step = 16001 (1.193 sec)
INFO:tensorflow:global_step/sec: 81.0846
INFO:tensorflow:loss = 0.017099693, step = 16101 (1.234 sec)
INFO:tensorflow:global_step/sec: 82.8357
INFO:tensorflow:loss = 0.0014028571, step = 16201 (1.206 sec)
INFO:tensorflow:global_step/sec: 78.966
INFO:tensorflow:loss = 0.018045746, step = 16301 (1.267 sec)
INFO:tensorflow:global_step/sec: 84.9524
INFO:tensorflow:loss = 0.035977542, step = 16401 (1.176 sec)
INFO:tensorflow:global_step/sec: 85.3158
INFO:tensorflow:loss = 0.00093148724, step = 16501 (1.173 sec)
INFO:tensorflow:g

In [6]:
def _evaluate_and_log(title, file_path):
    """Evaluate `classifier` on one unshuffled pass over `file_path` and log each metric.

    `title` is logged first, followed by one line per entry of the metrics dict,
    which is then returned.
    """
    metrics = classifier.evaluate(
        input_fn=lambda: pn_input_fn(file_path, False, 1)
    )
    tf.logging.info(title)
    for metric_name, metric_value in metrics.items():
        tf.logging.info("   {}, was: {}".format(metric_name, metric_value))
    return metrics


# First check how well the DNN classifier fits the data it was trained on...
training_results = _evaluate_and_log("Training results:", FILE_TRAIN)

# ...then confirm that the model is valid by evaluating it on the validation dataset.
evaluate_results = _evaluate_and_log("Evaluation results:", FILE_TEST)

INFO:tensorflow:Starting evaluation at 2018-02-05-10:44:23
INFO:tensorflow:Restoring parameters from C:/Wks/GitHub/MeTaNoV/PubNativeTest\graphs\model.ckpt-17610
INFO:tensorflow:Finished evaluation at 2018-02-05-10:44:24
INFO:tensorflow:Saving dict for global step 17610: accuracy = 0.99971604, accuracy_baseline = 0.9267462, auc = 1.0, auc_precision_recall = 1.0, average_loss = 0.0005681569, global_step = 17610, label/mean = 0.07325383, loss = 0.055584684, prediction/mean = 0.0730703
INFO:tensorflow:Training results:
INFO:tensorflow:   accuracy, was: 0.99971604347229
INFO:tensorflow:   accuracy_baseline, was: 0.9267461895942688
INFO:tensorflow:   auc, was: 1.0
INFO:tensorflow:   auc_precision_recall, was: 1.0
INFO:tensorflow:   average_loss, was: 0.0005681568873114884
INFO:tensorflow:   label/mean, was: 0.07325383275747299
INFO:tensorflow:   loss, was: 0.055584684014320374
INFO:tensorflow:   prediction/mean, was: 0.07307030260562897
INFO:tensorflow:   global_step, was: 17610
INFO:tensorf

*Conclusion1*: After several rounds of fine-tuning, we managed to reach an accuracy of 82% and an AUC precision/recall of 0.88. This is encouraging since, as we mentioned in the data analysis, the model learned from an unbalanced dataset and still managed to perform rather well on the validation set. We will do a deeper analysis of the data in `Deeper_Data_Exploration.ipynb`.

*Conclusion2*: The second analysis led us to drop some more features, namely `v4`, `v6` and `v14`, which enabled us to reach an accuracy of nearly 85% and an AUC precision/recall of 0.91.

Worth noting:
=> v10 = (v11 != 0)

*Final Conclusion*: A better understanding of the data, i.e. knowing what the given features correspond to, would certainly help: we could perhaps derive better features from them. Moreover, as we mentioned in the first data analysis, the training and validation datasets are not balanced in the same way, and therefore the model is not able to correct the bias probably introduced during the training phase. Having more data would certainly help further.
