In [2]:
#import necessary libraries
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as  tf 
import pandas as pd 
import numpy as np 
import sys
import os

tf.logging.set_verbosity(tf.logging.INFO)

In [3]:
#setting hyperparameters
BATCH_SIZE = 32
epochs = 1000
PATH_DATASET = '<path_to_dataset>'

FILE_TRAIN = PATH_DATASET + "SCALED_TRAINING_DATA.csv"
FILE_TEST  = PATH_DATASET + "SCALED_TEST_DATA.csv"

In [4]:
CSV_COLUMN_NAMES, numerical_features, categorical_features = [], [], []

#read the columns
CSV_COLUMN_NAMES = pd.read_csv(FILE_TRAIN, nrows=1).columns.tolist()
train = pd.read_csv(FILE_TRAIN, names= CSV_COLUMN_NAMES, header=0)
test = pd.read_csv(FILE_TEST, names= CSV_COLUMN_NAMES, header=0)

In [5]:
#select the columns we need
COLUMNS_WE_AVOID = ['K_FILTER_START', 'Total_Run_(hrs)', 'Unnamed: 0']
COLUMNS_WE_NEED = []

for i in CSV_COLUMN_NAMES:

	if i not in COLUMNS_WE_AVOID:
		COLUMNS_WE_NEED.append(i)

FEATURES = COLUMNS_WE_NEED[:-1]
LABEL = COLUMNS_WE_NEED.pop()

In [6]:
#now set the X_trian, X_test
train_x, train_y = train[FEATURES], train[LABEL]
test_x, test_y = test[FEATURES], test[LABEL]

In [7]:
print("Input Features:{}".format(train_x.shape))
print("Label Output:{}".format(train_y.shape))

Input Features:(338, 52)
Label Output:(338,)


In [8]:
print("Test Input Features:{}".format(test_x.shape))
print("Test Label Output:{}".format(test_y.shape))

Test Input Features:(41, 52)
Test Label Output:(41,)


In [9]:
#get the column type and store them in an array
for column in train_x.columns:
	#write conditions here
	if (train_x[column].dtype == np.float64 or train_x[column].dtype == np.int64):
		numerical_features.append(column) #append to the numerical features
	else:
		categorical_features.append(column) #append to the categorical features
        
#building feature columns
feature_columns = [tf.feature_column.numeric_column(k) for k in numerical_features]

#get the unique key values for the categorical variable
for k in categorical_features:
	#get the unique values
	current_bucket = train_x[k].nunique()
	if current_bucket > 10:
		feature_columns.append(
			tf.feature_column.indicator_column(
				tf.feature_column.categorical_column_with_vocabulary_list(
					key = k,
					vocabulary_list = train_x[k].unique()
				)
			)
		)

	else:
		feature_columns.append(
			tf.feature_column.indicator_column(
				# for columns that we want the the library to automatically map values for us
				tf.feature_column.categorical_column_with_hash_bucket(
					key=k,
					vocabulary_list = train[k].unique()
				)
			)
		)

In [10]:
def train_input_fn(features, labels, batch_size, epochs):
	#Slice the tensor into single type.
	#for eg: <name=Tensor0, Shape=(?,1)>
	dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
	dataset = dataset.shuffle(256).repeat(epochs).batch(batch_size)

	return dataset

In [11]:
def eval_input_fn(features, labels, batch_size):
	#get features as dictionary
	features = dict(features)
	#if labels are none
	if labels is None:
		print("Entered into the loop because Label is None")
		inputs = features

	#if there is label, map the input features with labels 
	else:
		inputs = (features, labels)

	#convert into tensors
	dataset = tf.data.Dataset.from_tensor_slices(inputs)
	assert batch_size is not None, "BATCH SIZE MUST NOT BE NONE"
	#split the batchsize
	dataset = dataset.batch(batch_size)
	#return the dataset
	return dataset

In [12]:
def my_model_fn(features, labels, mode, params):

	if mode == tf.estimator.ModeKeys.PREDICT:
		tf.logging.info("My_Model_Fn: PREDICT, {}".format(mode))
	elif mode == tf.estimator.ModeKeys.EVAL:
		tf.logging.info("My_Model_Fn: EVAL, {}".format(mode))
	elif mode == tf.estimator.ModeKeys.TRAIN:
		tf.logging.info("My_Model_Fn: TRAIN, {}".format(mode))

	#setup the initializer
	initializer = tf.contrib.layers.xavier_initializer()
	regularizer = tf.contrib.layers.l2_regularizer(scale=0.1)

	input_layer = tf.feature_column.input_layer(features, feature_columns)
	h1 = tf.layers.Dense(100, activation=tf.nn.relu,
								kernel_regularizer= regularizer,
								kernel_initializer= initializer)(input_layer)

	h2 = tf.layers.Dense(80, activation=tf.nn.relu,
								kernel_regularizer= regularizer,
								kernel_initializer= initializer)(h1)

	h3 = tf.layers.Dense(80, activation=tf.nn.relu,
								kernel_regularizer= regularizer,
								kernel_initializer= initializer)(h2)

	logits = tf.layers.Dense(2)(h3)

	#compute predictions
	predicted_classes = tf.argmax(input= logits, axis=1)
	if mode == tf.estimator. ModeKeys.PREDICT:
		predictions = {
			'class_ids': predicted_classes[:, tf.newaxis],
			'probabilites': tf.nn.softmax(logits),
			'logits': logits
		}

		return tf.estimator.EstimatorSpec(mode, predictions=predictions)


	loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

	accuracy = tf.metrics.accuracy(labels= labels, predictions=predicted_classes, name='acc_op')
	precision = tf.metrics.precision(labels, predictions= predicted_classes, name='precision_op')
	recall = tf.metrics.recall(labels, predictions= predicted_classes, name='recall_op')
	auc = tf.metrics.auc(labels, predictions=predicted_classes, name='auc_op')

	metrics = {
		'accuracy': accuracy,
		'precision': precision,
		'recall': recall,
		'auc': auc
	}

	tf.summary.scalar('my_accuracy', accuracy[1])
	tf.summary.scalar('my_precision', precision[1])
	tf.summary.scalar('my_recall', recall[1])
	tf.summary.scalar('my_auc', auc[1])

	if mode == tf.estimator.ModeKeys.EVAL:

		return tf.estimator.EstimatorSpec(
			mode, loss=loss, eval_metric_ops=metrics)

	#training_op
	assert mode == tf.estimator.ModeKeys.TRAIN, "Train is the only Mode Key"
	optimizer = tf.train.AdagradOptimizer(learning_rate=0.0001)
	train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

	grads = optimizer.compute_gradients(loss)

	for grad, var in grads:
		if grad is not None:
			tf.summary.histogram(var.op.name + "/gradients", grad)

	for var in tf.trainable_variables():
		tf.summary.histogram(var.name, var)


	return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

In [13]:
def serving_input_fn():
   
    for i in FEATURES:
        feature_placeholders.update({i:tf.placeholder(tf.float32, [None])})

    
    features = {
        key: tf.expand_dims(tensor, -1)
        for key, tensor in feature_placeholders.items()
    }
    
    return tf.estimator.export.ServingInputReceiver(features, feature_placeholders)

In [14]:
MODEL_PATH = "<dir_to_save_tensorflow_model>"

classifier = tf.estimator.Estimator(
	model_fn = my_model_fn
)

classifier.train(input_fn=lambda: train_input_fn(train_x, train_y, BATCH_SIZE, epochs))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_summary_steps': 100, '_service': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000014396F34240>, '_device_fn': None, '_tf_random_seed': None, '_global_id_in_cluster': 0, '_is_chief': True, '_keep_checkpoint_every_n_hours': 10000, '_task_id': 0, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_log_step_count_steps': 100, '_master': '', '_num_ps_replicas': 0, '_train_distribute': None, '_num_worker_replicas': 1, '_evaluation_master': '', '_model_dir': 'C:\\Users\\MADHIV~1\\AppData\\Local\\Temp\\tmp9ppav3cz'}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:My_Model_Fn: TRAIN, train
INFO:tensorflow:Summary name dense/kernel:0 is illegal; using dense/kernel_0 instead.
INFO:tensorflow:Summary name dense/bias:0 is illegal; using dense/bias_0 instead.
INFO:tensorflow:Summary name dense

INFO:tensorflow:loss = 0.2672584, step = 6200 (0.585 sec)
INFO:tensorflow:global_step/sec: 173.69
INFO:tensorflow:loss = 0.12525794, step = 6300 (0.574 sec)
INFO:tensorflow:global_step/sec: 164.898
INFO:tensorflow:loss = 0.2827819, step = 6400 (0.607 sec)
INFO:tensorflow:global_step/sec: 173.413
INFO:tensorflow:loss = 0.20719725, step = 6500 (0.576 sec)
INFO:tensorflow:global_step/sec: 166.051
INFO:tensorflow:loss = 0.19833869, step = 6600 (0.602 sec)
INFO:tensorflow:global_step/sec: 181.095
INFO:tensorflow:loss = 0.10445, step = 6700 (0.552 sec)
INFO:tensorflow:global_step/sec: 164.403
INFO:tensorflow:loss = 0.17371762, step = 6800 (0.608 sec)
INFO:tensorflow:global_step/sec: 179.973
INFO:tensorflow:loss = 0.23528865, step = 6900 (0.557 sec)
INFO:tensorflow:global_step/sec: 168.245
INFO:tensorflow:loss = 0.27854252, step = 7000 (0.593 sec)
INFO:tensorflow:global_step/sec: 162.665
INFO:tensorflow:loss = 0.13983281, step = 7100 (0.615 sec)
INFO:tensorflow:global_step/sec: 167.368
INFO:t

<tensorflow.python.estimator.estimator.Estimator at 0x14396f34400>

In [15]:
evaluate_result = classifier.evaluate(input_fn=lambda: eval_input_fn(test_x, test_y, BATCH_SIZE))

print("Evaluation Results:")
print("-" * 75)

for key in evaluate_result:
	print(" {}, was:{}".format(key, evaluate_result[key]))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:My_Model_Fn: EVAL, eval
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-10-16-06:19:45
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\MADHIV~1\AppData\Local\Temp\tmp9ppav3cz\model.ckpt-10563
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-10-16-06:19:46
INFO:tensorflow:Saving dict for global step 10563: accuracy = 0.85365856, auc = 0.4999999, global_step = 10563, loss = 0.68403476, precision = 0.85365856, recall = 1.0
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 10563: C:\Users\MADHIV~1\AppData\Local\Temp\tmp9ppav3cz\model.ckpt-10563
Evaluation Results:
---------------------------------------------------------------------------
 precision, was:0.8536585569381714
 global_step, was:10563
 loss, was:0.6840347647666931
 accuracy, was:0.8536585569381714
 auc, was:0.4999

In [16]:
predictions = {k:list() for k in FEATURES}

for i in range(len(test_x)):
    for ind,f in enumerate(FEATURES):
        value = test_x.iloc[i][ind]
        predictions[f].append(value)

In [17]:
model_prediction = classifier.predict(
    input_fn=lambda:eval_input_fn(predictions, None, 1))

In [18]:
predicted_classes = [p for p in model_prediction]

Entered into the loop because Label is None
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:My_Model_Fn: PREDICT, infer
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\MADHIV~1\AppData\Local\Temp\tmp9ppav3cz\model.ckpt-10563
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [19]:
CLASS = [0,1]
predicted_class = None

#iterate through the list to get the predictions and expected value
for ind, i in enumerate(predicted_classes):
    #type(i) => Dictionary
    probabilities = i['probabilites']
    
    if probabilities[0] > probabilities[1]:
        predicted_class = 0
    else:
        predicted_class = 1
    
    print("Expeceted:{}, Predicted:{}".format(predicted_class, test_y[ind]))

Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:0
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:0
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:0
Expeceted:1, Predicted:0
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:0
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1
Expeceted:1, Predicted:1


## Reason for Not Predicting Class 0 ##

Since the data is Unbalanced, Total Rows 339, out of 339 rows only 18 rows labeled as 0 which 18% of data. Since the model is trained in this low data, the model is not so good at predicting the label 0. To overcome this particular problem, we can use several method like
    1. Oversampling/Undersampling
    2. Populating more data for label 0
    3. Use K-Fold Cross Validation Method
    4. Get more data

## Trying Some Decision Tree Classifiers ##

In [21]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=1000, max_depth=2, random_state=0)
clf.fit(train_x, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [22]:
print(clf.feature_importances_)

[6.24439293e-03 1.77015210e-02 2.74101260e-02 8.60526108e-03
 2.65353905e-02 1.46965984e-02 3.58506740e-02 2.08987678e-02
 8.51453369e-03 1.01746224e-02 2.39648139e-02 6.19670288e-02
 1.24881699e-02 1.83819942e-02 2.13309071e-02 1.67789670e-02
 1.18760066e-02 9.64939995e-03 4.57632035e-03 5.14327974e-03
 1.49941518e-02 1.30568677e-02 1.55982537e-02 1.98518692e-02
 1.36288176e-02 1.28029028e-02 1.58866329e-01 1.01054850e-02
 6.13662362e-03 1.46939099e-02 1.75909760e-02 7.62561865e-03
 4.46433238e-02 1.41530407e-02 2.31983210e-02 2.06695071e-02
 4.56574686e-03 1.06360815e-01 0.00000000e+00 0.00000000e+00
 1.59806125e-02 0.00000000e+00 0.00000000e+00 7.94780986e-03
 1.34028552e-04 2.33884800e-03 0.00000000e+00 9.02912658e-02
 0.00000000e+00 0.00000000e+00 1.94933532e-05 1.95660726e-03]


In [23]:
clf.predict(test_x)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      dtype=int64)

## Support Vector Machine ##

In [25]:
from sklearn import svm

model = svm.SVC(kernel='linear', C = 1.0)
model.fit(train_x, train_y)
model.score(train_x, train_y)

0.9822485207100592

In [30]:
predicted = model.predict(test_x)

for ind,i in enumerate(test_y):
    print("Expected:{}, predicted:{}".format(i, predicted[ind]))

Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:0, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:0, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:0, predicted:1
Expected:0, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:0, predicted:0
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:1, predicted:1
Expected:0, predicted:1


Seems the above model works well than previous model.

In [31]:
from sklearn.metrics import classification_report, confusion_matrix  

print(confusion_matrix(test_y,predicted))  
print(classification_report(test_y, predicted))  

[[ 1  5]
 [ 0 35]]
             precision    recall  f1-score   support

          0       1.00      0.17      0.29         6
          1       0.88      1.00      0.93        35

avg / total       0.89      0.88      0.84        41

