In [13]:
import math
import os
import random
import shutil

import tensorflow as tf
import numpy as np

In [2]:
# Graph config
FEATURE_NUM = 4

# Training config
LEARNING_RATE = 0.001
EPOCHES = 10
BATCH_SIZE = 1
DISPLAY_STEP = 100

# output dir
CHKPT_DIR = '/tmp/chkpt/lr'
LOG_DIR = '/tmp/log/lr'
MODEL_DIR = '/tmp/model/lr'

In [3]:
# sample data

# iris data
from sklearn import datasets
dataset_ori = datasets.load_iris(return_X_y=True)
y_label = map(lambda x: x == 0, dataset_ori[1])
dataset = []
dataset.append(dataset_ori[0])
dataset.append(np.array(list(y_label)).astype(int))

# mock data
def mock_boundary_func(X):
    # mock weights is (1, 2, 3, ..)
    return np.sum(np.dot(X, list(range(1, len(X) + 1))))
dataset = []
dataset.append(np.random.standard_normal(size=(1000, FEATURE_NUM)))
dataset.append(np.array(list(map(lambda x: mock_boundary_func(x) >= 0, dataset[0]))).astype(int))

In [4]:
# batch util
def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

In [5]:
# test batch
for tu in batch(list(zip(dataset[0], dataset[1])), n=3):
    X, y = zip(*tu)
    print(X, y)
    break

(array([-0.66648262,  2.01710989,  1.26082899,  0.04385765]), array([-0.60630516, -0.66185479, -0.46579354, -0.03129897]), array([-0.11115686,  0.3770051 ,  0.04170284,  0.18289504])) (1, 0, 1)


In [6]:
class LogisticRegressor(object):
    """
        Logistic Regressor
    """
    def __init__(self, feature_num, learning_rate=1e-2, random_seed=None):
        """
            Initializer
            Params:
                feature_num: feature number
                learning_rate: learning rate
                random_seed: random seed
        """
        self.feature_num = feature_num
        self.learning_rate = learning_rate
        self.random_seed = random_seed
        self.construct_placeholders()
        
    def construct_placeholders(self):
        """
            Construct inpute placeholders
        """
        self.input_feature_vectors = tf.placeholder(shape=[None, self.feature_num],
            dtype=tf.float32)
        self.input_labels = tf.placeholder(shape=[None, 1],
            dtype=tf.float32)
    
    def build_graph(self):
        """
            Build graph
        """
        self.construct_weights()
        
        # network forward pass
        saver, logits = self.forward_pass()
        
        # loss function
        loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=self.input_labels))
        
        # training optimizer
        train_op = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss)
        
        # statistics
        tf.summary.scalar('loss', loss)
        stat_merged = tf.summary.merge_all()

        return saver, logits, loss, train_op, stat_merged
    
    def construct_weights(self):
        """
            Construct weights
        """
        self.weights = []
        self.biases = []
        
        for i in range(1):
            weight_key = "w_{}_{}".format(i, i+1)
            bias_key = "b_{}".format(i)
            self.weights.append(tf.get_variable(
                name=weight_key, shape=[self.feature_num, 1],
                initializer=tf.contrib.layers.xavier_initializer(
                    seed=self.random_seed)))
            
            self.biases.append(tf.get_variable(
                name=bias_key, shape=[1],
                initializer=tf.truncated_normal_initializer(
                    stddev=0.001, seed=self.random_seed)))
            
            # statistics
            tf.summary.histogram(weight_key, self.weights[-1])
            tf.summary.histogram(bias_key, self.biases[-1])
            
    def forward_pass(self):
        """
            Forward pass
        """
        h = self.input_feature_vectors
        
        for i, (w, b) in enumerate(zip(self.weights, self.biases)):
            h = tf.matmul(h, w) + b
            
            if i != len(self.weights) - 1:
                h = tf.nn.sigmoid(h)

        return tf.train.Saver(), h

In [7]:
# metrics
from sklearn.metrics import roc_auc_score

def sigmoid(x):
    """
        Sigmoid
    """
    return 1 / (1 + np.exp(-x))

def calc_accuracy(logits, labels):
    """
        Calc accuracy
    """
    pred_labels = np.round(sigmoid(logits))
    match_score = np.equal(pred_labels, labels).astype(np.float32)
    return np.mean(match_score)

def calc_f1(logits, labels, log_confusion_matrix=False):
    """
        Calc F1 score
    """
    pred_labels = np.round(sigmoid(np.array(logits))).ravel()
    real_labels = np.array(labels).ravel()
    ind_1 = np.argwhere(real_labels == 1)
    ind_0 = np.argwhere(real_labels == 0)
    tp = np.sum(pred_labels[ind_1])
    tn = np.sum((1 - pred_labels)[ind_0])
    fp = np.sum(pred_labels[ind_0])
    fn = np.sum((1 - pred_labels)[ind_1])
    f1 = 2.0 * tp / (2*tp + fn + fp)
    acc = (tp + tn) * 1.0 / (tp + tn + fp + fn)
    if log_confusion_matrix is True:
        print("tp:{}, tn:{}, fp:{}, fn:{}, acc:{}, len:{}".format(
        tp, tn, fp, fn, acc, len(logits)))
    return f1

def calc_auc(logits, labels):
    """
        Calc AUC
    """
    return roc_auc_score(labels, logits)

In [8]:
a = [1, 1, 1, 0]
b = [0, 1, 1, 0]
calc_f1(a, b, log_confusion_matrix=True)

tp:2.0, tn:1.0, fp:1.0, fn:0.0, acc:0.75, len:4


0.8

In [9]:
# build graph
tf.reset_default_graph()
lr = LogisticRegressor(feature_num=FEATURE_NUM, learning_rate=LEARNING_RATE, random_seed=None)
saver, logits, loss, train_op, stat_merged = lr.build_graph()

In [10]:
# training
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    
    # log dir
    log_dir = LOG_DIR
    if os.path.exists(log_dir):
        shutil.rmtree(log_dir)
    summary_writer = tf.summary.FileWriter(log_dir, graph=tf.get_default_graph())
    
    # checkpoint dir
    chkpt_dir = CHKPT_DIR
    if os.path.exists(chkpt_dir):
        shutil.rmtree(chkpt_dir)
    if not os.path.isdir(chkpt_dir):
        os.makedirs(chkpt_dir)
    
    for epoch in range(EPOCHES):
        batch_cnt = 0
        batches_per_epoch = math.floor((len(dataset[0]) - 1) * 1.0 / BATCH_SIZE) + 1
        best_loss = np.inf
        cur_loss = np.inf
        cur_accuracy = 0
        training_data = list(zip(dataset[0], dataset[1]))
        random.shuffle(training_data)
        for tu in batch(training_data, n=BATCH_SIZE):
            X, y = zip(*tu)
            y = np.expand_dims(y, 1)
            feed_dict = {
                lr.input_feature_vectors: X,
                lr.input_labels: y
            }
            sess.run(train_op, feed_dict=feed_dict)
            batch_cnt += 1
            global_step = epoch * batches_per_epoch + batch_cnt
            if global_step % DISPLAY_STEP == 0:
                in_f = dataset[0]
                in_l = np.expand_dims(dataset[1], 1)
                feed_dict = {
                    lr.input_feature_vectors: in_f,
                    lr.input_labels: in_l
                }
                cur_loss, cur_logits = sess.run([loss, logits], feed_dict=feed_dict)
                summary_train = sess.run(stat_merged, feed_dict=feed_dict)
                summary_writer.add_summary(summary_train, global_step=global_step)
                print("epoch: {}, global_step: {}, loss: {}, "
                    "accuracy: {}, f1: {}, auc: {}".format(
                    epoch, global_step, cur_loss,
                    calc_accuracy(cur_logits, in_l),
                    calc_f1(cur_logits, in_l),
                    calc_auc(cur_logits, in_l)))
        if cur_loss < best_loss:
            best_loss = cur_loss
            saver.save(sess, '{}/model'.format(chkpt_dir))

epoch: 0, global_step: 100, loss: 0.520747721195221, accuracy: 0.75, f1: 0.7524752475247525, auc: 0.8207652503811844
epoch: 0, global_step: 200, loss: 0.513968288898468, accuracy: 0.7540000081062317, f1: 0.7569169960474308, auc: 0.8271643475442114
epoch: 0, global_step: 300, loss: 0.5047698616981506, accuracy: 0.7620000243186951, f1: 0.7652859960552268, auc: 0.8366649458337835
epoch: 0, global_step: 400, loss: 0.4962655305862427, accuracy: 0.7720000147819519, f1: 0.776908023483366, auc: 0.845137046330424
epoch: 0, global_step: 500, loss: 0.4899859130382538, accuracy: 0.7749999761581421, f1: 0.7800586510263929, auc: 0.8509998839438287
epoch: 0, global_step: 600, loss: 0.4835447371006012, accuracy: 0.7820000052452087, f1: 0.786692759295499, auc: 0.8569627699806708
epoch: 0, global_step: 700, loss: 0.47795069217681885, accuracy: 0.7839999794960022, f1: 0.7890625, auc: 0.8623293674138284
epoch: 0, global_step: 800, loss: 0.4717045724391937, accuracy: 0.7879999876022339, f1: 0.79296875, auc

epoch: 6, global_step: 6700, loss: 0.2961331009864807, accuracy: 0.9380000233650208, f1: 0.939453125, auc: 0.9898310782418691
epoch: 6, global_step: 6800, loss: 0.2949445843696594, accuracy: 0.9390000104904175, f1: 0.9404878048780487, auc: 0.9901192177013675
epoch: 6, global_step: 6900, loss: 0.29371726512908936, accuracy: 0.9390000104904175, f1: 0.9404878048780487, auc: 0.9904473765302406
epoch: 6, global_step: 7000, loss: 0.2925560176372528, accuracy: 0.9399999976158142, f1: 0.94140625, auc: 0.9906754869356769
epoch: 7, global_step: 7100, loss: 0.2912064790725708, accuracy: 0.9409999847412109, f1: 0.9423264907135875, auc: 0.9910876864402371
epoch: 7, global_step: 7200, loss: 0.2900957763195038, accuracy: 0.9440000057220459, f1: 0.9450980392156862, auc: 0.9913278026564858
epoch: 7, global_step: 7300, loss: 0.2888176739215851, accuracy: 0.9449999928474426, f1: 0.9460255152109912, auc: 0.9917680157196083
epoch: 7, global_step: 7400, loss: 0.28758350014686584, accuracy: 0.945999979972839

In [11]:
# predict using checkpoint
tf.reset_default_graph()
lr = LogisticRegressor(feature_num=FEATURE_NUM, learning_rate=LEARNING_RATE, random_seed=None)
saver, logits, loss, train_op, stat_merged = lr.build_graph()
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    saver.restore(sess, '{}/model'.format(chkpt_dir))
    
    in_feature_vecs = dataset[0][:100]
    in_labels = dataset[1][:100]
    in_labels = np.expand_dims(in_labels, 1)
    feed_dict = {
        lr.input_feature_vectors: in_feature_vecs,
        lr.input_labels: in_labels
    }
    out_logits, out_weights, out_biases = sess.run(
        [logits, lr.weights, lr.biases], feed_dict=feed_dict)
    print("accuracy: {}, f1: {}, auc: {}".format(
        calc_accuracy(cur_logits, in_l),
        calc_f1(cur_logits, in_l, log_confusion_matrix=True),
        calc_auc(cur_logits, in_l)))
    print("weights: ", out_weights)
    print("biases: ", out_biases)

INFO:tensorflow:Restoring parameters from /tmp/chkpt/lr/model
tp:494.0, tn:473.0, fp:16.0, fn:17.0, acc:0.967, len:1000
accuracy: 0.9670000076293945, f1: 0.9676787463271302, auc: 0.9964742935580821
weights:  [array([[0.5112941 ],
       [0.66312635],
       [1.1420636 ],
       [1.6672636 ]], dtype=float32)]
biases:  [array([0.01430484], dtype=float32)]


In [12]:
# tensorboard
# tensorboard --logdir /tmp/log/lr --port 8008