In [1]:
#https://github.com/backstopmedia/tensorflowbook/blob/master/chapters/04_machine_learning_basics/
#logistic_regression.py
import os
import time
import tensorflow as tf
import numpy as np

In [2]:
W = tf.Variable(tf.zeros([5,1]),name='weight')
b = tf.Variable(0.,name="bias")

In [3]:
def read_csv(batch_size,file_name,record_defaults):
    filename_queue = tf.train.string_input_producer([os.path.join(os.getcwd(), file_name)])

    reader = tf.TextLineReader(skip_header_lines=1)
    key, value = reader.read(filename_queue)

    # decode_csv will convert a Tensor from type string (the text line) in
    # a tuple of tensor columns with the specified defaults, which also
    # sets the data type for each column
    decoded = tf.decode_csv(value, record_defaults=record_defaults)

    # batch actually reads the file and loads "batch_size" rows in a single tensor
    # capacity?
    # min_after_dequeue?
    return tf.train.shuffle_batch(decoded,
                                  batch_size=batch_size,
                                  capacity=batch_size * 50,
                                  min_after_dequeue=batch_size) 

def inputs():
    """读取训练数据"""
    #https://www.kaggle.com/c/titanic/data
    #泰坦尼克号，x=[乘客年龄,性别,船票等级],y=[是否能幸存]
    passenger_id, survived, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked = \
        read_csv(100, "data/titanic_train.csv", [[0.0], [0.0], [0], [""], [""], [0.0], [0.0], [0.0], [""], [0.0], [""], [""]])

    # 船票等级，
    # 如果使用数字1,2,3分别代表船票等级,会引入不存在的数值关系
    # 正确做法,将每个属性特征扩展为N维布尔特征值
    is_first_class = tf.to_float(tf.equal(pclass, [1]))
    is_second_class = tf.to_float(tf.equal(pclass, [2]))
    is_third_class = tf.to_float(tf.equal(pclass, [3]))
    
    # 性别
    # 对于二值属性，单bool值变量足够
    # equal 判断是否相等，返回bool值
    gender = tf.to_float(tf.equal(sex, ["female"]))

    # Finally we pack all the features in a single matrix;
    # We then transpose to have a matrix with one example per row and one feature per column.
    # stack 将所有值打包至单个张量
    features = tf.transpose(tf.stack([is_first_class, is_second_class, is_third_class, gender, age]))
    survived = tf.reshape(survived, [100, 1])
    
    # AttributeError: module 'tensorflow' has no attribute 'pack'
    # pack -> stack
    
    return features, survived

In [4]:
def combine_inputs(X):
    return tf.matmul(X,W) + b

def inference(X):
    """推断模型"""
    # sigmoid f(x)=1/(1+e**(-x)) sigmoid函数的输出是一个概率值
    return tf.sigmoid(combine_inputs(X)) 

def loss(X,Y):
    """交叉熵(cross entropy) 损失函数
    sum( y[i]*log(y_predicted[i]) + (1-y[i]*log(1-y_predicted[i])) ) """
    return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=combine_inputs(X), labels=Y))

    # ValueError: Only call `sigmoid_cross_entropy_with_logits` with named arguments (labels=..., logits=..., ...) 
    # return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( combine_inputs(X),Y ))

def train(total_loss):
    """依据 计算的总损失 训练或调整 模型参数"""
    learning_rate = 0.005
    return tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)

def evaluate(sess,X,Y):
    """对训练得到的模型进行评估"""
    # cast 类型转换
    # 只是训练集上的准确率，but测试集的泛化能力？
    predicted = tf.cast(inference(X)>0.5,tf.float32)
    print( sess.run( tf.reduce_mean( tf.cast( tf.equal(predicted,Y),tf.float32)) ) )
    

In [6]:
with tf.Session() as sess:
    tf.global_variables_initializer().run()

    X, Y = inputs()

    total_loss = loss(X, Y)
    train_op = train(total_loss)

    coord = tf.train.Coordinator() #??
    threads = tf.train.start_queue_runners(sess=sess, coord=coord) #?

    # actual training loop
    training_steps = 1000
    for step in range(training_steps):
        sess.run([train_op])
        # for debugging and learning purposes, see how the loss gets decremented thru training steps
        if step % 10 == 0: 
            print("step:%s, loss:%s " %(step, sess.run([total_loss]))) 
            

    evaluate(sess, X, Y)
    print("step:%s, W:%s, b:%s,loss:%s " %(step,sess.run(W),sess.run(b), sess.run([total_loss])))
    
    time.sleep(5) #why ?

    coord.request_stop()
    coord.join(threads)
    
    #tf.summary. 收敛情况可视化？ 
    
    sess.close() 
    

step:0, loss:[0.63010824] 
step:10, loss:[0.74033] 
step:20, loss:[0.6862436] 
step:30, loss:[0.6828994] 
step:40, loss:[0.67126167] 
step:50, loss:[0.66781974] 
step:60, loss:[0.7018417] 
step:70, loss:[0.63929534] 
step:80, loss:[0.63218164] 
step:90, loss:[0.7184467] 
step:100, loss:[0.66789865] 
step:110, loss:[0.6694786] 
step:120, loss:[0.6614162] 
step:130, loss:[0.66490847] 
step:140, loss:[0.68370664] 
step:150, loss:[0.65805435] 
step:160, loss:[0.66177046] 
step:170, loss:[0.6167009] 
step:180, loss:[0.6815063] 
step:190, loss:[0.6773369] 
step:200, loss:[0.6200832] 
step:210, loss:[0.6124092] 
step:220, loss:[0.6612683] 
step:230, loss:[0.65602416] 
step:240, loss:[0.6430075] 
step:250, loss:[0.6241507] 
step:260, loss:[0.6666737] 
step:270, loss:[0.662122] 
step:280, loss:[0.6487184] 
step:290, loss:[0.61942965] 
step:300, loss:[0.6921735] 
step:310, loss:[0.6768935] 
step:320, loss:[0.69249094] 
step:330, loss:[0.6278185] 
step:340, loss:[0.6031287] 
step:350, loss:[0.632