In [20]:
import numpy as np
import csv
import time

np.random.seed(1234)
def randomize(): np,random.seed(time.time())

In [2]:
RND_MEAN = 0
RND_STD = 0.0030

LEARNING_RATE = 0.001

In [3]:
def abalone_exec(epoch_count=10, mb_size=10, report=1):
    load_abalone_dataset()
    init_model()
    train_and_test(epoch_count, mb_size, report)

In [4]:
def load_abalone_dataset():
    with open('data/chap01/abalone.csv') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader, None)
        rows = []
        for row in csvreader:
            rows.append(row)
        
    global data, input_cnt, output_cnt
    input_cnt, output_cnt = 10, 1
    data = np.zeros([len(rows), input_cnt+output_cnt])
    
    
    ## 원래있던 sex칼럼을 원핫 인코딩을 적용하여 3 칼럼으로 만들고 나머지 칼럼을 복사해온다.
    for n, row in enumerate(rows):
        if row[0] == 'I' : data[n, 0] = 1
        if row[0] == 'M' : data[n, 1] = 1
        if row[0] == 'F' : data[n, 2] = 1
        data[n, 3:] = row[1:]   

In [5]:
def init_model():
    global weight, bias, input_cnt, output_cnt
    weight = np.random.normal(RND_MEAN, RND_STD, [input_cnt, output_cnt])
    bias = np.zeros([output_cnt])

In [6]:
def train_and_test(epoch_count, mb_size, report):
    step_count = arrange_data(mb_size)
    test_x, test_y = get_test_data()
    
    for epoch in range(epoch_count):
        losses, accs = [], []
        
        for n in range(epoch_count):
            train_x, train_y = get_train_data(mb_size,n)
            loss, acc = run_train(train_x, train_y)
            losses.append(loss)
            accs.append(acc)
            
        if report > 0 and (epoch+1) % report == 0:
            acc = run_test(test_x, test_y)
            print('Epoch {}: loss={:5.3f}, accuracy={:5.3f}/{:5.3f}'. \
                   format(epoch+1, np.mean(losses), np.mean(accs), acc))
            
    final_acc = run_test(test_x, test_y)
    print('\nFinal Test: final accuracy = {:5.3f}'.format(final_acc))
    

In [7]:
def arrange_data(mb_size):
    global data, shuffle_map, test_begin_idx
    shuffle_map = np.arange(data.shape[0])
    np.random.shuffle(shuffle_map)
    step_count = int(data.shape[0] * 0.8) // mb_size
    test_begin_idx = step_count * mb_size
    return step_count


def get_test_data():
    global data, shuffle_map, test_begin_idx, output_cnt
    test_data = data[shuffle_map[test_begin_idx:]]
    return test_data[:, :-output_cnt], test_data[:, -output_cnt:]


def get_train_data(mb_size, nth):
    global data, shuffle_map, test_begin_idx, output_cnt
    if nth == 0 :
        np.random.shuffle(shuffle_map[:test_begin_idx])
    train_data = data[shuffle_map[mb_size*nth:mb_size*(nth+1)]]
    return train_data[:, :-output_cnt], train_data[:, -output_cnt:]


In [8]:
def run_train(x, y):
    output, aux_nn = forward_neuralnet(x)
    loss, aux_pp = forward_postproc(output, y)
    accuracy = eval_accuracy(output, y)
    
    G_loss = 1.0
    G_output = backprop_postproc(G_loss, aux_pp)
    backprop_neuralnet(G_output, aux_nn)
    
    return loss, accuracy


def run_test(x, y):
    output, _ = forward_neuralnet(x)
    accuracy = eval_accuracy(output, y)
    return accuracy



In [9]:
def forward_neuralnet(x):
    global weight, bias
    output = np.matmul(x, weight) + bias
    return output, x


def backprop_neuralnet(G_output, x):
    global weight, bias
    g_output_w = x.transpose()
    
    G_w = np.matmul(g_output_w, G_output)
    G_b = np.sum(G_output, axis = 0)
    
    weight -= LEARNING_RATE * G_w
    bias -= LEARNING_RATE * G_b

In [10]:
def forward_postproc(output, y):
    diff = output - y
    square = np.square(diff)
    loss = np.mean(square)
    return loss, diff

def backprop_postproc(G_loss, diff):
    shape = diff.shape
    
    g_loss_square = np.ones(shape) / np.prod(shape)
    g_square_diff = 2 * diff
    g_diff_output = 1
    
    G_square = g_loss_square * G_loss
    G_diff = g_square_diff * G_square
    G_output = g_diff_output * G_diff
    
    return G_output

In [11]:
def backprop_postproc_oneline(G_loss, diff):
    return 2*diff / np.prod(diff.shape)

In [12]:
def eval_accuracy(output, y):
    mdiff = np.mean(np.abs((output -y)/y))
    return 1 - mdiff

In [13]:
abalone_exec()

Epoch 1: loss=103.634, accuracy=0.025/0.058
Epoch 2: loss=93.324, accuracy=0.084/0.112
Epoch 3: loss=74.436, accuracy=0.135/0.160
Epoch 4: loss=92.138, accuracy=0.174/0.212
Epoch 5: loss=61.683, accuracy=0.234/0.255
Epoch 6: loss=63.377, accuracy=0.278/0.299
Epoch 7: loss=53.380, accuracy=0.318/0.338
Epoch 8: loss=48.214, accuracy=0.363/0.377
Epoch 9: loss=46.449, accuracy=0.388/0.414
Epoch 10: loss=39.427, accuracy=0.427/0.448

Final Test: final accuracy = 0.448


In [14]:
def pulsar_exec(epoch_count=10, mb_size=10, report=1):
    load_pulsar_dataset()
    init_model()
    train_and_test(epoch_count, mb_size, report)

In [24]:
def load_pulsar_dataset():
    '''with('data/chap02/pulsar_stars.csv') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader, None)
        rows = []
        for row in csvreader:
            rows.append(row)
    '''
            
    import pandas as pd
    df = pd.read_csv('data/chap02/pulsar_stars.csv')
    rows = np.asarray(df)
            
    global data, input_cnt, output_cnt
    input_cnt, output_cnt = 8, 1
    data = np.asarray(rows, dtype='float32')
        

In [16]:
def forward_postproc(output, y):
    entropy = sigmoid_cross_entropy_with_logits(y, output)
    loss = np.mean(entropy)
    return loss, [y, output, entropy]


def backprop_postproc(G_loss, aux):
    y, output, entropy = aux
    
    g_loss_entropy = 1.0 / np.prod(entropy.shape)
    g_entropy_output = sigmoid_cross_entropy_with_logits_derv(y, output)
    
    G_entropy = g_loss_entropy * G_loss
    G_output = g_entropy_output * G_entropy
    
    return G_output

In [17]:
def eval_accuracy(output, y):
    estimate = np.greater(output, 0)
    answer = np.greater(y, 0.5)
    correct = np.equal(estimate, answer)
    
    return np.mean(correct)

In [18]:
def relu(x):
    return np.maximum(x, 0)

def sigmoid(x):
    return np.exp(-relu(-x)) / (1.0 + np.exp(-np.abs(x)))

def sigmoid_derv(x, y):
    return y * (1-y)

def sigmoid_cross_entropy_with_logits(z, x):
    return relu(x) - x * z + np.log(1 + np.exp(-np.abs(x)))

def sigmoid_cross_entropy_with_logits_derv(z, x):
    return -z + sigmoid(x)


In [25]:
pulsar_exec()

Epoch 1: loss=0.233, accuracy=0.910/0.934
Epoch 2: loss=0.076, accuracy=0.970/0.940
Epoch 3: loss=0.284, accuracy=0.900/0.958
Epoch 4: loss=0.089, accuracy=0.970/0.957
Epoch 5: loss=0.017, accuracy=1.000/0.950
Epoch 6: loss=0.495, accuracy=0.870/0.955
Epoch 7: loss=0.175, accuracy=0.930/0.958
Epoch 8: loss=0.165, accuracy=0.940/0.961
Epoch 9: loss=0.112, accuracy=0.960/0.947
Epoch 10: loss=0.274, accuracy=0.910/0.944

Final Test: final accuracy = 0.944


In [26]:
abalone_exec()

Epoch 1: loss=-0.302, accuracy=0.990/1.000
Epoch 2: loss=-2.904, accuracy=1.000/1.000
Epoch 3: loss=-5.748, accuracy=1.000/1.000
Epoch 4: loss=-8.520, accuracy=1.000/1.000
Epoch 5: loss=-10.511, accuracy=1.000/1.000
Epoch 6: loss=-13.333, accuracy=1.000/1.000
Epoch 7: loss=-15.307, accuracy=1.000/1.000
Epoch 8: loss=-17.340, accuracy=1.000/1.000
Epoch 9: loss=-20.622, accuracy=1.000/1.000
Epoch 10: loss=-21.838, accuracy=1.000/1.000

Final Test: final accuracy = 1.000


In [27]:
def pulsar_exec(epoch_count=10, mb_size=10, report=1, adjust_ratio=False):
    load_pulsar_dataset(adjust_ratio)
    init_model()
    train_and_test(epoch_count, mb_size, report)

In [46]:
def load_pulsar_dataset(adjust_ratio):
    pulsars, stars = [], []
    
    import pandas as pd
    df = pd.read_csv('data/chap02/pulsar_stars.csv')
    pulsars = np.asarray(df[df['target_class'] == 1])
    stars = np.asarray(df[df['target_class'] == 0])
    
    global data, input_cnt, output_cnt
    input_cnt, output_cnt = 8, 1
    
    star_cnt, pulsar_cnt = len(stars), len(pulsars)
    if adjust_ratio:
        data = np.zeros([2*star_cnt, 9])
        data[0:star_cnt, :] = np.asarray(stars, dtype='float32')
        for n in range(star_cnt):
            data[star_cnt+n] = np.asarray(pulsars[n % pulsar_cnt], dtype='float32')
    else:
        data = np.zeros([star_cnt+pulsar_cnt, 9])
        data[0:star_cnt, : ] = np.asarray(stars, dtype='float32')
        data[star_cnt:, :] = np.asarray(pulsars, dtype='float32')

In [38]:
def eval_accuracy(output, y):
    est_yes = np.greater(output, 0)
    ans_yes = np.greater(y, 0.5)
    est_no = np.logical_not(est_yes)
    ans_no = np.logical_not(ans_yes)
    
    tp = np.sum(np.logical_and(est_yes, ans_yes))
    fp = np.sum(np.logical_and(est_yes, ans_no))
    fn = np.sum(np.logical_and(est_no, ans_yes))
    tn = np.sum(np.logical_and(est_no, ans_no))
    
    accuracy = safe_div(tp+tn, tp+tn+fp+fn)
    precision = safe_div(tp, tp+fp)
    recall = safe_div(tp, tp+fn)
    f1 = 2 * safe_div(recall*precision, recall+precision)
    
    return [accuracy, precision, recall, f1]


def safe_div(p, q):
    p, q = float(p), float(q)
    if np.abs(q) < 1.0e-20: return np.sign(p)
    return p / q

In [39]:
def train_and_test(epoch_count, mb_size, report):
    step_count = arrange_data(mb_size)
    test_x, test_y = get_test_data()
    
    for epoch in range(epoch_count):
        losses = []
        
        for n in range(step_count):
            train_x, train_y = get_train_data(mb_size, n)
            loss, _ = run_train(train_x, train_y)
            losses.append(loss)
            
        if report > 0 and (epoch+1) % report == 0:
            acc = run_test(test_x, test_y)
            acc_str = ','.join(['%5.3f']*4) % tuple(acc)
            print('Epoch {}: loss={:5.3f}, result={}'.format(epoch+1, np.mean(losses), acc_str))
            
    acc = run_test(test_x, test_y)
    acc_str= ','.join(['%5.3f']*4) % tuple(acc)
    print('\nFinal test: final result = {}'.format(acc_str))

In [42]:
pulsar_exec()

Epoch 1: loss=0.143, result=0.971,0.944,0.727,0.821
Epoch 2: loss=0.134, result=0.975,0.893,0.819,0.854
Epoch 3: loss=0.133, result=0.969,0.827,0.837,0.832
Epoch 4: loss=0.131, result=0.975,0.940,0.770,0.847
Epoch 5: loss=0.137, result=0.975,0.937,0.776,0.849
Epoch 6: loss=0.131, result=0.973,0.967,0.727,0.830
Epoch 7: loss=0.127, result=0.970,0.974,0.690,0.808
Epoch 8: loss=0.132, result=0.975,0.958,0.761,0.848
Epoch 9: loss=0.130, result=0.974,0.909,0.798,0.850
Epoch 10: loss=0.118, result=0.970,0.824,0.847,0.835

Final test: final result = 0.970,0.824,0.847,0.835


In [47]:
pulsar_exec(adjust_ratio=True)

Epoch 1: loss=0.430, result=0.916,0.954,0.876,0.913
Epoch 2: loss=0.376, result=0.923,0.955,0.889,0.921
Epoch 3: loss=0.374, result=0.847,0.788,0.953,0.863
Epoch 4: loss=0.383, result=0.917,0.921,0.914,0.918
Epoch 5: loss=0.367, result=0.868,0.818,0.952,0.880
Epoch 6: loss=0.355, result=0.913,0.983,0.843,0.908
Epoch 7: loss=0.376, result=0.919,0.965,0.872,0.916
Epoch 8: loss=0.369, result=0.907,0.981,0.831,0.900
Epoch 9: loss=0.363, result=0.897,0.881,0.919,0.900
Epoch 10: loss=0.375, result=0.909,0.951,0.865,0.906

Final test: final result = 0.909,0.951,0.865,0.906
