In [10]:
# Multi Variable Logistic Regression
# 내 성적 [600, 3.8, 1.]으로 진학가능할지 여부
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn import linear_model
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

# 수치 미분함수
from my_library.machine_learning_library import numerical_derivative

# Raw Data Loading
df = pd.read_csv('./data/admission.csv')
# Data Preprocessing
# 결측치 확인
# print(df.isnull().sum()) 결측치 존재 x

# 만약 있었으면 결측치 제거
# df = df.dropna(how='any')

# 이상치 처리
# Z-score 이용
zscore_threshold = 2.0

for col in df.columns:
    outliers = df[col][np.abs(stats.zscore(df[col])) > zscore_threshold]
    df = df.loc[~df[col].isin(outliers)]

# 정규화
# MinMaxScaler 이용
# label은 0,1 이므로 변수들만 정규화
x_data = df.drop('admit', axis=1, inplace=False).values
t_data = df['admit'].values.reshape(-1,1)

scaler_x = MinMaxScaler()  # MinMaxScaler 클래스 객체 생성
scaler_x.fit(x_data)
norm_x_data = scaler_x.transform(x_data)  # (382, 3)

# 예측에 쓸 내 성적 => 정규화
my_score = np.array([[600, 3.8, 1.]])
scaler_x.fit(my_score)
norm_my_score = scaler_x.transform(my_score)

In [8]:
# 1. sklearn 구현

# Logistic Regression Model 생성
model = linear_model.LogisticRegression()

# Traning Data Set을 이용한 학습
model.fit(x_data,t_data.ravel())

# Prediction
my_score = np.array([[600, 3.8, 1.]])
pred_val = model.predict(my_score)
pred_proba = model.predict_proba(my_score)
print('sklearn 결과값')
print('예측결과 : {}, 불합격할 확률 : {}, 합격할 확률 : {}'.format(pred_val[0],pred_proba[0,0],pred_proba[0,1]))

sklearn 결과값
예측결과 : 1, 불합격할 확률 : 0.43740782354334207, 합격할 확률 : 0.5625921764566579


In [14]:
# 2. python 구현

# Weight & bias
W = np.random.rand(3,1)
b = np.random.rand(1)

# Loss Function
def loss_func(input_obj):
    # input_obj : W와 b를 같이 포함하고 있는 ndarray => [W1,W2,W3, ... , b]
    num_of_bias = b.shape[0]
    input_W = input_obj[:-1 * num_of_bias].reshape(-1,num_of_bias)  # 행렬연산을 하기 위한 W 추출
    input_b = input_obj[-1 * num_of_bias]
    
    # 우리 모델의 예측값 : Linear Regression model(Wx + b) ==> sigmoid 적용
    z = np.dot(norm_x_data,input_W) + input_b
    y = 1 / (1 + np.exp(-1*z))
    
    delta = 1e-7 #  0에 가까운 작은 값을 줌으로써 프로그램의 로그 연산시 무한대로 발산하는 것을 방지
    
    # Cross Entropy
    return -np.sum(t_data * np.log(y+delta) + ((1-t_data) * np.log(1-y+delta)))
    
# Learning rate
learning_rate = 1e-4

# 학습
for step in range(30000):
    
    input_param = np.concatenate((W.ravel(), b.ravel()), axis = 0)
    derivative_result = learning_rate * numerical_derivative(loss_func, input_param)
    
    
    num_of_bias = b.shape[0]
    
    W = W - derivative_result[:-1 * num_of_bias].reshape(-1,num_of_bias)
    b = b - derivative_result[-1 * num_of_bias]
    
        
# Prediction => W,b를 구해서 Logistic Regression Model을 완성
def logistic_predict(x):
    
    z = np.dot(x,W) + b
    y = 1 / (1 + np.exp(-1 * z))
    
    if y < 0.5 :
        result = 0
    else :
        result = 1
    
    return result, y

result = logistic_predict(norm_my_score)
print('python 결과값')
print('예측결과 : {}, 합격할 확률 : {}'.format(result[0],result[1][0,0]))

python 결과값
예측결과 : 0, 합격할 확률 : 0.22215945369769727


In [19]:
# 3. tensorflow 구현

# Placeholder 생성
X = tf.placeholder(shape=[None,3], dtype=tf.float32)  # 독립변수가 1개인 경우(simple), shape 명시 x
T = tf.placeholder(shape=[None,1], dtype=tf.float32)

# Weight & bias
W = tf.Variable(tf.random.normal([3,1]), name='weight')
b = tf.Variable(tf.random.normal([1]), name='weight')

# Hypothesis
logit = tf.matmul(X,W) + b  # Linear Regression Hypothesis
H = tf.sigmoid(logit)

# Loss Function
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit, labels=T))

# Train
train = tf.train.GradientDescentOptimizer(learning_rate=1e-1).minimize(loss)

# Session & 초기화
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# 학습
for step in range(30000):
    _, W_val, b_val, loss_val = sess.run([train,W,b,loss], feed_dict={X : norm_x_data, T : t_data})
    
    if step % 3000 == 0 :
        print('W : {}, b : {}, loss : {}'.format(W_val, b_val, loss_val))
    
# Prediction
result = sess.run(H, feed_dict={X : norm_my_score})
print('tensorflow 결과값')
print('합격할 확률 : {}'.format(result))

W : [[ 1.1061417]
 [-1.4273796]
 [-1.0558444]], b : [-0.49062976], loss : 0.6492480635643005
W : [[ 1.1149737 ]
 [ 0.99957657]
 [-1.6384919 ]], b : [-1.1837913], loss : 0.5792050361633301
W : [[ 1.0780456]
 [ 1.1205627]
 [-1.6168962]], b : [-1.2483236], loss : 0.5790991187095642
W : [[ 1.0754619]
 [ 1.1288074]
 [-1.6152211]], b : [-1.2527866], loss : 0.5790985822677612
W : [[ 1.0753133]
 [ 1.1293391]
 [-1.6151268]], b : [-1.2530779], loss : 0.5790985822677612
W : [[ 1.0753133]
 [ 1.1293391]
 [-1.6151268]], b : [-1.2530779], loss : 0.5790985822677612
W : [[ 1.0753133]
 [ 1.1293391]
 [-1.6151268]], b : [-1.2530779], loss : 0.5790985822677612
W : [[ 1.0753133]
 [ 1.1293391]
 [-1.6151268]], b : [-1.2530779], loss : 0.5790985822677612
W : [[ 1.0753133]
 [ 1.1293391]
 [-1.6151268]], b : [-1.2530779], loss : 0.5790985822677612
W : [[ 1.0753133]
 [ 1.1293391]
 [-1.6151268]], b : [-1.2530779], loss : 0.5790985822677612
tensorflow 결과값
합격할 확률 : [[0.22216779]]
