In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import linear_model
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

In [3]:
#data loading 
df = pd.read_csv('./data/ozone.csv')
display(df)

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
0,41.0,190.0,7.4,67,5,1
1,36.0,118.0,8.0,72,5,2
2,12.0,149.0,12.6,74,5,3
3,18.0,313.0,11.5,62,5,4
4,,,14.3,56,5,5
...,...,...,...,...,...,...
148,30.0,193.0,6.9,70,9,26
149,,145.0,13.2,77,9,27
150,14.0,191.0,14.3,75,9,28
151,18.0,131.0,8.0,76,9,29


### 전처리

In [4]:
# Month랑 Day 삭제
df.drop(['Month','Day'],axis=1,inplace=True)
display(df)

Unnamed: 0,Ozone,Solar.R,Wind,Temp
0,41.0,190.0,7.4,67
1,36.0,118.0,8.0,72
2,12.0,149.0,12.6,74
3,18.0,313.0,11.5,62
4,,,14.3,56
...,...,...,...,...
148,30.0,193.0,6.9,70
149,,145.0,13.2,77
150,14.0,191.0,14.3,75
151,18.0,131.0,8.0,76


In [5]:
# 결측치 확인
print(df.info())

# 결측치는 평균으로 채우기
df['Ozone'].fillna(df['Ozone'].mean(), inplace=True)
df['Solar.R'].fillna(df['Solar.R'].mean(), inplace=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Ozone    116 non-null    float64
 1   Solar.R  146 non-null    float64
 2   Wind     153 non-null    float64
 3   Temp     153 non-null    int64  
dtypes: float64(3), int64(1)
memory usage: 4.9 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Ozone    153 non-null    float64
 1   Solar.R  153 non-null    float64
 2   Wind     153 non-null    float64
 3   Temp     153 non-null    int64  
dtypes: float64(3), int64(1)
memory usage: 4.9 KB
None


In [6]:
# 이상치 판단, 제거 z-score 사용
zsocre_threshold = 2.0

for col in df.columns:
    outlier = df[col][(np.abs(stats.zscore(df[col]))>zsocre_threshold)]
    df = df.loc[np.isin(df[col],outlier, invert=True)]

In [7]:
# 정규화
x_data = df.drop('Ozone',axis=1, inplace=False)
t_data = df['Ozone'].values.reshape(-1,1)

scaler_x = MinMaxScaler()
scaler_t = MinMaxScaler()
scaler_x.fit(x_data)
scaler_t.fit(t_data)

norm_x_data = scaler_x.transform(x_data)
norm_t_data = scaler_t.transform(t_data)

### 학습

In [10]:
# python 구현

# 다변수함수에 대한 수치미분 함수
def numerical_derivative(f,x):  # x = ndarray[1.0, 2.0]
                                
    delta_x = 1e-4
    derivative_x = np.zeros_like(x) # derivative_x = [0.0  0.0] 
    
    # iterator를 이용해서 입력변수 x에 대한 편미분 수행
    it = np.nditer(x,flags=['multi_index'])
    
    while not it.finished:
        idx = it.multi_index
        tmp = x[idx]              # tmp : 1.0
        
        x[idx] = tmp + delta_x    # x : ndarray[1.0001  2.0]
        fx_plus_delta = f(x)
        
        x[idx] = tmp - delta_x    # x : ndarray[0.9999  2.0]
        fx_minus_delta = f(x)
        
        derivative_x[idx] = (fx_plus_delta - fx_minus_delta) / (2 * delta_x)
        
        x[idx]  = tmp    # x : ndarray [1.0  2.0]
        it.iternext()
        
    return derivative_x

# loss function
def loss_func(input_data):
    input_W = input_data[:-1].reshape(-1,1)
    input_b = input_data[-1]
    
    y = np.dot(norm_x_data,input_W) + input_b
    
    return np.mean(np.power(norm_t_data-y,2))

# predict
def predict(x):
    y = np.dot(x,W) + b
    return y

# Weight, bias
W = np.random.rand(3,1)
b = np.random.rand(1)

# learning_rate
learning_rate = 1e-4

# 반복 학습
for step in range(300000):
    
    input_param = np.concatenate((W.ravel(), b.ravel()), axis=0)  # [W1, W2, W3, b]
    derivative_result = learning_rate * numerical_derivative(loss_func, input_param)

    W = W - derivative_result[:-1].reshape(-1,1)
    b = b - derivative_result[-1]
    
    if step % 30000 == 0:
        input_param = np.concatenate((W.ravel(), b.ravel()), axis=0)
        print('W : {}, b:{}, loss:{}'.format(W, b, loss_func(input_param)))

W : [[0.43969645]
 [0.73377177]
 [0.30820914]], b:[0.72400372], loss:1.271183430637271
W : [[0.12398712]
 [0.18788962]
 [0.21746446]], b:[0.11019669], loss:0.050736469382770294
W : [[0.14167898]
 [0.0191064 ]
 [0.35708238]], b:[0.10475256], loss:0.03433464784109406
W : [[ 0.1495438 ]
 [-0.08609012]
 [ 0.4455824 ]], b:[0.10245121], loss:0.027897945930579825
W : [[ 0.15216785]
 [-0.1516343 ]
 [ 0.5018276 ]], b:[0.10177305], loss:0.02536407474445196
W : [[ 0.15216497]
 [-0.1924506 ]
 [ 0.53768557]], b:[0.10187433], loss:0.02436255042218898
W : [[ 0.15098482]
 [-0.21784793]
 [ 0.56062993]], b:[0.10229716], loss:0.023964620036789053
W : [[ 0.14940181]
 [-0.23363272]
 [ 0.57537442]], b:[0.10280468], loss:0.02380545080279286
W : [[ 0.14780572]
 [-0.24342672]
 [ 0.58489725]], b:[0.1032838], loss:0.023741241836596563
W : [[ 0.14637265]
 [-0.24948891]
 [ 0.59108393]], b:[0.10368864], loss:0.023715063394454745


In [11]:
# predict
predict_data = np.array([[150.0,10.0,80.0]])
scaled_predict_data = scaler_x.transform(predict_data)
python_result = predict(scaled_predict_data)
python_result = scaler_t.inverse_transform(python_result)
print(python_result)

[[39.02454635]]


  "X does not have valid feature names, but"


In [15]:
# sklearn으로 구현
from sklearn import linear_model

model = linear_model.LinearRegression()

model.fit(x_data, t_data)

result_sklearn = model.predict(predict_data)
print(result_sklearn)   # [[39.10450845]] 

[[39.10450845]]


  "X does not have valid feature names, but"


In [13]:
# placeholder
# None = 신경쓰지 않겠다
X = tf.placeholder(shape=[None,3], dtype=tf.float32)
T = tf.placeholder(shape=[None,1], dtype=tf.float32)

# weight(3,1), bias
# variable = 값이 변할 수 있는 노드
W = tf.Variable(tf.random.normal([3,1]))
b = tf.Variable(tf.random.normal([1]))

# model(hypothesis, 가설, predict model)  # y = Wx + b => XW + b
H = tf.matmul(X,W) + b

# loss function
loss = tf.reduce_mean(tf.square(H-T))

# train node 생성
train = tf.train.GradientDescentOptimizer(learning_rate=1e-4).minimize(loss)

# session을 생성하고 초기화를 진행
sess = tf.Session()
sess.run(tf.global_variables_initializer())   # 초기화 구문
# (그래프를 실행하기 전에 먼저 실행되어야 함)

# 반복 학습
for step in range(300000):
    
    _,W_val,b_val,loss_val = sess.run([train,W,b,loss], 
                                      feed_dict={X: norm_x_data,T:norm_t_data})
    if step % 30000 == 0:
        print('W : {}, b: {}, loss : {}'.format(W_val,b_val,loss_val))





W : [[ 0.9180232 ]
 [ 0.47122106]
 [-1.9974866 ]], b: [-0.9089018], loss : 2.9398200511932373
W : [[ 1.102994]
 [ 0.36686 ]
 [-0.902909]], b: [0.03711101], loss : 0.24165105819702148
W : [[ 0.8578098 ]
 [ 0.06505017]
 [-0.46726444]], b: [0.09261563], loss : 0.12536536157131195
W : [[ 0.6719932 ]
 [-0.11447648]
 [-0.17049058]], b: [0.12944983], loss : 0.07261908799409866
W : [[ 0.5324601 ]
 [-0.21878058]
 [ 0.03430719]], b: [0.15251344], loss : 0.04804619774222374
W : [[ 0.42843968]
 [-0.27711773]
 [ 0.17763862]], b: [0.16566962], loss : 0.03626604005694389
W : [[ 0.35135046]
 [-0.30763865]
 [ 0.27948105]], b: [0.17187673], loss : 0.03044351376593113
W : [[ 0.29450196]
 [-0.3215653 ]
 [ 0.35300645]], b: [0.17335394], loss : 0.027470184490084648
W : [[ 0.25275558]
 [-0.32580164]
 [ 0.4069693 ]], b: [0.17171879], loss : 0.025897754356265068
W : [[ 0.22218719]
 [-0.32459652]
 [ 0.4472312 ]], b: [0.16817307], loss : 0.025034405291080475


In [40]:
# prediction
result_tensorflow = sess.run(H,feed_dict={X:scaled_predict_data})
result = scaler_t.inverse_transform(result_tensorflow)
print(result)  # [[39.22307]]

[[39.22307]]
