# 머신러닝 알고리즘 분류
* 회귀 분석(지도학습)
    * Linear Regression

* 분류 분석(지도학습)
    * Logistic Regression
    * K-Nearest Neighbor
    * Decision Tree
    * Random Forest
    * Naive Bayes Classifier
    * SVM

* 군집분석(clustering)(비지도 학습)
    * K-Means Clustering
    * Hierarchical Clustering
    * Density Clustering

## 알고리즘 설명
* 단순 선형회귀모델은 2차원 평면 위의 직선이므로 다음과 같은 일차함수(1차 다항함수)로 표현할 수 있다.
####  y = wx + b

* 모델에 특정한 값을 입력했을 때 얻어지는 결과를 y^ 라고한다. y^는 모델이 만들어낸 예측값이다.
#### y^ = wx + b

### Linear Regression 종류
* 단순 선형 회귀
    * 단순 선형 회귀는 독립변수가 1개인 선형 회귀를 말한다. y = wx + b

* 다중 선형 회귀
    * feature가 2개 이상인 선형 회귀를 말한다. y = w1x1 + w2x2 + ... +b

### Cost function
* 실제값 y와 예측값 y^의 차이를 오차(error)라고 한다.
* 기계학습은 실제값과 예측값의 오차(error)를 최소화하는 w와 b를 찾아가는 과정이다.
* w, b와 오차의 관계를 함수로 나타낸 것을 비용함수(cost function) 또는 손실함수(loss function)라고 부른다.

### 평균제곱오차(MSE, Mean Squared Errors)
* 오차를 전체적으로 줄여야 예측의 정확도가 높아진다.
* $MSE = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$ (실제값 $y_i$와 예측값 $\hat{y}_i$의 차이를 제곱하여 평균한 값)

### Optimizer
* 비용함수를 최소화하는 w와 b를 구하는 최적화 알고리즘을 옵티마이저라 부른다.
    * 경사하강법(Gradient Descent)
        * 가장 기본적인 옵티마이저이다.
        * 비용함수의 기울기(gradient)를 구한 뒤, 기울기의 반대 방향(비용이 줄어드는 방향)으로 w와 b를 조금씩 업데이트 한다.
        * MSE 비용함수를 w에 대한 그래프로 그리면 y = x ** 2 와 같은 아래로 볼록한 2차 함수 모양이 되며, 기울기가 0이 되는 최저점에서 비용이 최소가 된다.
    

In [7]:
#경사하강법
import numpy as np

# Training inputs 1..5 as a (5, 1) column: one sample per row.
x_data = np.arange(1, 6).reshape(-1, 1)
# Matching labels 2..6, also shaped (5, 1).
t_data = np.arange(2, 7).reshape(-1, 1)
# Slope: a (1, 1) matrix drawn uniformly from [0, 1).
w = np.random.rand(1, 1)
# Bias: a length-1 vector drawn uniformly from [0, 1).
b = np.random.rand(1)

def loss_func(x, t):
    """Return the mean squared error of the linear model against ``t``.

    The prediction is ``np.dot(x, w) + b``, where ``w`` and ``b`` are the
    module-level parameters (they are not passed in). The summed squared
    error is divided by ``len(x)``, i.e. the number of rows of ``x``.
    """
    prediction = np.dot(x, w) + b  # matrix product plus bias
    squared_error = (t - prediction) ** 2
    return np.sum(squared_error) / len(x)


def numerial_derivative(f, x, delta_x=1e-4):
    """Estimate the gradient of ``f`` at ``x`` with central differences.

    (The misspelled name is kept so existing callers keep working.)

    Each element of ``x`` is nudged *in place* by ``+delta_x`` and
    ``-delta_x``, ``f(x)`` is evaluated for both, and the element is then
    restored. Because the perturbation happens in place, ``f`` may ignore
    its argument and read the same array through another reference.

    Args:
        f: Callable invoked as ``f(x)``; its two results are subtracted and
            stored into one gradient element.
        x: NumPy array of parameters. It must have a floating-point dtype:
            an integer array would truncate the perturbation on assignment.
        delta_x: Half-width of the finite-difference step (default 1e-4).

    Returns:
        An array shaped (and typed) like ``x`` holding
        ``(f(x + d) - f(x - d)) / (2 * d)`` for every element.

    Raises:
        Whatever ``f`` raises; ``x`` is restored to its original values
        before the exception propagates.
    """
    grad = np.zeros_like(x)

    for idx in np.ndindex(x.shape):
        original = x[idx]
        try:
            x[idx] = float(original) + delta_x
            upper = f(x)  # f(x + delta_x)

            x[idx] = float(original) - delta_x
            lower = f(x)  # f(x - delta_x)
        finally:
            # Always undo the perturbation, even if f() fails, so the
            # caller's parameters are never left in a shifted state.
            x[idx] = original

        grad[idx] = (upper - lower) / (2 * delta_x)

    return grad

learning_rate = 1e-2  # step size (alpha) of each gradient descent update


def f(x):
    # The argument is ignored: numerial_derivative perturbs w / b in place,
    # and loss_func reads those same module-level arrays.
    return loss_func(x_data, t_data)


print('Initial loss value = ', loss_func(x_data, t_data), "Initial w =", w, '\n',',b = ',b)
for step in range(6001):
    # In-place updates keep w and b the same array objects that loss_func reads.
    w -= learning_rate * numerial_derivative(f, w)
    b -= learning_rate * numerial_derivative(f, b)
    if step % 300:
        continue
    # Report progress every 300 steps.
    print("step = ", step, "loss value = ", loss_func(x_data, t_data), "W =", w, "b =", b)

wb [[0.45471849]] [0.80007115]
Initial loss value =  3.964727708594824 Initial w = [[0.45471849]] 
 ,b =  [0.80007115]
step =  0 loss value =  2.3328831126996543 W = [[0.58667616]] b = [0.82886916]
step =  300 loss value =  0.00013768789406656405 W = [[1.00761982]] b = [0.97249677]
step =  600 loss value =  1.7480000782876963e-05 W = [[1.00271499]] b = [0.99020044]
step =  900 loss value =  2.2191524493906474e-06 W = [[1.00096736]] b = [0.99650836]
step =  1200 loss value =  2.8172982683519627e-07 W = [[1.00034468]] b = [0.99875591]
step =  1500 loss value =  3.576667089740561e-08 W = [[1.00012281]] b = [0.99955672]
step =  1800 loss value =  4.540714632345658e-09 W = [[1.00004376]] b = [0.99984206]
step =  2100 loss value =  5.764609580783928e-10 W = [[1.00001559]] b = [0.99994372]
step =  2400 loss value =  7.318390673760429e-11 W = [[1.00000556]] b = [0.99997995]
step =  2700 loss value =  9.290974748715367e-12 W = [[1.00000198]] b = [0.99999286]
step =  3000 loss value =  1.1795245

In [9]:
def predict(x):
    """Return the model output ``np.dot(x, w) + b`` for input ``x``.

    Uses the module-level parameters ``w`` and ``b`` as they are at call time.
    """
    return np.dot(x, w) + b

# Predict the label for the single input x = 43 using the current w and b.
predict(np.array([43]))

array([44.])

In [8]:
import numpy as np

# Demo: visit every element of a 2-D array with np.nditer and show the
# (row, column) index reported for each visited element.
a = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
])

print(a, '\n')
print('a.shape == ', a.shape, '\n')

it = np.nditer(a, flags=['multi_index'], op_flags=['readwrite'])
for _ in it:
    # multi_index is the index tuple of the element currently being visited.
    idx = it.multi_index
    print(idx, 'current value => ', a[idx])


[[1 2 3 4]
 [5 6 7 8]] 

a.shape ==  (2, 4) 

(0, 0) current value =>  1
(0, 1) current value =>  2
(0, 2) current value =>  3
(0, 3) current value =>  4
(1, 0) current value =>  5
(1, 1) current value =>  6
(1, 2) current value =>  7
(1, 3) current value =>  8


# 입력변수가 2개 이상인 선형회귀 예제(다중 회귀)

In [16]:
import numpy as np
# Load the comma-separated training table as float32.
loaded_data = np.loadtxt('./data1.csv', delimiter=',', dtype=np.float32)
x_data = loaded_data[:, :-1]   # every column except the last: input features
t_data = loaded_data[:, -1:]   # last column, kept 2-D: labels

np.random.seed(42)  # fixed seed so the initial parameters are reproducible
w = np.random.rand(3, 1)  # one weight per input column (3 features)
b = np.random.rand(1)
print("W = ", w, ", W.shape = ", w.shape, ", b = ", b, ", b.shape = ", b.shape)

learning_rate = 1e-5


def f(x):
    # The argument is ignored: numerial_derivative perturbs w / b in place,
    # and loss_func reads those same module-level arrays.
    return loss_func(x_data, t_data)


print("Initial loss value = ", loss_func(x_data, t_data) )
for step in range(30001):
    w -= learning_rate * numerial_derivative(f, w)
    b -= learning_rate * numerial_derivative(f, b)
    if step % 3000:
        continue
    # Report progress every 3000 steps.
    print("step = ", step, "loss value = ", loss_func(x_data, t_data) )

W =  [[0.37454012]
 [0.95071431]
 [0.73199394]] , W.shape =  (3, 1) , b =  [0.59865848] , b.shape =  (1,)
Initial loss value =  18.912018958218844
step =  0 loss value =  11.964774630423882
step =  3000 loss value =  4.418931926580983
step =  6000 loss value =  3.8705650947081622
step =  9000 loss value =  3.718669962755764
step =  12000 loss value =  3.667505157681574
step =  15000 loss value =  3.6490087100776845
step =  18000 loss value =  3.642106732530343
step =  21000 loss value =  3.639430652979643
step =  24000 loss value =  3.638306266857524
step =  27000 loss value =  3.6377530185756117
step =  30000 loss value =  3.637410373246306


In [15]:
loaded_data

array([[ 73.,  80.,  75., 152.],
       [ 93.,  88.,  93., 185.],
       [ 89.,  91.,  90., 180.],
       [ 96.,  98., 100., 196.],
       [ 73.,  66.,  70., 142.],
       [ 53.,  46.,  55., 101.],
       [ 69.,  74.,  77., 149.],
       [ 47.,  56.,  60., 115.],
       [ 87.,  79.,  90., 175.]], dtype=float32)

In [17]:
# One sample with three feature values (one per row of the (3, 1) weight
# matrix w); predict() returns the model output for it.
test_data = np.array([100, 98, 81])
predict(test_data)

array([179.13680055])

# Scikit-learn으로 다중회귀 실습

In [65]:
import warnings

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler    # min-max scaling
from sklearn.preprocessing import StandardScaler  # standard scaling

try:
    from sklearn.datasets import load_boston
except ImportError:
    # load_boston was removed in scikit-learn 1.2. Rebuild an equivalent
    # object from the dataset's original source so the rest of this
    # notebook (data.data / data.target / data.feature_names) keeps working.
    def load_boston():
        """Return the Boston housing data as a Bunch (data, target, feature_names).

        Requires network access: the raw table is downloaded from the
        original StatLib archive, following the recipe in scikit-learn's
        deprecation notice for ``load_boston``.
        """
        import numpy as np
        import pandas as pd
        from sklearn.utils import Bunch

        raw_df = pd.read_csv(
            "http://lib.stat.cmu.edu/datasets/boston",
            sep=r"\s+",
            skiprows=22,
            header=None,
        )
        values = raw_df.values
        # Each record spans two physical lines in the raw file.
        features = np.hstack([values[::2, :], values[1::2, :2]])
        target = values[1::2, 2]
        feature_names = np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ])
        return Bunch(data=features, target=target, feature_names=feature_names)

# NOTE: this silences *all* warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

data = load_boston()
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, random_state=42)

# Min-max scaling, fitted on the training split only.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_minmax = scaler.transform(x_train)

# Standard scaling, fitted on the training split only.
scaler_St = StandardScaler()
scaler_St.fit(x_train)
x_train_St = scaler_St.transform(x_train)

# NOTE: the model below is trained on the *unscaled* x_train; the scaled
# arrays above are computed for demonstration and not used by it.
model = LinearRegression()
model.fit(x_train, y_train)  # train the model


# Predict and evaluate on the held-out split.
# LinearRegression.score returns the R^2 coefficient of determination.
score = model.score(x_test, y_test)
print('정확도: ', score)

coefficient = model.coef_    # learned weights
intercept = model.intercept_ # learned bias

print('계수(w값): ',coefficient)
print('절편: ',intercept)


정확도:  0.6844267283527141
계수(w값):  [-1.28322638e-01  2.95517751e-02  4.88590934e-02  2.77350326e+00
 -1.62388292e+01  4.36875476e+00 -9.24808158e-03 -1.40086668e+00
  2.57761243e-01 -9.95694820e-03 -9.23122944e-01  1.31854199e-02
 -5.17639519e-01]
절편:  29.83642016383914


## 단순 회귀 분석

In [48]:
import pandas as pd
# Put the features into a table with named columns and append the target
# values as an extra 'target_names' column.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(
    target_names=data.target
)
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target_names
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0
