# Tobig's 19기 2주차 Optimization 과제

# Gradient Descent 구현하기

### 1)"..."표시되어 있는 빈 칸을 채워주세요
### 2)강의내용과 코드에 대해 공부한 내용을 마크마운 또는 주석으로 설명해주세요

## 데이터

In [1]:
import pandas as pd
import numpy as np
import random
import math  #지수사용할 때 사용

In [2]:
data = pd.read_csv('assignment_2.csv')
data.head()

Unnamed: 0,Label,bias,experience,salary
0,1,1,0.7,48000
1,0,1,1.9,48000
2,1,1,2.5,60000
3,0,1,4.2,63000
4,0,1,6.0,76000


## Train Test 데이터 나누기

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, 1:], data.iloc[:, 0], test_size = 0.25, random_state = 0)

In [5]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((150, 3), (50, 3), (150,), (50,))

## Scaling

experience와 salary의 단위, 평균, 분산이 크게 차이나므로 scaler를 사용해 단위를 맞춰줍니다. 

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
bias_train = X_train["bias"] 
bias_train = bias_train.reset_index()["bias"]   #인덱스 맞춰줌.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_train["bias"] = bias_train  #라벨값 저장한거 다시 사용.
X_train.head()

Unnamed: 0,bias,experience,salary
0,1,0.187893,-1.143335
1,1,1.185555,0.043974
2,1,-0.310938,-0.351795
3,1,-1.629277,-1.34122
4,1,-1.3086,0.043974


이때 scaler는 X_train에 fit 해주시고, fit한 scaler를 X_test에 적용시켜줍니다.  
똑같이 X_test에다 fit하면 안돼요!

In [7]:
bias_test = X_test["bias"]
bias_test = bias_test.reset_index()["bias"]
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
X_test["bias"] = bias_test
X_test.head()

Unnamed: 0,bias,experience,salary
0,1,-1.344231,-0.615642
1,1,0.50857,0.307821
2,1,-0.310938,0.571667
3,1,1.363709,1.956862
4,1,-0.987923,-0.747565


In [8]:
# parameter 개수
N = len(X_train.loc[0])

In [9]:
# 초기 parameter들을 임의로 설정해줍니다.
parameters = np.array([random.random() for i in range(N)])
random_parameters = parameters.copy()
parameters

array([0.64797435, 0.57814249, 0.94244958])

### * LaTeX   

Jupyter Notebook은 LaTeX 문법으로 수식 입력을 지원하고 있습니다.  
LaTeX문법으로 아래의 수식을 완성해주세요  
http://triki.net/apps/3466  
https://jjycjnmath.tistory.com/117

## Dot product
## $z = X_i \theta$

In [10]:
def dot_product(X, parameters):
    z = 0
    for i in range(len(parameters)):
        z += parameters[i]*X[i]
    return z   #내적이니까 그냥 맞는 번호끼리 곱한 걸 return

## Logistic Function

## $p = "..."$

In [11]:
def logistic(X, parameters):
    z = dot_product(X,parameters)
    p = 1/(1+math.exp(z))
    return p #1/1+e^z

In [12]:
logistic(X_train.iloc[1], parameters)

0.20183934219177135

## Object function

Object Function : 목적함수는 Gradient Descent를 통해 최적화 하고자 하는 함수입니다.  
<br>
선형 회귀의 목적함수
## $l(\theta) = \frac{1}{2}\Sigma(y_i - \theta^{T}X_i)^2$  
참고) $\hat{y_i} = \theta^{T}X_i$
  
로지스틱 회귀의 목적함수를 작성해주세요  
(선형 회귀의 목적함수처럼 강의에 나온대로 작성해주세요. 평균을 고려하는 것은 뒤에 코드에서 수행합니다)
## $l(p) =$ "..."

In [13]:
def minus_log_cross_entropy_i(X, y, parameters):
    p = logistic(X,parameters)
    loss= -y * math.log(p) - (1 - y) * math.log(1 - p)
    return loss 

In [14]:
def mse_i(X, y, parameters):
    y_hat = dot_product(X,parameters)
    loss = (y-y_hat)**2/2
    return loss

In [15]:
def batch_loss(X_set, y_set, parameters, loss_function, n): #n:현재 배치의 데이터 수
    loss = 0
    for i in range(X_set.shape[0]):
        X = X_set.iloc[i,:]
        y = y_set.iloc[i]
        loss += loss_function(X,y,parameters)
    loss = loss/n #loss 평균값으로 계산
    return loss

In [16]:
batch_loss(X_test, y_test, parameters, minus_log_cross_entropy_i, len(X_test))

0.7108124539734206

## Gradient
위의 선형회귀의 목적함수 $l(\theta)$와 로지스틱회귀의 목적함수 $l(p)$의 gradient를 작성해주세요  
(위의 목적함수를 참고해서 작성해주세요 = 평균을 고려하는 것은 뒤에 코드에서 수행합니다)

## ${\partial\over{\partial \theta_j}}l(\theta)=$ "..."
## ${\partial\over{\partial \theta_j}}l(p)=$ "..."

In [17]:
def get_gradient_ij(X, y, parameters, j, model):
    if model == 'linear':
        y_hat = dot_product(X,parameters)
        gradient = parameters[j]*(y_hat-y)
    else:
        p = logistic(X,parameters)
        gradient = parameters[j]*(y-p)
    return gradient

In [18]:
get_gradient_ij(X_train.iloc[0,:], y_train.iloc[0], parameters, 1, 'logistic')

0.2430791811482231

## Batch Gradient
하나의 배치 (X_set, y_set)에 대해 기울기를 구하는 코드를 작성해주세요

In [27]:
def batch_gradient(X_set, y_set, parameters, model):
    gradients = [0 for _ in range(len(parameters))]
    
    for i in range(len(X_set)):
        X = X_set.iloc[i,:]
        y = y_set.iloc[i]
        for j in range(len(parameters)):
            gradients[j] += get_gradient_ij(X,y,parameters,j,logistic)
    
    return gradients

In [28]:
gradients1 = batch_gradient(X_train, y_train, parameters, 'logistic')
gradients1

[-10.920020812952352, -9.743175745056941, -15.882679571853128]

## mini-batch
인덱스로 미니 배치 나누기

In [38]:
def batch_idx(X_train, batch_size):
    N = len(X_train)
    nb = (N // batch_size)+1 #number of batch
    idx = np.array([i for i in range(N)])
    idx_list = [idx[i*batch_size:(i+1)*batch_size] for i in range(nb) if len(idx[i*batch_size:(i+1)*batch_size]) != 0]
    return idx_list

batch_idx 함수에 대한 설명을 batch_size와 함께 간략하게 작성해주세요  
### 설명: idx_list는 이제 batch_size에 맞게 쪼개서 np.array로 저장을 해주는 것이다. 따라서 편하게 idx_list에 있는 걸 반복문에 돌려주면 batch_gradient를 구할 수 있다.

## Update Parameters
기울기를 갱신하는 코드를 작성해주세요  
(loss와 마찬가지로 기울기를 갱신할 때 배치 사이즈를 고려해 평균으로 갱신해주세요)

In [34]:
def step(parameters, gradients, learning_rate, n): #n:현재 배치의 데이터 수
    for i in range(len(parameters)):
        gradients[i] *= learning_rate/n
    
    parameters -= gradients
    return parameters

In [45]:
parameters

array([0.64870235, 0.57879203, 0.94350842])

In [46]:
step(parameters, gradients1, 0.01, len(X_train))

array([0.6487024 , 0.57879207, 0.94350849])

## Gradient Descent
위에서 작성한 함수들을 조합해서 경사하강법 함수를 완성해주세요

- learning_rate: 학습률  
- tolerance: Step이 너무 작아서 더 이상의 학습이 무의미할 때 학습을 멈추는 조건  
- batch: 기울기를 1번 갱신할 때 사용하는 데이터셋  
- epoch:  
- num_epoch:
<br>

BGD: "batch gradient descent"   전체 데이터셋
SGD: "stochastic gradient descent"  1개씩 처리
MGD: "mini-batch gradient descent"  batch_size 갯수만큼 처리 뒤에 두개를 섞음.
<br>
batch_size에 따른 경사하강법의 종류를 적어주세요  
batch_size=1 -> "SGD"  
batch_size=k -> "MGD"  
batch_size=whole -> "BGD"  

In [43]:
def gradient_descent(X_train, y_train, learning_rate = 0.1, num_epoch = 1000, tolerance = 0.00001, model = 'logistic', batch_size = 16):
    stopper = False
    
    N = len(X_train.iloc[0])
    parameters = np.random.rand(N)
    loss_function = minus_log_cross_entropy_i if model == 'logistic' else mse_i
    loss = 999
    batch_idx_list = batch_idx(X_train, batch_size)
    
    for epoch in range(num_epoch):
        if stopper:
            break
        for idx in batch_idx_list:
            X_batch = X_train.iloc[idx,]
            y_batch = y_train.iloc[idx]
            gradients = batch_gradient(X_batch,y_batch,parameters,model)
            parameters = step(parameters,gradients,learning_rate,batch_size)
            new_loss = batch_loss(X_batch,y_batch,parameters,loss_function,batch_size)
            
            #중단 조건
            if abs(new_loss - loss) < tolerance:
                stopper = True
                break
            loss = new_loss
        
        #100epoch마다 학습 상태 출력
        if epoch%100 == 0: #출력이 길게 나오면 check point를 수정해도 됩니다.
            print(f"epoch: {epoch}  loss: {new_loss}  params: {parameters}  gradients: {gradients}")
    
    return parameters

## Implement
경사하강법 함수를 이용해 최적의 모수 찾아보세요. 학습을 진행할 때, Hyper Parameter를 바꿔가면서 학습시켜보세요.

## Logistic Regression

In [44]:
new_param_bgd = gradient_descent(X_train, y_train, 0.01,1000,0.0001,logistic,16)
new_param_bgd

epoch: 0  loss: 0.8203341423362502  params: [0.84664104 0.96185333 0.48791895]  gradients: [9.954958577586994e-05, 0.00011309645548104348, 5.737039324336483e-05]
epoch: 100  loss: 2.6069543777571873  params: [1.49262617 1.69574516 0.86019996]  gradients: [0.0006707294026066438, 0.000762003351701707, 0.00038654113211546626]
epoch: 200  loss: 6.407944867655927  params: [2.32270153 2.63877854 1.33857212]  gradients: [0.0013405401983295807, 0.001522963090999429, 0.0007725528713887269]
epoch: 300  loss: 15.705050725794266  params: [3.6161462  4.10823726 2.08398385]  gradients: [0.0022352442189021576, 0.0025394198913242035, 0.0012881705015035682]
epoch: 400  loss: 42.08515554200506  params: [5.89492972 6.69712135 3.39724604]  gradients: [0.003684774640538229, 0.004186204772660382, 0.0021235344024113194]
epoch: 500  loss: 122.92512608063922  params: [10.04581832 11.41286963  5.78940176]  gradients: [0.006282549917494821, 0.007137489538641723, 0.003620631432352102]
epoch: 600  loss: 372.903655

OverflowError: math range error

In [48]:
new_param_sgd = gradient_descent(X_train, y_train, 0.01,1000,0.00001,'linear',16)
new_param_sgd

epoch: 0  loss: 0.2885798720604242  params: [0.48090899 0.2569726  0.58051651]  gradients: [-0.00014150703246484448, -7.561395497495832e-05, -0.00017081645625630895]
epoch: 100  loss: 1.4053403868497958  params: [1.06883777 0.57113099 1.29021913]  gradients: [0.00029367729717944956, 0.00015692578339957186, 0.00035450475014086116]
epoch: 200  loss: 3.5906421717251122  params: [1.70098759 0.90891878 2.053302  ]  gradients: [0.0008405491031022276, 0.0004491454659824306, 0.0010146465274580027]
epoch: 300  loss: 7.158641969967798  params: [2.39400057 1.27922866 2.88985422]  gradients: [0.0013836295704438904, 0.0007393392555776822, 0.001670211691093026]
epoch: 400  loss: 12.67676604727849  params: [3.1784353  1.69838954 3.83676377]  gradients: [0.001937807667382396, 0.0010354630378386536, 0.0023391730635775793]
epoch: 500  loss: 20.698347158061267  params: [4.05468739 2.16661281 4.8945082 ]  gradients: [0.0025164922521573193, 0.001344681805101997, 0.0030377167920383]
epoch: 600  loss: 31.520

array([ 8.8321153 ,  4.71942035, 10.66145343])

In [None]:
new_param_mgd = gradient_descent(X_train, y_train, "...")
new_param_mgd

### 어디인지 문제를 정확히 찾지 못해서 제출 후 다시 찾아봐야 할 거 같아서 여기까지 하고 제출합니다.

In [None]:
y_predict = []
for i in range(len(y_test)):
    p = logistic(X_test.iloc[i,:], new_param_bgd)
    if p> 0.5 :
        y_predict.append(1)
    else :
        y_predict.append(0)
y_predict_random = []
for i in range(len(y_test)):
    p = logistic(X_test.iloc[i,:], random_parameters)
    if p> 0.5 :
        y_predict_random.append(1)
    else :
        y_predict_random.append(0)

### Confusion Matrix

In [None]:
from sklearn.metrics import *

In [None]:
tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
confusion_matrix(y_test, y_predict)

In [None]:
accuracy = (tp+tn) / (tp+fn+fp+tn)
print("accuracy:",accuracy)

## Linear regression
### $y = 0.5 + 2.7x$

### Data

In [None]:
raw_X = np.random.rand(150)
y = 2.7*raw_X + 0.5 + np.random.randn(150)

In [None]:
tmp = np.array([1 for _ in range(150)])
X = np.vstack((tmp, raw_X)).T
X = pd.DataFrame(X)
y = pd.Series(y)

### Estimation

In [None]:
#정규방정식
theta = np.linalg.inv(np.dot(X.T,X)).dot(X.T).dot(y)
theta

In [None]:
#경사하강법
new_param = gradient_descent(X, y, "...")
new_param

In [None]:
y_hat_NE = theta.dot(X.T)
y_hat_GD = new_param.dot(X.T)

### Visualization
시각화를 통해 정규방정식과 경사하강법을 통한 선형회귀를 비교해보세요  
(밑의 코드를 실행만 시키면 됩니다. 추가 코드 x)

In [None]:
import matplotlib.pyplot as plt
plt.plot(X.iloc[:,1], y, '.k') #산점도
plt.plot(X.iloc[:,1], y_hat_NE, '-b', label = 'NE') #정규방정식
plt.plot(X.iloc[:,1], y_hat_GD, '-r', label = 'GD') #경사하강법
plt.legend()
plt.show()