# Logistic Regression 
- 머신러닝에는 지도학습 / 비지도학습 / 강화학습이 있고
- 지도학습에는 회귀(Regression) / 분류(Classification)가 있음
- 이전까지 선형, 다중, 다항 회귀분석을 배웠는데, 회귀는 연속적인 값들을 예측할 때 주로 사용하고
- 분류는 정해진 몇 개의 값 중에서 예측하는거 !  ex) 스팸=0/ 일반메일=1 
- 선형회귀로도 분류가 가능하지만, 예외적인 데이터에 대해서 민감하게 반응하기 때문에 분류 문제에는 선형회귀 사용 X
- 로지스틱 '회귀'라고 부르는 이유: 시그모이드 함수가 0과 1 사이의 값만 돌려주긴 하지만, 어쨌든 그 사이의 연속적인 값 중 하나를 결과로 주기 때문 ㅎ
- 그래도 사용하는 건 분류 

In [2]:
## Logistic Regression 가정함수 구현하기

In [5]:
import numpy as np

In [6]:
# Input features (16 samples each)
hours_studied = np.array([
    0.2, 0.3, 0.7, 1, 1.3, 1.8, 2, 2.1,
    2.2, 3, 4, 4.2, 4, 4.7, 5.0, 5.9,
])  # hours studied (unit: 100 hours)
gpa_rank = np.array([
    0.9, 0.95, 0.8, 0.82, 0.7, 0.6, 0.55, 0.67,
    0.4, 0.3, 0.2, 0.2, 0.15, 0.18, 0.15, 0.05,
])  # GPA rank within the grade (percentile)
number_of_tries = np.array([
    1, 2, 2, 2, 4, 2, 2, 2,
    3, 3, 3, 3, 2, 4, 1, 2,
])  # number of exam attempts

### sigmoid 함수
- 시그모이드 함수 안에 numpy 배열을 넣으면 모든 원소를 시그모이드 함수에 넣은 결과 값을 구할 수 있다

In [16]:
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), element-wise.

    Args:
        x: A scalar, a NumPy array, or any array-like (e.g. a list) of
            real numbers.

    Returns:
        A NumPy float scalar for scalar input, otherwise an array with the
        same shape as ``x`` whose values lie in [0, 1].
    """
    z = np.asarray(x)
    if not np.issubdtype(z.dtype, np.floating):
        # Integers, bools and lists of ints are promoted so the math is
        # done in floating point; float inputs keep their own precision.
        z = z.astype(float)
    # exp() is only ever applied to a non-positive number, so it cannot
    # overflow no matter how large |x| is.
    decay = np.exp(-np.abs(z))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)).
    # Both are the same function, written so the numerator stays finite.
    return np.where(z >= 0, 1.0, decay) / (1.0 + decay)

In [17]:
# Integer sample points from -3 to 3, passed through the sigmoid element-wise
np_array_1 = np.arange(-3, 4)
sigmoid(np_array_1)

array([0.04742587, 0.11920292, 0.26894142, 0.5       , 0.73105858,
       0.88079708, 0.95257413])

In [23]:
def prediction(X, theta):
    """Hypothesis function of logistic regression.

    Multiplies the design matrix ``X`` by the parameter vector ``theta``
    and passes the result through ``sigmoid``.
    """
    linear_scores = X @ theta
    return sigmoid(linear_scores)

In [24]:
# Design matrix X: a leading column of ones (multiplied by theta[0]),
# followed by one column per input feature. The number of rows is taken
# from the data rather than hard-coded.
X = np.column_stack([
    np.ones(len(hours_studied)),
    hours_studied,
    gpa_rank,
    number_of_tries,
])
# Parameter vector theta: one weight per column of X
theta = [0.5, 0.3, -2, 0.2]

In [27]:
# Evaluate the hypothesis for every row of X with the chosen theta
prediction(X, theta)

array([0.26114999, 0.28699984, 0.37989357, 0.39174097, 0.57199613,
       0.55971365, 0.59868766, 0.54735762, 0.72312181, 0.80218389,
       0.86989153, 0.87653295, 0.85814894, 0.91293423, 0.86989153,
       0.9289057 ])

# sklearn - Logistic Regression 
- 로지스틱 회귀(분류)

In [20]:
from sklearn.datasets import load_iris
# 분류문제
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 

In [4]:
# Load the iris dataset and echo the returned object
iris_data = load_iris()
iris_data

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [9]:
# Goal: predict which kind of iris each sample is
print(iris_data.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [15]:
# Put the feature values into a DataFrame with named columns
x = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)

In [16]:
x

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [19]:
# Target variable as a single-column DataFrame
y = pd.DataFrame(data=iris_data.target, columns=["class"])
y

Unnamed: 0,class
0,0
1,0
2,0
3,0
4,0
...,...
145,2
146,2
147,2
148,2


In [28]:
# Hold out 20% of the rows as a test set; the seed is fixed at 5
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
)

In [29]:
# Flatten the single-column target to a 1-D array so that fitting the
# logistic regression does not emit a warning
y_train = y_train.to_numpy().ravel()

In [30]:
# Construct an estimator just to display it; this instance is not kept
LogisticRegression(max_iter=2000, solver="saga")

LogisticRegression(max_iter=2000, solver='saga')

- solver=모델 최적화할 때 어떤 알고리즘 쓸지 결정하는거
- max_iter= 모델을 최적화 할 때 그 과정을 몇 번 반복할지 결정하는거/ 2000번 써놔도 충분히 최적화가 되었다고 판단이 되면 알아서 멈춤
- 학습률 alpha는 알아서 최적화 되어 있으니 따로 써줄 필요 없음

In [31]:
# Estimator that will be trained on the iris split
model = LogisticRegression(max_iter=2000, solver="saga")

In [32]:
# Train on the training split
model.fit(x_train, y_train)

LogisticRegression(max_iter=2000, solver='saga')

In [33]:
# This is a classification task, so the predictions are class labels 0, 1, 2
model.predict(x_test)

array([1, 2, 2, 0, 2, 1, 0, 2, 0, 1, 1, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2,
       0, 1, 1, 2, 1, 1, 1, 2])

In [36]:
# Share of test samples classified correctly (about 97% in this run)
model.score(x_test, y_test)

0.9666666666666667

# 와인 종류 구분하기

In [65]:
# 필요한 라이브러리 불러오기
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd


In [66]:
# Load the wine dataset from sklearn.datasets
wine_data = datasets.load_wine()

In [67]:
# Print the dataset's bundled description text
print(wine_data.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [68]:
# Feature values as a DataFrame with named columns
x = pd.DataFrame(data=wine_data.data, columns=wine_data.feature_names)

In [69]:
x

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


In [70]:
# Target labels as a single-column DataFrame (column keeps its default name)
y = pd.DataFrame(data=wine_data.target)

In [71]:
y

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
173,2
174,2
175,2
176,2


In [72]:
# Hold out 20% of the rows as a test set; the seed is fixed at 5
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
)

In [73]:
# Estimator for the wine data, with a larger iteration budget than before.
# NOTE(review): the features are passed in unscaled; saga presumably
# converges poorly on columns with very different ranges — consider
# standardizing the features first (to be confirmed).
model = LogisticRegression(max_iter=7500, solver="saga")

In [74]:
# Flatten the single-column target to a 1-D array before fitting
y_train = y_train.to_numpy().ravel()

In [75]:
# Train on the training split
model.fit(x_train, y_train)

LogisticRegression(max_iter=7500, solver='saga')

In [76]:
# Predicted class label for every row of the test split
model.predict(x_test)

array([0, 1, 0, 0, 2, 2, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 2, 0, 1, 1, 0, 0, 2, 2])

In [77]:
# Score of the fitted model on the test split, stored and then echoed
score = model.score(x_test, y_test)
score

0.75