In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression, SGDClassifier
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, KFold
import math

# Logistic Regression
- Linear regression을 통해 얻은 y_hat (prediction value)를 사용해서 특정 **class에 포함될 확률**이 얼마인지를 추정한다.
- **Sigmoid function**을 사용한다.
- 해당 확률을 이용하여 **classification**에 사용한다.

$p:$ 해당 class일 확률  
$1 - p:$ 해당 class가 아닐 확률

odds = $p / (1 - p)$  
logit(function), z = $\log$(odds) = $\log(p/(1-p))$  
sigmoid, p = $1 / (1 + e^{-z})$  

# LinearRegression
- **y_hat** = $a*x + b$
- **Loss function** = SSE, MSE

# LogisticRegression
- **z** = sigmoid($a*x + b$)
- **Loss function** = Log Loss
- $-y\log(z) - (1-y)\log(1-z)$
  - $y$: Target value (0, 1)  
    (Class에 속하거나 아니거나)
  - $z$: Predict value

- $y == 0$:  
  $-log(1-z)$
- $y == 1$:  
  $-log(z)$

- **Penalty**: 'none', 'l1', 'l2'(default), 'elasticnet'  
  ('l1': Lasso regularization, 'l2': Ridge regularization, 'elasticnet': both regularizations)

- **Solver**: ‘newton-cg’, ‘lbfgs’(default), ‘liblinear’, ‘sag’, ‘saga’  
  

# Cross_entropy
- Log Loss를 다중 class(class가 3개 이상인 경우)로 일반화한 형태의 loss function

# Import Iris data

In [12]:
# Load the iris dataset: feature matrix X and integer class labels y (0, 1, 2).
iris = datasets.load_iris()
X, y = iris.data, iris.target

print(iris.DESCR)
print(y)

# Stratified split: the class labels in y are passed as `stratify`, so the
# split is made with respect to the classes rather than purely at random.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

# Stratify
- 각 class마다 같은 비율로 training, testing data를 분리한다.
- stratify를 설정하지 않을 경우, class마다 training, testing dataset의 비율에 있어서 차이가 존재한다.

- coef_, intercept_ 의 set은 target의 수(class의 수)에 따라 결정된다.


# Class_0 일 확률을 내는 Model (Example)
- z = $-0.448*a + 0.860*b -2.370*c -1.008*d + 9.622$  
  (decision_function)

In [3]:
# Fit a logistic-regression classifier on the training split using the SAG solver.
l1 = LogisticRegression(solver='sag')
l1.fit(X_train, y_train)

# Accuracy on the held-out split, followed by the learned parameters.
# coef_ and intercept_ hold one set of values per class.
print(l1.score(X_test, y_test))
print(l1.coef_)
print(l1.intercept_)

0.9736842105263158
[[ 0.57416299  1.45268554 -2.17534785 -0.98715642]
 [ 0.27967105 -0.15925862  0.09468578 -0.9893571 ]
 [-0.85383404 -1.29342692  2.08066206  1.97651352]]
[ 1.37059634  1.53846864 -2.90906499]




- Stochastic Average Gradient (SAG) algorithm이 가장 score가 높게 나왔다.
- 확률이 가장 높은 class로 결정한다.
- decision_function의 결과값은 sigmoid function에 넣기 전의 y_hat 값(선형 결합 `np.dot(x, coef_) + intercept_`)이며, 이 값이 클수록 해당 class의 확률도 높다.
- 따라서, decision_function이 가장 높은 class로 결정한다.

In [13]:
# Raw per-class decision scores and predicted class probabilities for the test split.
d = l1.decision_function(X_test)
p = l1.predict_proba(X_test)  # probability
print(d[0])

# Reproduce the decision scores of the first test sample by hand:
# score_k = x . coef_[k] + intercept_[k] for each class k.
print(np.dot(X_test[0], l1.coef_[0]) + l1.intercept_[0])    # score for class 0
print(np.dot(X_test[0], l1.coef_[1]) + l1.intercept_[1])    # score for class 1
print(np.dot(X_test[0], l1.coef_[2]) + l1.intercept_[2])    # score for class 2

# Element-wise sigmoid of the scores. It is written inline so this cell does not
# depend on the `mysig` helper, which is only defined in a later cell and would
# raise NameError on a fresh "Restart & Run All".
# NOTE(review): for a 3-class model these values are not expected to equal
# predict_proba, which appears to normalise the scores across classes (softmax)
# -- confirm against the scikit-learn documentation.
print(1 / (1 + np.exp(-d[0])))
print(p[0])

print('Predict: ',np.argmax(p, axis = 1))  # class with the highest probability (prediction)
print('Predict: ',l1.predict(X_test))      # prediction from the model itself
print('Target: ',y_test)                   # ground-truth labels

[ 4.90102032  2.10552481 -7.00654513]
4.901020323322449
2.1055248089547858
-7.006545132277051
[9.92615941e-01 8.91439002e-01 9.05113095e-04]
[9.42425939e-01 5.75677101e-02 6.35122152e-06]
Predict:  [0 2 0 2 2 0 2 2 2 0 0 2 1 0 0 0 1 1 2 2 1 2 2 0 1 0 1 0 1 2 1 1 0 1 1 2 2
 0]
Predict:  [0 2 0 2 2 0 2 2 2 0 0 2 1 0 0 0 1 1 2 2 1 2 2 0 1 0 1 0 1 2 1 1 0 1 1 2 2
 0]
Target:  [0 2 0 2 2 0 2 2 2 0 0 2 1 0 0 0 1 1 1 2 1 2 2 0 1 0 1 0 1 2 1 1 0 1 1 2 2
 0]


# Sigmoid function

In [7]:
def mysig(z):
  """Logistic sigmoid 1 / (1 + exp(-z)), evaluated element-wise without overflow.

  Args:
    z: A real scalar or array-like of real scores.

  Returns:
    The sigmoid of ``z``: a NumPy scalar for scalar input, otherwise an
    ndarray with the same shape as ``z``.
  """
  z = np.asarray(z, dtype=float)
  # exp(-|z|) always lies in (0, 1], so it cannot overflow the way exp(-z)
  # does for large negative z.
  e = np.exp(-np.abs(z))
  # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- the same value.
  out = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
  # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as they are.
  return out[()]


# Stochastic Gradient Descent Classifier (SGDClassifier)
- SGD(Stochastic Gradient Descent)로 학습하는 범용 linear classifier이며, loss를 log loss로 설정하면 Logistic Regression과 같은 model이 된다.
- https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier

In [8]:
# SGDClassifier with the logistic loss trains a logistic-regression model by
# stochastic gradient descent. The loss name 'log' was renamed to 'log_loss'
# (scikit-learn >= 1.1) and the old spelling is rejected by current releases.
l2 = SGDClassifier(loss = 'log_loss')
l2.fit(X_train, y_train)
l2.score(X_test, y_test)

0.9473684210526315

# Application
- Wine의 특징을 통해 wine의 class 분류하기
- independent variables: Alcohol, color intensity, Hue
- Target: class

- Logistic Regression or SGDClassifier

In [9]:
# Load the wine dataset in pandas form (as_frame=True) and print its feature table.
wine = datasets.load_wine(as_frame=True)
print(wine.data)

     alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0      14.23        1.71  2.43               15.6      127.0           2.80   
1      13.20        1.78  2.14               11.2      100.0           2.65   
2      13.16        2.36  2.67               18.6      101.0           2.80   
3      14.37        1.95  2.50               16.8      113.0           3.85   
4      13.24        2.59  2.87               21.0      118.0           2.80   
..       ...         ...   ...                ...        ...            ...   
173    13.71        5.65  2.45               20.5       95.0           1.68   
174    13.40        3.91  2.48               23.0      102.0           1.80   
175    13.27        4.28  2.26               20.0      120.0           1.59   
176    13.17        2.59  2.37               20.0      120.0           1.65   
177    14.13        4.10  2.74               24.5       96.0           2.05   

     flavanoids  nonflavanoid_phenols  proanthocyan

# Classifier

In [10]:
# Inputs: alcohol, color intensity and hue; target: the wine class.
X, y = wine.data[['alcohol', 'color_intensity', 'hue']].values, wine.target.values

# Stratified split on the class labels.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)

# Fit one LogisticRegression per solver and report its test accuracy.
# Each entry is (solver name, label used in the printed line).
wine_logistic = {}
for solver, label in [
    ('newton-cg', 'Newton-cg'),
    ('lbfgs', 'lbfgs'),
    ('liblinear', 'liblinear'),
    ('sag', 'sag'),
    ('saga', 'saga test'),
]:
    clf = LogisticRegression(max_iter = 10000, solver = solver).fit(X_train, y_train)
    wine_logistic[solver] = clf
    print(label + ' Score: ', clf.score(X_test, y_test))

# For saga, also show the training accuracy so train and test can be compared.
print('saga train Score: ', wine_logistic['saga'].score(X_train, y_train))

# Keep the original per-solver variable names available to later cells.
(wine_classifier1, wine_classifier2, wine_classifier3,
 wine_classifier4, wine_classifier5) = wine_logistic.values()

# Logistic regression trained by SGD. The loss name 'log' was renamed to
# 'log_loss' (scikit-learn >= 1.1) and the old spelling is rejected by
# current releases.
wine_classifier6 = SGDClassifier(loss = 'log_loss', max_iter = 10000).fit(X_train, y_train)
print('SGD test Score: ', wine_classifier6.score(X_test, y_test))
print('SGD train Score: ', wine_classifier6.score(X_train, y_train))

Newton-cg Score:  0.8222222222222222
lbfgs Score:  0.8222222222222222
liblinear Score:  0.6888888888888889
sag Score:  0.8
saga test Score:  0.7333333333333333
saga train Score:  0.8796992481203008
SGD test Score:  0.6888888888888889
SGD train Score:  0.7368421052631579


- shuffle = False와 stratify는 함께 사용이 불가능하다. (stratify는 shuffle = True(default)일 때만 사용할 수 있다.)
- stratify 없이 shuffle만 했을 경우, training data와 testing data에 특정 class가 몰리면 학습과 예측이 제대로 이루어지지 않는다.
