# 지도학습 - 분류

**이진분류** : 주어진 데이터에 대해 두 가지 중 하나로 분류하는 것 <br>
**다중분류** : 주어진 데이터에 대해 여러 가지 중 하나로 분류하는 것

## 1. 의사결정트리
- **수치형, 범주형 데이터 모두 사용 가능**
- 과대적합 주의 필요
- 부트스트랩 기반 샘플링을 활용한 의사결정나무 생성 후 배깅 기반 나무들을 모아 앙상블 학습하면 랜덤포레스트가 됨
- 범주형 데이터 - 레이블 인코딩

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [8]:
# Load the Titanic passenger data from the public GitHub mirror.
TITANIC_URL = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(TITANIC_URL)
df.shape  # (rows, columns)

(891, 12)

In [9]:
df.info()
# target : Survived
# object columns : Name, Sex (encode), Ticket, Cabin (drop), Embarked (encode)
# missing values : Age, Cabin, Embarked

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [10]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
# Summary statistics, transposed so that each column becomes a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


### 결측치 처리

In [12]:
# Missing-value plan: Age -> mean, Cabin -> drop the column, Embarked -> mode.
df.drop('Cabin', axis=1, inplace=True)  # remove the Cabin column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [16]:
# Impute missing Age values with the column mean and show the NaN count
# before and after.
print(df['Age'].isna().sum())
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in recent pandas
# and does not update df when Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())
print(df['Age'].isna().sum())

177
0


In [18]:
# Impute missing Embarked values with the most frequent value and show the
# NaN count before and after.
print(df['Embarked'].isna().sum())
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in recent pandas
# and does not update df when Copy-on-Write is enabled.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
print(df['Embarked'].isna().sum())

2
0


### 범주형 변수 처리 : 레이블 인코딩

In [30]:
# Label-encode the categorical columns Sex and Embarked.
from sklearn.preprocessing import LabelEncoder

for col in ('Sex', 'Embarked'):
    # A fresh encoder per column, since each column has its own label set.
    df[col] = LabelEncoder().fit_transform(df[col])

인코딩 후 라벨과 인코딩 값 확인 방법

In [78]:
from sklearn.preprocessing import LabelEncoder

# Sample labels; the repeated entry shows that equal labels share one code.
labels = ["사과", "바나나", "체리", "사과"]

# Fit the encoder on the labels, then encode the same labels.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit(labels).transform(labels)

# Pair each original label with its integer code and show the mapping.
label_mapping = {label: code for label, code in zip(labels, encoded_labels)}
print(label_mapping)

{'사과': 1, '바나나': 0, '체리': 2}


### 파생변수 생성

In [33]:
# Derive FamilySize as the row-wise sum of SibSp and Parch, then drop the
# two source columns (the book keeps them).
family_cols = ['SibSp', 'Parch']
df['FamilySize'] = df[family_cols].sum(axis=1)
df.drop(columns=family_cols, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   Ticket       891 non-null    object 
 7   Fare         891 non-null    float64
 8   Embarked     891 non-null    int64  
 9   FamilySize   891 non-null    int64  
dtypes: float64(2), int64(6), object(2)
memory usage: 69.7+ KB


### 분석 데이터셋 준비

In [37]:
# Columns that are not used as model features.
unused_cols = ['PassengerId', 'Name', 'Ticket']
df.drop(columns=unused_cols, inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,0,3,1,22.0,7.25,2,1
1,1,1,0,38.0,71.2833,0,1
2,1,3,0,26.0,7.925,2,0
3,1,1,0,35.0,53.1,2,1
4,0,3,1,35.0,8.05,2,0


In [39]:
# Separate the target column from the feature matrix.
target = 'Survived'
y = df[target]
X = df.drop(columns=target)

X.shape, y.shape

((891, 6), (891,))

In [40]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on y, with a fixed seed.
split = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = split
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(712, 6) (179, 6)
(712,) (179,)


### 데이터 분석

In [43]:
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): no random_state is passed, so results may vary between runs — confirm whether that is intended.
tree = DecisionTreeClassifier()  # create the classifier with default settings
tree.fit(X_train, y_train)  # train on the training split

In [45]:
pred = tree.predict(X_test)  # predict labels for the test split

### 성능 평가

In [48]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Evaluate the test-set predictions: confusion matrix, per-class report, accuracy.
cm = confusion_matrix(y_test, pred)
report = classification_report(y_test, pred)
acc = accuracy_score(y_test, pred)
print(cm)
print(report)
print('accuracy :', acc)

[[94 16]
 [21 48]]
              precision    recall  f1-score   support

           0       0.82      0.85      0.84       110
           1       0.75      0.70      0.72        69

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179

accuracy : 0.7932960893854749


## 2. KNN (K-Nearest Neighbor)
- 데이터로부터 거리가 가까운 K개의 다른 데이터 목표값을 참조하여 분류
- **변수별 단위가 무엇이냐에 따라 결과가 달라짐 => 표준화 필요**
- 차원(벡터)의 크기가 크면 계산량이 많아짐 (속도 저하)
- k 값에 따라 분류의 정확도가 달라지므로, 적절한 k를 찾아야 함
- 범주형 데이터 - 원핫 인코딩

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [51]:
# Load the iris dataset and show its shape and column summary.
df2 = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv")
print(df2.shape)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df2.info()

(150, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None


In [52]:
# Preview the first rows of the iris data.
df2.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [54]:
# Summary statistics, transposed so that each column becomes a row.
df2.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal_length,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
sepal_width,150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
petal_length,150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
petal_width,150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5


### 데이터 전처리 (표준화)

In [58]:
from sklearn.preprocessing import StandardScaler

# Standardize the four measurement columns with a single fit. StandardScaler
# scales each column independently, so the values match a column-by-column
# fit, but the scaler now keeps the statistics of all four features instead
# of only the last one and can be reused (transform / inverse_transform).
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split below; fit it on the training split only if test-set
# leakage matters for the evaluation.
scaler = StandardScaler()
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
df2[feature_cols] = scaler.fit_transform(df2[feature_cols])
df2.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,-0.900681,1.019004,-1.340227,-1.315444,setosa
1,-1.143017,-0.131979,-1.340227,-1.315444,setosa
2,-1.385353,0.328414,-1.397064,-1.315444,setosa
3,-1.506521,0.098217,-1.283389,-1.315444,setosa
4,-1.021849,1.249201,-1.340227,-1.315444,setosa


### 분석 데이터셋 준비

In [59]:
# Separate the species label from the measurement columns.
target = 'species'
y = df2[target]
X = df2.drop(columns=target)

print(X.shape, y.shape)

# Hold out 20% of the rows for testing, stratified on y, with a fixed seed.
split = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = split
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(150, 4) (150,)
(120, 4) (30, 4)
(120,) (30,)


### 데이터 분석 수행

In [61]:
# One KNN classifier per candidate k (3, 6, 9), each fitted on the training split.
knn3, knn6, knn9 = (KNeighborsClassifier(n_neighbors=k) for k in (3, 6, 9))

knn3.fit(X_train, y_train)
knn6.fit(X_train, y_train)
knn9.fit(X_train, y_train)

In [62]:
# Predict with each fitted model. Every k must use its own classifier:
# predicting all three with knn3 makes the accuracies trivially identical.
# NOTE: any recorded output produced before this fix may differ after re-running.
pred3 = knn3.predict(X_test)
pred6 = knn6.predict(X_test)
pred9 = knn9.predict(X_test)

print('k:3 accuracy =', accuracy_score(y_test, pred3))
print('k:6 accuracy =', accuracy_score(y_test, pred6))
print('k:9 accuracy =', accuracy_score(y_test, pred9))

k:3 accuracy = 0.9666666666666667
k:6 accuracy = 0.9666666666666667
k:9 accuracy = 0.9666666666666667


## 3. SVM
- 데이터가 사상된 공간에서 그룹 사이의 거리가 가장 큰 경계를 찾는 알고리즘
- 커널 트릭 사용 : 다양한 특성의 데이터 사용 가능
- 적은 학습 데이터로도 높은 정확도
- 변수가 많은 경우 결정 경계 및 데이터의 시각화가 어려워 분류 결과 이해 어려움
- 범주형 데이터 - 원핫인코딩

In [65]:
from sklearn.svm import SVC

In [64]:
# Load a fresh copy of the Titanic data for the SVM example.
df3 = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")
print(df3.shape)
# DataFrame.info() prints its report itself and returns None; calling it inside
# print() emits the report first and then a stray "None" next to the shape.
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
(891, 12) None


In [66]:
# Drop the columns (not rows) that are not used as features.
unused_cols = ['PassengerId', 'Name', 'Cabin', 'Ticket']
df3.drop(columns=unused_cols, inplace=True)
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


### 결측치 처리

In [67]:
# Impute missing Age values with the column mean and show the NaN count
# before and after.
print(df3['Age'].isna().sum())
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in recent pandas
# and does not update df3 when Copy-on-Write is enabled.
df3['Age'] = df3['Age'].fillna(df3['Age'].mean())
print(df3['Age'].isna().sum())

177
0


In [68]:
# Impute missing Embarked values with the most frequent value and show the
# NaN count before and after.
print(df3['Embarked'].isna().sum())
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in recent pandas
# and does not update df3 when Copy-on-Write is enabled.
df3['Embarked'] = df3['Embarked'].fillna(df3['Embarked'].mode()[0])
print(df3['Embarked'].isna().sum())

2
0


### 파생변수 생성

In [69]:
# Derive FamilySize as the row-wise sum of SibSp and Parch, then drop the
# two source columns.
family_cols = ['SibSp', 'Parch']
df3['FamilySize'] = df3[family_cols].sum(axis=1)
df3.drop(columns=family_cols, inplace=True)
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Sex         891 non-null    object 
 3   Age         891 non-null    float64
 4   Fare        891 non-null    float64
 5   Embarked    891 non-null    object 
 6   FamilySize  891 non-null    int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB


### 범주형 변수 처리 : 원핫 인코딩
- 판다스 get_dummies() 함수 사용
- 원핫 인코딩 시 새로운 컬럼 생성 > 데이터프레임과 결합 > 기존 컬럼 삭제

In [72]:
# Build the indicator columns for both categorical features first ...
oh_sex = pd.get_dummies(df3['Sex'])
oh_embarked = pd.get_dummies(df3['Embarked'])

# ... append them in one concat (Sex indicators first, then Embarked) ...
df3 = pd.concat([df3, oh_sex, oh_embarked], axis=1)

# ... and remove the original categorical columns.
df3.drop(columns=['Sex', 'Embarked'], inplace=True)
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Age         891 non-null    float64
 3   Fare        891 non-null    float64
 4   FamilySize  891 non-null    int64  
 5   female      891 non-null    bool   
 6   male        891 non-null    bool   
 7   C           891 non-null    bool   
 8   Q           891 non-null    bool   
 9   S           891 non-null    bool   
dtypes: bool(5), float64(2), int64(3)
memory usage: 39.3 KB


### 분석 데이터셋 준비 & 분석 수행

In [74]:
# Separate the target column from the feature matrix.
target = 'Survived'
y = df3[target]
X = df3.drop(columns=target)

# Hold out 30% of the rows for testing, stratified on y, with a fixed seed.
split = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = split
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(623, 9) (268, 9)
(623,) (268,)


In [75]:
# RBF-kernel support vector classifier fitted on the training split.
# NOTE(review): the features are not standardized before fitting; RBF SVMs are
# typically sensitive to feature scale — consider scaling and re-checking accuracy.
svm = SVC(kernel='rbf')
svm.fit(X_train, y_train)

In [76]:
# Predict the test split and report the accuracy.
pred = svm.predict(X_test)
acc = accuracy_score(y_test, pred)

print(acc)

0.6492537313432836


In [77]:
# Confusion matrix and per-class precision/recall/F1 for the SVM predictions.
cm = confusion_matrix(y_test, pred)
report = classification_report(y_test, pred)
print(cm)
print(report)

[[148  17]
 [ 77  26]]
              precision    recall  f1-score   support

           0       0.66      0.90      0.76       165
           1       0.60      0.25      0.36       103

    accuracy                           0.65       268
   macro avg       0.63      0.57      0.56       268
weighted avg       0.64      0.65      0.60       268



### SVM 커널 파라미터 조정
- C : 비용 (ex. 0.01, 0.1, 1, 10, 100, 1000)
- gamma : 허용 표준편차 (커지면 데이터 포인트별로 허용하는 표준편차가 작아지고, 결정 경계도 작아지면서 구부러짐 ex.0.0001, 0.001, 0.01, 0.1, 1)

## 4. 로지스틱 회귀 분류
- 정답이 있는 데이터 사용 분류 작업 수행
- **시그모이드 함수 출력값을 각 분류 항목에 속하게 될 확률값으로 사용.** 0~1 사이의 실수
- 확률에 따라 가능성이 더 높은 범주에 속하는 것으로 분류하는 이진 분류 모델
- 현재 갖고 있는 데이터를 통해 에러를 줄이는 방향으로 weight와 bias의 최적값 찾음
- 범주형 데이터 - 원핫 인코딩

- 규제 유형과 강도에 따라 분류 정확도가 달라짐 (규제는 overfitting 방지 위함)
- **규제 유형 - penalty 매개변수** : L2(default, 릿지 방식), L1(라쏘 방식)
- **규제 강도 - C 매개변수** : 1(default, 작을수록 규제가 강해짐)
- **predict_proba()** : 각 분류 항목에 속할 확률 확인 가능
- **decision_function()** : 모델이 학습한 선형 방정식의 계산값(z) 확인 가능, 다중 분류일 경우 각 분류 항목마다 선형 방정식 값을 계산

In [81]:
from sklearn.linear_model import LogisticRegression

# Load a fresh copy of the iris dataset for the logistic-regression example.
df4 = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv")
print(df4.shape)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df4.info()

(150, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None


In [82]:
# Standardize the four measurement columns with a single fit. StandardScaler
# scales each column independently, so the values match a column-by-column
# fit, but the scaler now keeps the statistics of all four features instead
# of only the last one and can be reused (transform / inverse_transform).
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split below; fit it on the training split only if test-set
# leakage matters for the evaluation.
scaler = StandardScaler()

feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
df4[feature_cols] = scaler.fit_transform(df4[feature_cols])

In [83]:
# Separate the species label from the measurement columns.
target = 'species'
y = df4[target]
X = df4.drop(columns=target)

# Hold out 20% of the rows for testing, stratified on y, with a fixed seed.
split = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = split
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(120, 4) (30, 4)
(120,) (30,)


In [84]:
# Logistic regression with default hyperparameters, fitted on the training split.
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [85]:
# Predict the test split and report the accuracy.
pred = lr.predict(X_test)
acc = accuracy_score(y_test, pred)

print(acc)

0.9666666666666667


**분류 예측 확률 확인**

In [87]:
# Class-membership probabilities for each test sample (one column per class).
proba = lr.predict_proba(X_test)
proba[:6]  # show the first six rows

array([[9.68407744e-01, 3.15917530e-02, 5.02654555e-07],
       [2.47057766e-02, 8.65892734e-01, 1.09401490e-01],
       [9.86807442e-01, 1.31917882e-02, 7.70156575e-07],
       [1.24760709e-03, 5.45002721e-01, 4.53749672e-01],
       [9.73795400e-01, 2.62040565e-02, 5.43260921e-07],
       [2.35323447e-02, 9.01741613e-01, 7.47260420e-02]])

## 5. 랜덤 포레스트
- 다수의 의사결정 트리들을 배깅해 분류 또는 회귀 수행 앙상블 알고리즘
- 각 트리는 전체 학습 데이터 중 서로 다른 데이터를 샘플링 해 일부 데이터를 제외한 후 최적의 특징을 찾아 트리를 분기함
- 트리들이 서로 조금씩 다른 특성을 가져 일반화 성능 향상
- 기본 매개변수 설정만으로 좋은 결과 가능
- 랜덤 포레스트의 특성 중요도는 각 트리의 특성 중요도를 취합한 것
- 범주형 변수 - 레이블 인코딩

In [88]:
from sklearn.ensemble import RandomForestClassifier

In [89]:
# Re-inspect the preprocessed Titanic frame from section 1 before reusing it.
# info() prints its report and returns None, so it is called on its own line;
# combining it with head() in a tuple displays "(None, ...)" instead of a table.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Sex         891 non-null    int64  
 3   Age         891 non-null    float64
 4   Fare        891 non-null    float64
 5   Embarked    891 non-null    int64  
 6   FamilySize  891 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 48.9 KB


(None,
    Survived  Pclass  Sex   Age     Fare  Embarked  FamilySize
 0         0       3    1  22.0   7.2500         2           1
 1         1       1    0  38.0  71.2833         0           1
 2         1       3    0  26.0   7.9250         2           0
 3         1       1    0  35.0  53.1000         2           1
 4         0       3    1  35.0   8.0500         2           0)

In [90]:
# Separate the target column from the feature matrix.
target = 'Survived'
y = df[target]
X = df.drop(columns=target)

# Hold out 20% of the rows for testing, stratified on y, with a fixed seed.
split = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = split
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(712, 6) (179, 6)
(712,) (179,)


In [92]:
# Forest of 100 trees, each limited to depth 4, with a fixed random seed.
rf = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0)
rf.fit(X_train, y_train)

In [93]:
# Predict the test split and report the accuracy.
pred = rf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(acc)

0.8044692737430168
