# 의사결정 트리(Decision Tree)

In [494]:
import pandas as pd
import numpy as np
import sklearn
#의사결정나무 분류모델을 위한 패키지 임포트
from sklearn.tree import DecisionTreeClassifier
#학습 및 테스트 데이터셋 분리를 위한 패키지 임포트
from sklearn.model_selection import train_test_split

# Load the iris dataset from the seaborn-data repository and display it.
iris_csv_url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
df = pd.read_csv(iris_csv_url)
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [495]:
# Explore the data: print column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [496]:
# Encode the categorical target (species name) as an integer class code.
#
# The previous `Series.replace({...})` call depended on pandas silently
# downcasting the object column to int64 after replacement. That downcasting
# is deprecated (pandas emits a FutureWarning for it), and without it the
# column would stay `object`. Mapping each value explicitly and casting
# makes the resulting dtype independent of that behaviour.
species_codes = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
# `.get(name, name)` passes already-encoded values through unchanged, so
# re-running this cell is harmless; `astype('int64')` raises if an unmapped
# species name is still present instead of leaving a mixed column behind.
df['species'] = (
    df['species']
    .map(lambda name: species_codes.get(name, name))
    .astype('int64')
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


  df['species'] = df['species'].replace({'setosa':0, 'versicolor':1, 'virginica':2})


In [497]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [498]:
# Print the schema again (df.info() was already called right after the encoding step).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [499]:
# Prepare the modelling inputs: X holds the four measurement columns
# (independent variables), y holds the species column (dependent variable).
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = df[feature_columns]
y = df['species']

In [500]:
# Hold out 20% of the rows for testing (train:test = 8:2); the fixed
# random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

In [501]:
# Print the shapes of the four splits on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(120, 4) (30, 4) (120,) (30,)


In [502]:
# Model selection: a decision-tree classifier with a fixed seed.
dt = DecisionTreeClassifier(random_state=11)

# Train on the training split; the fitted estimator is the value the cell displays.
dt.fit(X=X_train, y=y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,11
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [503]:
# Predict the held-out rows, then print the actual and predicted labels.
pred = dt.predict(X_test)
for caption, shown in (('실제값:', y_test.values), ('예측값:', pred)):
    print(caption, shown)

실제값: [2 2 2 1 2 0 1 0 0 1 2 1 1 2 2 0 2 1 2 2 1 0 0 1 0 0 2 1 0 1]
예측값: [2 2 1 1 2 0 1 0 0 1 1 1 1 2 2 0 2 1 2 2 1 0 0 1 0 0 2 1 0 1]


In [504]:
# Overall accuracy plus the per-class precision / recall / F1 report.
import sklearn.metrics
acc = sklearn.metrics.accuracy_score(y_test, pred)
rpt = sklearn.metrics.classification_report(y_test, pred)
# sep=' \n ' reproduces the spacing of print(acc, '\n', rpt).
print(acc, rpt, sep=' \n ')

0.9333333333333333 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       0.83      1.00      0.91        10
           2       1.00      0.82      0.90        11

    accuracy                           0.93        30
   macro avg       0.94      0.94      0.94        30
weighted avg       0.94      0.93      0.93        30



In [505]:
# Macro-averaged precision, then macro-averaged recall.
for macro_metric in (sklearn.metrics.precision_score, sklearn.metrics.recall_score):
    print(macro_metric(y_test, pred, average='macro'))

0.9444444444444445
0.9393939393939394


In [506]:
# Confusion (error) matrix: rows are the actual classes, columns the predicted ones.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_test, y_pred=pred)

array([[ 9,  0,  0],
       [ 0, 10,  0],
       [ 0,  2,  9]])

In [507]:
# Load the Titanic passenger data, print its schema, then display the frame.
titanic_csv_url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df2 = pd.read_csv(titanic_csv_url)
df2.info()
df2

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [508]:
# Count the missing values in each column.
df2.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [509]:
# Fill missing Age values with the column mean, then cast to int
# (the cast drops the fractional part of every age).
age_mean = df2['Age'].mean()
age_filled = df2['Age'].fillna(age_mean)
df2['Age'] = age_filled.astype(int)
df2.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [510]:
# Fill missing Embarked values with the most frequent value (the mode).
em_mod = df2['Embarked'].mode().iloc[0]
df2['Embarked'] = df2['Embarked'].fillna(value=em_mod)
df2.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [511]:
# Convert the string-valued categorical columns (Sex, Embarked) to integer codes.
from sklearn.preprocessing import LabelEncoder

# Keep each fitted encoder in a variable instead of discarding it, so the
# code <-> label mapping stays available through `classes_` /
# `inverse_transform` (e.g. to encode new passengers consistently rather
# than hand-writing the integer codes).
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; OrdinalEncoder is the feature-side equivalent — consider switching.
# NOTE: re-running this cell refits the encoders on the already-encoded
# integers, which replaces the original string labels in `classes_`.
sex_encoder = LabelEncoder()
embarked_encoder = LabelEncoder()
df2['Sex'] = sex_encoder.fit_transform(df2['Sex'])
df2['Embarked'] = embarked_encoder.fit_transform(df2['Embarked'])
df2.info()
df2

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    int64  
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    int64  
dtypes: float64(1), int64(8), object(3)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22,1,0,A/5 21171,7.2500,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26,0,0,STON/O2. 3101282,7.9250,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35,1,0,113803,53.1000,C123,2
4,5,0,3,"Allen, Mr. William Henry",1,35,0,0,373450,8.0500,,2
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27,0,0,211536,13.0000,,2
887,888,1,1,"Graham, Miss. Margaret Edith",0,19,0,0,112053,30.0000,B42,2
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,29,1,2,W./C. 6607,23.4500,,2
889,890,1,1,"Behr, Mr. Karl Howell",1,26,0,0,111369,30.0000,C148,0


In [512]:
# FamilySize = SibSp + Parch (nothing is added for the passenger themself).
df2['FamilySize'] = df2['SibSp'].add(df2['Parch'])
df2.info()
df2

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    int64  
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    int64  
 12  FamilySize   891 non-null    int64  
dtypes: float64(1), int64(9), object(3)
memory usage: 90.6+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",1,22,1,0,A/5 21171,7.2500,,2,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38,1,0,PC 17599,71.2833,C85,0,1
2,3,1,3,"Heikkinen, Miss. Laina",0,26,0,0,STON/O2. 3101282,7.9250,,2,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35,1,0,113803,53.1000,C123,2,1
4,5,0,3,"Allen, Mr. William Henry",1,35,0,0,373450,8.0500,,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27,0,0,211536,13.0000,,2,0
887,888,1,1,"Graham, Miss. Margaret Edith",0,19,0,0,112053,30.0000,B42,2,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,29,1,2,W./C. 6607,23.4500,,2,3
889,890,1,1,"Behr, Mr. Karl Howell",1,26,0,0,111369,30.0000,C148,0,0


In [513]:
# Build the independent-variable table (X2) from the candidate feature columns.
candidate_features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize']
X2 = df2[candidate_features]
X2.info()
X2

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      891 non-null    int64  
 1   Sex         891 non-null    int64  
 2   Age         891 non-null    int64  
 3   Fare        891 non-null    float64
 4   Embarked    891 non-null    int64  
 5   FamilySize  891 non-null    int64  
dtypes: float64(1), int64(5)
memory usage: 41.9 KB


Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,3,1,22,7.2500,2,1
1,1,0,38,71.2833,0,1
2,3,0,26,7.9250,2,0
3,1,0,35,53.1000,2,1
4,3,1,35,8.0500,2,0
...,...,...,...,...,...,...
886,2,1,27,13.0000,2,0
887,1,0,19,30.0000,2,0
888,3,0,29,23.4500,2,3
889,1,1,26,30.0000,0,0


In [514]:
# Dependent variable (y2): the Survived column.
y2 = df2.loc[:, 'Survived']
y2

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [515]:
# Correlation-analysis helper
def correlation_analysis(X, y):
    """Rank the columns of ``X`` by absolute correlation with ``y``.

    ``y`` is attached to a copy of ``X`` as a ``'target'`` column, the
    correlation matrix is computed with ``DataFrame.corr()`` using its
    default settings, and the ``'target'`` column of that matrix is taken
    without its own self-correlation entry.

    Returns:
        A Series named ``'target'``, indexed by the column names of ``X``,
        holding absolute correlation values sorted from largest to smallest.
        ``X`` itself is left unmodified.
    """
    with_target = X.assign(target=y)
    target_corr = with_target.corr().loc[:, 'target']
    ranked = target_corr.drop(labels='target').abs()
    return ranked.sort_values(ascending=False)
# Rank the candidate features in X2 by absolute correlation with y2 and display the ranking.
correlation_results = correlation_analysis(X2, y2)
correlation_results

Sex           0.543351
Pclass        0.338481
Fare          0.257307
Embarked      0.167675
Age           0.067809
FamilySize    0.016639
Name: target, dtype: float64

In [516]:
# Narrow X2 down to the features chosen after the correlation analysis.
# NOTE(review): Fare ranked above Embarked in that analysis but is not kept
# here — confirm the selection is intentional.
selected_features = ['Sex', 'Pclass', 'Embarked']
X2 = df2[selected_features]

In [517]:
# Split into training and test sets (train:test = 8:2) with a fixed seed.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=11,
)
print(*(part.shape for part in (X_train2, X_test2, y_train2, y_test2)))
# Model selection: a decision-tree classifier with a fixed seed.
dt2 = DecisionTreeClassifier(random_state=11)
# Train on the training split; the fitted estimator is the value the cell displays.
dt2.fit(X=X_train2, y=y_train2)

(712, 3) (179, 3) (712,) (179,)


0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,11
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [518]:
# Predict the held-out passengers and print actual vs. predicted labels.
pred2 = dt2.predict(X_test2)
print('실제값:', y_test2.values)
print('예측값:', pred2)

# Compare the two label vectors element-wise as numpy arrays.
arr1 = np.array(y_test2.values)
arr2 = np.array(pred2)

# 1 where the prediction equals the actual label, 0 where it differs.
result = np.equal(arr1, arr2).astype(int)
print('결과값', result)
A = len(result)     # number of test rows
T = result.sum()    # number of correct predictions
F = A - T           # number of wrong predictions
perc = T / A        # fraction of correct predictions
print(f'정확도 : {perc*100:0.2f}%')

실제값: [1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0
 0 0 1 0 0 1 1 0 1 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 1 1 0 0 0 0 1
 0 0 1 0 1 1 1 1 1 0 1 1 0 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0
 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0
 1 0 0 1 0 0 1 1 0 0 0 1 0 1 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 0 1]
예측값: [0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0
 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1
 0 0 0 0 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 1 0 0 0 0 1 0 0
 1 0 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0]
결과값 [0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1
 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 0 1 1 1 1 1 1
 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1
 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [519]:
# Per-class report followed by the confusion matrix for the Titanic model.
rpt2 = sklearn.metrics.classification_report(y_test2, pred2)
conf_mat2 = confusion_matrix(y_test2, pred2)
print('report\n', rpt2)
print('confusion_matrix\n', conf_mat2)
# macro avg    = every class is treated equally (sensitive to class imbalance)
# weighted avg = classes with more samples weigh more (reflects the actual distribution)

report
               precision    recall  f1-score   support

           0       0.84      0.97      0.90       118
           1       0.91      0.64      0.75        61

    accuracy                           0.85       179
   macro avg       0.87      0.80      0.82       179
weighted avg       0.86      0.85      0.85       179

confusion_matrix
 [[114   4]
 [ 22  39]]


In [520]:
# Predict survival for new passengers.
# The values are integer codes, so they must follow the same encoding that
# was applied to the training columns.
new_passenger = pd.DataFrame(
    {
        'Sex': [1, 0, 1],
        'Pclass': [1, 3, 2],
        'Embarked': [0, 1, 0],
    }
)
pred3 = dt2.predict(new_passenger)
# Report each passenger (numbered from 1): prediction 1 -> survived, otherwise died.
for i, p3 in enumerate(pred3, start=1):
    print(f"승객 {i}번 : 생존" if p3 == 1 else f"승객 {i}번 : 사망")

승객 1번 : 사망
승객 2번 : 생존
승객 3번 : 사망
