In [811]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn

# 의사결정나무
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [812]:
# 1. Load the data: Titanic passenger records fetched as CSV over HTTP.
TITANIC_CSV_URL = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(TITANIC_CSV_URL)

In [813]:
# 2. Inspect the data: preview the first rows.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [814]:
# Show the frame dimensions as (rows, columns).
df.shape

(891, 12)

In [815]:
# 3. Check for missing values: per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [816]:
# 4. Summary statistics.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [817]:
# 5. Missing-value preprocessing

# Replace missing ages with the column mean.
# The filled Series is assigned back to the frame rather than calling
# fillna(inplace=True) on the df['Age'] selection: that chained in-place form
# emits a FutureWarning on pandas 2.x and does not update `df` at all once
# Copy-on-Write is enabled.
d_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(d_mean)

In [818]:
# Replace missing embarkation codes with the most frequent value.
# mode() returns a Series, so [0] picks its first entry.
# Assign the result back instead of using the chained fillna(inplace=True),
# which warns on pandas 2.x and is a no-op under Copy-on-Write.
d_mode = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(d_mode)

In [819]:
# Display the most frequent Embarked value.
df['Embarked'].mode()[0]

'S'

In [820]:
# Re-check the non-null counts after filling Age and Embarked.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [821]:
# Count the rows per embarkation code.
df['Embarked'].value_counts()

Embarked
S    646
C    168
Q     77
Name: count, dtype: int64

In [822]:
# Number of missing Embarked values still present.
df['Embarked'].isnull().sum()

0

In [823]:
# Inspect the Cabin column before deciding how to handle it.
df['Cabin']

0       NaN
1       C85
2       NaN
3      C123
4       NaN
       ... 
886     NaN
887     B42
888     NaN
889    C148
890     NaN
Name: Cabin, Length: 891, dtype: object

In [824]:
# Remove the Cabin column from the frame.
# Reassigning with errors='ignore' keeps this cell idempotent: re-running it
# after Cabin is already gone no longer raises a KeyError.
df = df.drop(columns='Cabin', errors='ignore')

In [825]:
# Confirm the column list and non-null counts after dropping Cabin.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [826]:
# Inspect the Sex column before encoding it.
df['Sex']

0        male
1      female
2      female
3      female
4        male
        ...  
886      male
887    female
888    female
889      male
890      male
Name: Sex, Length: 891, dtype: object

In [827]:
# Inspect the Embarked column before encoding it.
df['Embarked']

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [828]:
# Encode the categorical columns Sex and Embarked two ways:
#   * label encoding   -> integer columns 'le_sex' and 'le_embarked'
#   * one-hot encoding -> one indicator column per category value
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()  # refit for each column by fit_transform below

df['le_sex'] = le.fit_transform(df['Sex'])
df['le_embarked'] = le.fit_transform(df['Embarked'])

onehot_sex = pd.get_dummies(df['Sex'])
onehot_embarked = pd.get_dummies(df['Embarked'])

# Drop indicator columns left over from a previous run of this cell before
# concatenating; otherwise every re-run appends duplicate 'female', 'male',
# 'C', 'Q', 'S' columns and later df[[...]] selections return duplicates.
dummy_cols = [*onehot_sex.columns, *onehot_embarked.columns]
df = pd.concat(
    [df.drop(columns=dummy_cols, errors='ignore'), onehot_sex, onehot_embarked],
    axis=1,
)

In [829]:
# List the columns after adding the encoded features.
df.columns


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked', 'le_sex', 'le_embarked',
       'female', 'male', 'C', 'Q', 'S'],
      dtype='object')

In [830]:
# Derived feature: FamilySize is the element-wise sum of SibSp and Parch.
df['FamilySize'] = df['SibSp'].add(df['Parch'])
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,le_sex,le_embarked,female,male,C,Q,S,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,S,1,2,False,True,False,False,True,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C,0,0,True,False,True,False,False,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,S,0,2,True,False,False,False,True,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,S,0,2,True,False,False,False,True,1
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,S,1,2,False,True,False,False,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,S,1,2,False,True,False,False,True,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,S,0,2,True,False,False,False,True,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,S,0,2,True,False,False,False,True,3
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C,1,0,False,True,True,False,False,0


In [831]:
# List the columns after adding FamilySize.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked', 'le_sex', 'le_embarked',
       'female', 'male', 'C', 'Q', 'S', 'FamilySize'],
      dtype='object')

In [832]:
# 6. Prepare the modelling data
# x: independent variables (features), y: dependent variable (target)

# Alternative feature set using the label-encoded columns:
#   ['PassengerId', 'Age', 'Fare', 'le_sex', 'le_embarked', 'FamilySize']
# NOTE(review): PassengerId looks like a row identifier rather than a
# passenger attribute -- confirm it is meant to be a feature.
dt_feature_cols = ['PassengerId', 'Age', 'Fare', 'female', 'male',
                   'C', 'Q', 'S', 'FamilySize']
x = df[dt_feature_cols]
y = df['Survived']

In [833]:
# Preview the first five feature rows.
x[:5]

Unnamed: 0,PassengerId,Age,Fare,female,male,C,Q,S,FamilySize
0,1,22.0,7.25,False,True,False,False,True,1
1,2,38.0,71.2833,True,False,True,False,False,1
2,3,26.0,7.925,True,False,False,False,True,0
3,4,35.0,53.1,True,False,False,False,True,1
4,5,35.0,8.05,False,True,False,False,True,0


In [834]:
# Preview the first five target values.
y[:5]

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [835]:
# 7. Split into train/test sets (80% / 20%) with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [836]:
# Print the shape of each split, in the order train/test features then train/test targets.
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)

(712, 9)
(179, 9)
(712,)
(179,)


모델링

In [837]:
# Model 1) Decision tree

# 8. Train the model on the training split (seeded for repeatable trees).
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X=x_train, y=y_train)

In [838]:
# 9. Predict with the trained model.
# Predict once and keep the result; the earlier extra dt.predict(x_test) call
# computed the same array and discarded it.
pred = dt.predict(x_test)
pred

array([0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0])

In [839]:
# 10. Model performance: accuracy on the test split.
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_true=y_test, y_pred=pred)
print(acc)

# Recorded results:
#   label encoding   : 73.18% (marked "(v)" by the author)
#   one-hot encoding : 70.94%

0.7094972067039106


In [840]:
# 모델 2) Support Vector Machine

from sklearn import svm

In [841]:
# List the available columns before choosing the SVM features.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked', 'le_sex', 'le_embarked',
       'female', 'male', 'C', 'Q', 'S', 'FamilySize'],
      dtype='object')

In [842]:
# Prepare the modelling data for the SVM.
# x: independent variables (features), y: dependent variable (target)

svm_feature_cols = ['PassengerId', 'Age', 'Fare', 'female', 'male', 'C', 'Q', 'S']
x = df[svm_feature_cols]
y = df['Survived']

In [843]:
# Split into train/test sets (70% / 30%) with a fixed seed.

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [844]:
# Print the shape of each split, in the order train/test features then train/test targets.
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)

(623, 8)
(268, 8)
(623,)
(268,)


In [845]:
# Create and train the SVM classifier with an RBF kernel.
# The linear kernel was tried as an alternative: svm.SVC(kernel='linear')
# NOTE(review): the features are passed in unscaled (PassengerId and Fare have
# much larger ranges than the indicator columns); SVMs are sensitive to feature
# scale, so this may explain the low RBF accuracy -- consider a StandardScaler.
svc = svm.SVC(kernel='rbf')
svc.fit(X=x_train, y=y_train)

In [846]:
# Predict the test split with the fitted SVM.
pred = svc.predict(x_test)

In [847]:
# Accuracy of the SVM predictions on the test split.
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_true=y_test, y_pred=pred)
print(acc)

# Recorded results:
#   linear kernel : 79.10%
#   rbf kernel    : 60.44%

0.6044776119402985


In [848]:
# Model evaluation: confusion matrix of true vs. predicted labels.

from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_true=y_test, y_pred=pred)
print(cm)

[[152   5]
 [101  10]]


In [849]:
# Per-class precision / recall / f1 summary for the SVM predictions.
rpt = classification_report(y_true=y_test, y_pred=pred)
print(rpt)

              precision    recall  f1-score   support

           0       0.60      0.97      0.74       157
           1       0.67      0.09      0.16       111

    accuracy                           0.60       268
   macro avg       0.63      0.53      0.45       268
weighted avg       0.63      0.60      0.50       268



In [850]:
# 모델 3) RandomForest
from sklearn.ensemble import RandomForestClassifier

In [851]:
# Prepare the modelling data: list the available columns first.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked', 'le_sex', 'le_embarked',
       'female', 'male', 'C', 'Q', 'S', 'FamilySize'],
      dtype='object')

In [852]:
# Feature/target selection for the random forest.
# Alternative feature set using the label-encoded columns:
#   ['PassengerId', 'le_sex', 'le_embarked', 'FamilySize']
rf_feature_cols = ['PassengerId', 'female', 'male', 'C', 'Q', 'S', 'FamilySize']
x = df[rf_feature_cols]
y = df['Survived']

In [853]:
# Display the feature frame and the target series together as a tuple.
# NOTE(review): this prints both objects in full; x.head() / y.head() would be lighter.
x, y

(     PassengerId  female   male      C      Q      S  FamilySize
 0              1   False   True  False  False   True           1
 1              2    True  False   True  False  False           1
 2              3    True  False  False  False   True           0
 3              4    True  False  False  False   True           1
 4              5   False   True  False  False   True           0
 ..           ...     ...    ...    ...    ...    ...         ...
 886          887   False   True  False  False   True           0
 887          888    True  False  False  False   True           0
 888          889    True  False  False  False   True           3
 889          890   False   True   True  False  False           0
 890          891   False   True  False   True  False           0
 
 [891 rows x 7 columns],
 0      0
 1      1
 2      1
 3      1
 4      0
       ..
 886    0
 887    1
 888    0
 889    1
 890    0
 Name: Survived, Length: 891, dtype: int64)

In [854]:
# Split into train/test sets (80% / 20%) with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [855]:
# Print the shape of each split, in the order train/test features then train/test targets.
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)


(712, 7)
(179, 7)
(712,)
(179,)


In [856]:
# Create the random forest: 50 trees, each limited to depth 3, fixed seed.
rf_params = {'n_estimators': 50, 'max_depth': 3, 'random_state': 42}
rf = RandomForestClassifier(**rf_params)

In [857]:
# Train the random forest on the training split.
rf.fit(X=x_train, y=y_train)

In [858]:
# Predict the test split with the fitted random forest and display the labels.
pred = rf.predict(x_test)
pred

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1])

In [859]:
# Model comparison: accuracy of the random forest on the test split.

acc = accuracy_score(y_true=y_test, y_pred=pred)
print(acc)
# Recorded results:
#   label encoding   : acc = 79.8%
#   one-hot encoding : acc = 79.3%

0.7932960893854749


In [860]:
# 모델 4) K 최근접 이웃 (KNN)

from sklearn.neighbors import KNeighborsClassifier

In [861]:
# Load the iris data set as CSV over HTTP; from here on `df` refers to this
# frame instead of the Titanic data.
IRIS_CSV_URL = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
df = pd.read_csv(IRIS_CSV_URL)

In [862]:
# Display the loaded iris frame.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [863]:
# Show the frame dimensions as (rows, columns).
df.shape

(150, 5)

In [864]:
# Per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [865]:
# Summary statistics.
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [866]:
# List the column names.
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [867]:
# Min-max normalise each independent variable.
# MinMaxScaler scales every column independently, so one fit over all four
# measurement columns yields the same values as four separate fits. It also
# leaves `scaler` fitted on every feature, instead of only on the last column.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split, so test rows influence the scaling -- confirm this is acceptable.
from sklearn.preprocessing import MinMaxScaler

measurement_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

scaler = MinMaxScaler()
df[measurement_cols] = scaler.fit_transform(df[measurement_cols])


In [868]:
# Replace the species names with integer class codes.
species_encoder = LabelEncoder()
df['species'] = species_encoder.fit_transform(df['species'])

In [869]:
# Preview the first rows after scaling and label encoding.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0.222222,0.625,0.067797,0.041667,0
1,0.166667,0.416667,0.067797,0.041667,0
2,0.111111,0.5,0.050847,0.041667,0
3,0.083333,0.458333,0.084746,0.041667,0
4,0.194444,0.666667,0.067797,0.041667,0


In [870]:
# Prepare the modelling data: list the available columns first.

df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [871]:
# Features are the four flower measurements; the target is the encoded species.
iris_feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
x = df[iris_feature_cols]
y = df['species']

In [872]:
# Split into train/test sets (80% / 20%) with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [873]:
# Print the shape of each split, in the order train/test features then train/test targets.
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)


(120, 4)
(30, 4)
(120,)
(30,)


모델링

In [874]:
# K-nearest-neighbours classifier using the 3 closest training points.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X=x_train, y=y_train)

In [875]:
# Predict the test split with the fitted KNN model.
pred = knn.predict(x_test)

In [876]:
# First five true labels of the test split, for comparison with the predictions.
y_test[:5]

73     1
18     0
118    2
78     1
76     1
Name: species, dtype: int64

In [877]:
# First five predicted labels.
pred[:5]

array([1, 0, 2, 1, 1])

In [878]:
# Accuracy of the KNN predictions on the test split.
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(acc)

1.0


In [879]:
# Additional analysis
# Model evaluation: confusion matrix of true vs. predicted labels.
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=pred)

array([[10,  0,  0],
       [ 0,  9,  0],
       [ 0,  0, 11]])

In [880]:
# Per-class precision / recall / f1 summary for the KNN predictions.
from sklearn.metrics import classification_report

rpt = classification_report(y_true=y_test, y_pred=pred)
print(rpt)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [881]:
# 모델 5) 로지스틱 회귀분석 (Logistic Regression)

from sklearn.linear_model import LogisticRegression

In [882]:
# Display the current iris frame.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0.222222,0.625000,0.067797,0.041667,0
1,0.166667,0.416667,0.067797,0.041667,0
2,0.111111,0.500000,0.050847,0.041667,0
3,0.083333,0.458333,0.084746,0.041667,0
4,0.194444,0.666667,0.067797,0.041667,0
...,...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667,2
146,0.555556,0.208333,0.677966,0.750000,2
147,0.611111,0.416667,0.711864,0.791667,2
148,0.527778,0.583333,0.745763,0.916667,2


In [883]:
# Create the LogisticRegression estimator with its default settings.
lr = LogisticRegression()

In [884]:
# Train the model.
# x_train / y_train are not redefined in this section; they are whatever the
# most recent train_test_split cell produced.
lr.fit(x_train, y_train)

In [885]:
# Predict the test split with the fitted logistic regression model.
pred = lr.predict(x_test)

In [886]:
# Model performance: accuracy on the test split.
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(acc)
# Recorded accuracy: 96.6%

0.9666666666666667
