# 타이타닉

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score

In [3]:
# 경고 메세지 감춤
import warnings
warnings.filterwarnings('ignore')

In [4]:
titanic = pd.read_csv('../data/titanic.csv')
titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S


In [5]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1310 entries, 0 to 1309
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   float64
 1   survived  1309 non-null   float64
 2   name      1309 non-null   object 
 3   sex       1309 non-null   object 
 4   age       1046 non-null   float64
 5   sibsp     1309 non-null   float64
 6   parch     1309 non-null   float64
 7   ticket    1309 non-null   object 
 8   fare      1308 non-null   float64
 9   cabin     295 non-null    object 
 10  embarked  1307 non-null   object 
dtypes: float64(6), object(5)
memory usage: 112.7+ KB


In [6]:
titanic.ticket.value_counts()

CA. 2343    11
1601         8
CA 2144      8
PC 17608     7
347077       7
            ..
373450       1
2223         1
350046       1
3101281      1
315082       1
Name: ticket, Length: 929, dtype: int64

# 데이터 전처리 
1. cabin(결측치↑), ticket(범주↑) 삭제
2. titles(추출 → 컬럼 생성 → 숫자형)
3. 결측치
4. sex, embarked -> 숫자형

In [7]:
# 1. cabin(결측치↑), ticket(범주↑) 삭제
titanic.drop('cabin', axis=1, inplace=True)
titanic.drop('ticket', axis=1, inplace=True)

In [8]:
# 2. titles

# 추출 → 컬럼 생성
fmt = ' ([A-Za-z]+)\.'
titanic['titles'] = titanic.name.str.extract(fmt)
titanic.drop('name', axis=1, inplace=True)

titanic.titles.value_counts()

# 컬럼 생성 → 숫자형
encoder = LabelEncoder()

titles = titanic.titles
encoder.fit(titles)
titanic['Titles'] = encoder.transform(titles)

In [9]:
# age 결측치 대체
age_mean = np.mean(titanic.age)
titanic.age.fillna(age_mean, inplace=True)

# 결측치 삭제
titanic.dropna(inplace=True)

In [10]:
# 3. sex, embarked -> 숫자형

# 성별
encoder = LabelEncoder()

gender = titanic.sex
encoder.fit(gender)
titanic['gender'] = encoder.transform(gender)

# 승선위치
encoder = LabelEncoder()

embarked = titanic.embarked
encoder.fit(embarked)
titanic['Embarked'] = encoder.transform(embarked)

In [11]:
# 확인
titanic.loc[:, ['sex', 'gender', 'embarked', 'Embarked', 'titles', 'Titles']]

Unnamed: 0,sex,gender,embarked,Embarked,titles,Titles
0,female,0,S,2,Miss,10
1,male,1,S,2,Master,9
2,female,0,S,2,Miss,10
3,male,1,S,2,Mr,13
4,female,0,S,2,Mrs,14
...,...,...,...,...,...,...
1304,female,0,C,0,Miss,10
1305,female,0,C,0,Miss,10
1306,male,1,C,0,Mr,13
1307,male,1,C,0,Mr,13


# 데이터 분할

In [12]:
# feature, target 추출
data = titanic.iloc[:, [0, 3, 4, 5, 9, 10, 11]]
target = titanic.survived

In [13]:
data.head()

Unnamed: 0,pclass,age,sibsp,parch,Titles,gender,Embarked
0,1.0,29.0,0.0,0.0,10,0,2
1,1.0,0.9167,1.0,2.0,9,1,2
2,1.0,2.0,1.0,2.0,10,0,2
3,1.0,30.0,1.0,2.0,13,1,2
4,1.0,25.0,1.0,2.0,14,0,2


# 데이터 분할

In [14]:
xtrain, xtest, ytrain, ytest = train_test_split(data, target, test_size=0.3,
                     random_state=2208241055)

# 데이터 스케일링

In [39]:
ss = StandardScaler()

xtrain_scaled = ss.fit_transform(xtrain)    # 훈련 데이터 → 표준화
xtest_scaled = ss.transform(xtest)          # 테스트 데이터 → 표준화

In [38]:
pd.DataFrame(xtrain_scaled).describe()

Unnamed: 0,0,1,2,3,4,5,6
count,914.0,914.0,914.0,914.0,914.0,914.0,914.0
mean,-5.053094e-17,-3.498296e-17,5.441793000000001e-17,-2.526547e-17,3.78982e-16,-9.717487999999999e-19,-1.9434980000000002e-17
std,1.000547,1.000547,1.000547,1.000547,1.000547,1.000547,1.000547
min,-1.541324,-2.322018,-0.4777601,-0.4632424,-6.203217,-1.352672,-1.852319
25%,-0.3464384,-0.6033679,-0.4777601,-0.4632424,-1.25388,-1.352672,-0.6165383
50%,0.8484472,0.0170108,-0.4777601,-0.4632424,0.3958988,0.7392776,0.6192424
75%,0.8484472,0.4199522,0.461321,-0.4632424,0.3958988,0.7392776,0.6192424
max,0.8484472,3.489912,7.034889,9.976844,2.045678,0.7392776,0.6192424


In [31]:
pd.DataFrame(xtest_scaled).describe()

Unnamed: 0,0,1,2,3,4,5,6
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,0.025439,0.042445,-0.027384,-0.051912,-0.050215,-0.013184,-0.02702
std,1.000317,1.034057,0.928012,1.015606,1.094223,1.005232,1.022711
min,-1.541324,-2.256423,-0.47776,-0.463242,-6.753143,-1.352672,-1.852319
25%,-0.346438,-0.603368,-0.47776,-0.463242,-1.25388,-1.352672,-0.616538
50%,0.848447,0.017011,-0.47776,-0.463242,0.395899,0.739278,0.619242
75%,0.848447,0.419952,0.461321,-0.463242,0.395899,0.739278,0.619242
max,0.848447,3.962214,7.034889,9.976844,2.595604,0.739278,0.619242


# 모델 학습&평가

In [16]:
# 결정 트리
dtclf = DecisionTreeClassifier()    # 분류 모델 생성
dtclf.fit(xtrain, ytrain)         # 분류 모델 학습

ypred = dtclf.predict(xtest)        # 예측값
accuracy_score(ytest, ypred)        # 비교(예측값 - 분류 모델 학습)

0.7678571428571429

In [17]:
# 서포트 벡터
svclf = SVC(kernel='linear')    # 분류 모델 생성
svclf.fit(xtrain_scaled, ytrain)         # 분류 모델 학습

ypred = svclf.predict(xtest_scaled)        # 예측값
accuracy_score(ytest, ypred)        # 비교(예측값 - 분류 모델 학습)

0.7397959183673469

In [18]:
# 로지스틱 회귀
lrclf = LogisticRegression()    # 분류 모델 생성
lrclf.fit(xtrain_scaled, ytrain)         # 분류 모델 학습

ypred = lrclf.predict(xtest_scaled)        # 예측값
accuracy_score(ytest, ypred)        # 비교(예측값 - 분류 모델 학습)

0.7576530612244898

In [19]:
# 로지스틱 회귀(스케일링)
lrclf = LogisticRegression()    # 분류 모델 생성
lrclf.fit(xtrain_scaled, ytrain)         # 분류 모델 학습

ypred = lrclf.predict(xtest_scaled)        # 예측값
accuracy_score(ytest, ypred)        # 비교(예측값 - 분류 모델 학습)

0.7576530612244898

In [20]:
# KNN
knn = KNeighborsClassifier()    # 분류 모델 생성
knn.fit(xtrain, ytrain)         # 분류 모델 학습

ypred = knn.predict(xtest)        # 예측값
accuracy_score(ytest, ypred)        # 비교(예측값 - 분류 모델 학습)

0.7193877551020408

In [21]:
# KNN(스케일링) - 정확도↑
knn = KNeighborsClassifier()    # 분류 모델 생성
knn.fit(xtrain_scaled, ytrain)         # 분류 모델 학습

ypred = knn.predict(xtest_scaled)        # 예측값
accuracy_score(ytest, ypred)        # 비교(예측값 - 분류 모델 학습)

0.7653061224489796

# 교차검증

In [22]:
# 결정 트리 - 정확도↓
dtclf = DecisionTreeClassifier()
scores = cross_val_score(dtclf, xtrain, ytrain, cv=10, scoring='accuracy')

print(scores)
np.mean(scores)

[0.80434783 0.75       0.77173913 0.81521739 0.82417582 0.75824176
 0.73626374 0.63736264 0.81318681 0.73626374]


0.7646798853320592

In [23]:
# 서포트 벡터
svclf = SVC()
svclf = cross_val_score(svclf, xtrain_scaled, ytrain, cv=10, scoring='accuracy')

print(scores)
np.mean(scores)

[0.80434783 0.75       0.77173913 0.81521739 0.82417582 0.75824176
 0.73626374 0.63736264 0.81318681 0.73626374]


0.7646798853320592

In [24]:
# 로지스틱 회귀(스케일링) - 정확도↑
lrclf = LogisticRegression()
lrclf = cross_val_score(lrclf, xtrain_scaled, ytrain, cv=10, scoring='accuracy')

print(scores)
np.mean(scores)

[0.80434783 0.75       0.77173913 0.81521739 0.82417582 0.75824176
 0.73626374 0.63736264 0.81318681 0.73626374]


0.7646798853320592

In [25]:
# KNN(스케일링) - 정확도↑
knnclf = KNeighborsClassifier()
scores = cross_val_score(knnclf, xtrain_scaled, ytrain, cv=10, scoring='accuracy')

print(scores)
np.mean(scores)

[0.84782609 0.82608696 0.80434783 0.85869565 0.79120879 0.85714286
 0.76923077 0.74725275 0.84615385 0.86813187]


0.8216077400860009

# 정확도

- ## 결정 트리
표준화↓, 교차검증↓
- ## 로지스틱 회귀, 서포트 벡터
표준화(변화X), 교차검증↑
- ## KNN
표준화↑, 교차검증↑