# 타이타닉 생존자 예측 경진대회 모델링

# 1. 라이브러리 임포트
- 가장 먼저, 기본적인 라이브러리를 불러옵니다.

In [48]:
# 데이터 분석을 위한 라이브러리
import numpy as np
import pandas as pd

# 2. 데이터 불러오기
- `pd.read_csv` : CSV 파일을 읽어 DataFrame으로 불러오는 함수

In [49]:
# Load the training set, the test set, and the sample submission file (in that order).
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'C:\\vscode\\kaggle\\dataset\\titanic_dat\\train.csv',
        'C:\\vscode\\kaggle\\dataset\\titanic_dat\\test.csv',
        'C:\\vscode\\kaggle\\dataset\\titanic_dat\\gender_submission.csv',
    )
)

In [50]:
# Display the raw training DataFrame
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [51]:
# Display the raw test DataFrame (same columns as train, without Survived)
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


# 3. 피처 엔지니어링

#### **<font color='orange'>PassengerId는 순번을 나타내는 값으로 아무 의미 없는 데이터이므로, 모델 훈련할 때는 제거</font>**

In [52]:
# Remove the PassengerId column from both frames (column-wise drop).
train, test = (frame.drop(columns='PassengerId') for frame in (train, test))

In [53]:
# Preview the first rows of train to confirm PassengerId is gone
train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### **<font color='orange'>Fare의 결측값은, Pclass를 기준으로 그룹화하여 Fare의 평균으로 대체 </font>**

In [54]:
# Number of test rows with a missing Fare
test['Fare'].isna().sum()

1

In [55]:
# Show the test rows whose Fare is missing
test.loc[test['Fare'].isnull()]

# Row/column access: loc selects by label, iloc selects by integer position

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [56]:
# Mean Fare of the test set for each passenger class
test.groupby('Pclass')[['Fare']].mean()

Unnamed: 0_level_0,Fare
Pclass,Unnamed: 1_level_1
1,94.280297
2,22.202104
3,12.459678


In [57]:
# Fill each missing Fare in the test set with the mean Fare of that passenger's Pclass.
# The value is computed from the data instead of being hand-copied and rounded, so it
# stays correct if the data changes. transform('mean') skips NaN values and returns a
# Series aligned with test's index, which lets fillna pick the right class mean per row.
pclass_mean_fare = test.groupby('Pclass')['Fare'].transform('mean')
test['Fare'] = test['Fare'].fillna(pclass_mean_fare)

In [58]:
# Re-check: this selection should now be empty (no missing Fare left in test)
test[test['Fare'].isna()]

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


#### **<font color='orange'>Name에서 Title 추출</font>**
#### **<font color='orange'>그다음 Name 피처 삭제</font>**

In [59]:
# Extract the honorific from Name: the run of letters immediately before a period.
# The pattern is a raw string so that `\.` reaches the regex engine as written instead
# of being treated as an invalid Python string escape (which emits a warning).
# expand=False makes str.extract return a Series for the single capture group.
title_pattern = r'([A-Za-z]+)\.'
train['Title'] = train['Name'].str.extract(title_pattern, expand=False)
test['Title'] = test['Name'].str.extract(title_pattern, expand=False)

In [60]:
# There are too many distinct titles, so rare ones are folded into a few groups.
# Titles that are not keys of this mapping are left unchanged.
title_map = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr', 'Sir': 'Mr', 'Capt': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other', 'Don': 'Other', 'Dona': 'Other',
}

# Apply the identical mapping to train and test.
train['Title'] = train['Title'].replace(title_map)
test['Title'] = test['Title'].replace(title_map)

In [61]:
# Name is no longer needed once Title has been derived from it.
train = train.drop(columns=['Name'])
test = test.drop(columns=['Name'])

#### **<font color='orange'> 우선, Age 결측값 처리 필요</font>**
#### **<font color='orange'>Age_Group 피처를 만들고, 기존에 있던 Age 피처는 제거</font>**

In [62]:
# Mean Age of the training set for each title group
train.groupby('Title')[['Age']].mean()

Unnamed: 0_level_0,Age
Title,Unnamed: 1_level_1
Master,4.574167
Miss,21.86
Mr,32.721814
Mrs,35.981818
Other,45.3


In [63]:
# Fill missing ages in train with a fixed value per title
# (the per-title mean age of train, rounded to a whole number).
age_by_title = {'Master': 5, 'Miss': 22, 'Mr': 33, 'Mrs': 36, 'Other': 45}
for title, fill_age in age_by_title.items():
    missing_for_title = train['Age'].isnull() & (train['Title'] == title)
    train.loc[missing_for_title, 'Age'] = fill_age

# Rows targeted: Age is missing, matched against each of the 5 title groups

In [64]:
# Fill missing ages in test with the same fixed per-title values used for train.
age_by_title = {'Master': 5, 'Miss': 22, 'Mr': 33, 'Mrs': 36, 'Other': 45}
for title, fill_age in age_by_title.items():
    missing_for_title = test['Age'].isnull() & (test['Title'] == title)
    test.loc[missing_for_title, 'Age'] = fill_age

In [65]:
# Age bin edges; pd.cut's default intervals are left-open / right-closed, e.g. (24, 35].
bins = [0, 5, 12, 18, 24, 35, 60, np.inf]
# One label per interval, in the same order as the edges above.
labels = ['Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']

# Add the categorical Age_Group column to both frames.
for frame in (train, test):
    frame['Age_Group'] = pd.cut(frame['Age'], bins=bins, labels=labels)

In [66]:
# Age_Group replaces the numeric Age column.
train, test = (frame.drop(columns='Age') for frame in (train, test))

In [67]:
# Display train after adding Title / Age_Group and dropping Name / Age
train

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Age_Group
0,0,3,male,1,0,A/5 21171,7.2500,,S,Mr,Student
1,1,1,female,1,0,PC 17599,71.2833,C85,C,Mrs,Adult
2,1,3,female,0,0,STON/O2. 3101282,7.9250,,S,Miss,Young Adult
3,1,1,female,1,0,113803,53.1000,C123,S,Mrs,Young Adult
4,0,3,male,0,0,373450,8.0500,,S,Mr,Young Adult
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,0,0,211536,13.0000,,S,Other,Young Adult
887,1,1,female,0,0,112053,30.0000,B42,S,Miss,Student
888,0,3,female,1,2,W./C. 6607,23.4500,,S,Miss,Student
889,1,1,male,0,0,111369,30.0000,C148,C,Mr,Young Adult


#### **<font color='orange'>Ticket 피처는 제거</font>**

In [68]:
# Ticket is not used as a feature.
train = train.drop(columns=['Ticket'])
test = test.drop(columns=['Ticket'])

#### **<font color='orange'>Fare_Group으로 피처를 그룹화한 뒤, 기존에 있던 Fare 피처는 제거</font>**

In [69]:
# Split Fare into 4 categories.
bins = [-np.inf, 8, 14, 31, np.inf]  # bin edges
labels = ['low', 'medium', 'high', 'super-high']  # one label per interval

# Add the categorical Fare_Group column to both frames.
for frame in (train, test):
    frame['Fare_Group'] = pd.cut(frame['Fare'], bins=bins, labels=labels)

# Question: once a grouped feature exists, must the original feature be removed?
# -> multicollinearity
# -> drop features that are highly correlated with each other

In [70]:
# Fare_Group replaces the numeric Fare column.
train, test = (frame.drop(columns='Fare') for frame in (train, test))

#### **<font color='orange'>결측값이 너무 많아 Cabin 피처는 제거하는 게 바람직하다.</font>**

In [71]:
# Cabin is removed from both frames.
train = train.drop(columns=['Cabin'])
test = test.drop(columns=['Cabin'])

#### **<font color='orange'>Embarked 피처의 결측값을 S로 대체하는 게 적절하다.</font>**

In [72]:
# Replace missing Embarked values with 'S' in both frames.
for frame in (train, test):
    frame['Embarked'] = frame['Embarked'].fillna('S')

In [73]:
# This is the step of cleaning the raw data so that modeling works as well as possible
# Decide on a strategy during EDA, then resolve the items one by one
train

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,Title,Age_Group,Fare_Group
0,0,3,male,1,0,S,Mr,Student,low
1,1,1,female,1,0,C,Mrs,Adult,super-high
2,1,3,female,0,0,S,Miss,Young Adult,low
3,1,1,female,1,0,S,Mrs,Young Adult,super-high
4,0,3,male,0,0,S,Mr,Young Adult,medium
...,...,...,...,...,...,...,...,...,...
886,0,2,male,0,0,S,Other,Young Adult,medium
887,1,1,female,0,0,S,Miss,Student,high
888,0,3,female,1,2,S,Miss,Student,high
889,1,1,male,0,0,C,Mr,Young Adult,high


#### **<font color='orange'>머신러닝 모델은 숫자값만 인식하므로, 문자열은 숫자로 바꿔주기(데이터 인코딩)</font>**

In [74]:
from sklearn.preprocessing import OrdinalEncoder

# Columns holding string / categorical values that must be turned into numbers.
string_features = ['Sex', 'Embarked', 'Title', 'Age_Group', 'Fare_Group']

# Build the encoder and learn the category -> code mapping from the training data only.
ordinal_encoder = OrdinalEncoder().fit(train[string_features])

# NOTE(review): with default settings the codes follow the sorted category names, not
# the bin order (the encoded output shows Age_Group 'Adult' -> 0.0, 'Student' -> 4.0).
# Pass `categories=` to the encoder if the natural order should be preserved.

# Apply the same fitted mapping to train and test.
for frame in (train, test):
    frame[string_features] = ordinal_encoder.transform(frame[string_features])

In [75]:
# Display train after encoding (string features are now float codes)
train

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,Title,Age_Group,Fare_Group
0,0,3,1.0,1,0,2.0,2.0,4.0,1.0
1,1,1,0.0,1,0,0.0,3.0,0.0,3.0
2,1,3,0.0,0,0,2.0,1.0,6.0,1.0
3,1,1,0.0,1,0,2.0,3.0,6.0,3.0
4,0,3,1.0,0,0,2.0,2.0,6.0,2.0
...,...,...,...,...,...,...,...,...,...
886,0,2,1.0,0,0,2.0,4.0,6.0,2.0
887,1,1,0.0,0,0,2.0,1.0,4.0,0.0
888,0,3,0.0,1,2,2.0,1.0,4.0,0.0
889,1,1,1.0,0,0,0.0,2.0,6.0,0.0


In [76]:
# Display test after encoding
test
# train vs. test: they differ in whether the target (Survived) column is present

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,Title,Age_Group,Fare_Group
0,3,1.0,0,0,1.0,2.0,6.0,1.0
1,3,0.0,1,0,2.0,3.0,0.0,1.0
2,2,1.0,0,0,1.0,2.0,3.0,2.0
3,3,1.0,0,0,2.0,2.0,6.0,2.0
4,3,0.0,1,1,2.0,3.0,4.0,2.0
...,...,...,...,...,...,...,...,...
413,3,1.0,0,0,2.0,2.0,6.0,2.0
414,1,0.0,0,0,0.0,4.0,0.0,3.0
415,3,1.0,0,0,2.0,2.0,0.0,1.0
416,3,1.0,0,0,2.0,2.0,6.0,2.0


# 4. 모델링

In [77]:
# Separate the target column from the feature columns.
y_train = train['Survived']                  # target
X_train = train.drop(columns=['Survived'])   # features

In [78]:
# Display the feature matrix used for training
X_train

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,Title,Age_Group,Fare_Group
0,3,1.0,1,0,2.0,2.0,4.0,1.0
1,1,0.0,1,0,0.0,3.0,0.0,3.0
2,3,0.0,0,0,2.0,1.0,6.0,1.0
3,1,0.0,1,0,2.0,3.0,6.0,3.0
4,3,1.0,0,0,2.0,2.0,6.0,2.0
...,...,...,...,...,...,...,...,...
886,2,1.0,0,0,2.0,4.0,6.0,2.0
887,1,0.0,0,0,2.0,1.0,4.0,0.0
888,3,0.0,1,2,2.0,1.0,4.0,0.0
889,1,1.0,0,0,0.0,2.0,6.0,0.0


In [87]:
# Display the target Series (Survived)
y_train

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [79]:
from sklearn.ensemble import RandomForestClassifier

# Create the random forest model; random_state is fixed so reruns are reproducible
randomforest = RandomForestClassifier(random_state=42)

In [80]:
# Train the model on the training features and target
randomforest.fit(X_train, y_train)

# 5. 예측

In [81]:
# Predict the target for the test data
# (at this point test has the same feature columns, in the same order, as X_train)
y_pred = randomforest.predict(test)

In [88]:
# Display the predicted labels (0 / 1) for the test rows
y_pred

array([0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [82]:
# Display the test features that were passed to predict
test

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,Title,Age_Group,Fare_Group
0,3,1.0,0,0,1.0,2.0,6.0,1.0
1,3,0.0,1,0,2.0,3.0,0.0,1.0
2,2,1.0,0,0,1.0,2.0,3.0,2.0
3,3,1.0,0,0,2.0,2.0,6.0,2.0
4,3,0.0,1,1,2.0,3.0,4.0,2.0
...,...,...,...,...,...,...,...,...
413,3,1.0,0,0,2.0,2.0,6.0,2.0
414,1,0.0,0,0,0.0,4.0,0.0,3.0
415,3,1.0,0,0,2.0,2.0,0.0,1.0
416,3,1.0,0,0,2.0,2.0,6.0,2.0


In [84]:
# Overwrite the Survived column of the sample submission with the predictions (y_pred)
submission['Survived'] = y_pred

In [85]:
# Display the submission frame (PassengerId, Survived)
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [86]:
# Write submission.csv; index=False keeps the DataFrame row index out of the file
submission.to_csv('submission.csv', index=False)