### Machine Learning 과정

- 문제정의
- 데이터 수집
- 데이터 전처리 (인코딩, 특성공학)
- 탐색적 데이터 분석 (시각화, 특성선택)
- 모델 선택 및 학습
- 하이퍼 파라미터 튜닝 (교차검증, 그리드서치)
- 모델 평가

In [64]:
import numpy as np
import pandas as pd

### 데이터 사전 
- 각 컬럼의 정보 저장
- PassengerId : 탑승객 id

### 데이터 수집 및 로드

In [65]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [66]:
# targets 변수에 정답 레이블 담아주세요!
targets = train['Survived']
targets

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

### 특성 공학 Feature Engineering

In [67]:
# 생존여부(Survived) 따로 분리, 저장
# train 데이터에서 Survived drop, inplace = True
train.drop('Survived',axis=1,inplace= True)

In [68]:
train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [69]:
test.drop('Survived',axis=1,inplace= True)

In [70]:
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [71]:
print(train.shape)
print(test.shape)

(891, 11)
(418, 11)


In [72]:
# concat 이용 병합
# ignore_index = True 기존 train, test 인덱스는 무시하고 순차적으로 만든다.
combined = pd.concat([train, test], ignore_index = True)

In [73]:
#PassengerId 삭제 - 기존의 인덱스로 설정한 컬럼, 여기서는 불필요
combined.drop('PassengerId', axis=1, inplace=True)
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Name      1309 non-null   object 
 2   Sex       1309 non-null   object 
 3   Age       1046 non-null   float64
 4   SibSp     1309 non-null   int64  
 5   Parch     1309 non-null   int64  
 6   Ticket    1309 non-null   object 
 7   Fare      1308 non-null   float64
 8   Cabin     295 non-null    object 
 9   Embarked  1307 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 102.4+ KB


In [74]:
combined

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
1304,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


## 요금 결측치 처리

In [75]:
# Fare 결측치 있는 행 출력
combined[combined ['Fare'].isnull()]

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1043,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [76]:
combined['Fare'].mean()

33.2954792813456

In [77]:
combined['Fare'].fillna(combined['Fare'].mean(), inplace = True)

In [78]:
combined[combined['Fare'].isnull()]

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


### 탑승객 호칭 처리

In [79]:
# Name 컬럼 인덱싱
# 5개의 행만 봅시다.

combined['Name'].shape

(1309,)

In [80]:
def split_title(x):
    return x.split(",")[1].split(".")[0].strip()

In [81]:
# 데이터 전처리를 한꺼번에 적용하기 위해서 combined로 적용
# train을 기준으로 결측치를 채움
train['Name'].apply(split_title)

0        Mr
1       Mrs
2      Miss
3       Mrs
4        Mr
       ... 
886     Rev
887    Miss
888    Miss
889      Mr
890      Mr
Name: Name, Length: 891, dtype: object

몇 개의 타이틀로 정리

- Officer (장교) : 'Capt',"Col", "Major" ,"Dr", "Rev"
- Royalty (귀족) : "Jonkheer", "Don",  "Sir", "the Countess", "Lady"
- Mr :"Mr"
- Mrs : "Mme", "Ms", "Mrs"
- Miss : "Mlle", "Miss" 
- Master : "Master"

In [82]:
Title_Dic = {
    'Capt' : 'Officer', 'Col' : 'Officer', 'Major' : 'Officer', 'Dr' : 'Officer', 'Rev' : 'Officer',
    'Jonkheer' : 'Royalty', 'Don' : 'Royalty', 'Sir' : 'Royalty', 'the Countess' : 'Royalty', 'Lady' : 'Royalty', 'Dona' : 'Royalty',
    'Mr' : 'Mr',
    'Mme' : 'Mrs', 'Ms' : 'Mrs', 'Mrs' : 'Mrs',
    'Mlle' : 'Miss', 'Miss' : 'Miss',
    'Master' : 'Master'
}

In [83]:
Title_Dic

{'Capt': 'Officer',
 'Col': 'Officer',
 'Major': 'Officer',
 'Dr': 'Officer',
 'Rev': 'Officer',
 'Jonkheer': 'Royalty',
 'Don': 'Royalty',
 'Sir': 'Royalty',
 'the Countess': 'Royalty',
 'Lady': 'Royalty',
 'Dona': 'Royalty',
 'Mr': 'Mr',
 'Mme': 'Mrs',
 'Ms': 'Mrs',
 'Mrs': 'Mrs',
 'Mlle': 'Miss',
 'Miss': 'Miss',
 'Master': 'Master'}

In [84]:
combined['Title'] = combined['Name'].apply(split_title)
#map 함수를 통해서 딕셔너리의 키 값과 시리즈 인덱스 값이 같은 데이터를 찾아서 변경
combined['Title'] = combined['Title'].map(Title_Dic)

In [85]:
combined.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [86]:
combined['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Royalty', 'Officer'], dtype=object)

### 나이 결측치 처리
- 좀 더 세분화 해서 성별, 선실 등급, 호칭으로 묶어서 나이를 구해보자

In [87]:
# as_index = False 속성으로 그룹을 묶는 컬럼을 인덱스에서 제외
group_median_train = combined.iloc[:891].groupby(['Sex','Pclass','Title'], as_index = False).median()

In [88]:
#성별, 선실등급, 호칭, 나이 컬럼만 인덱싱 해주세요!
group_median_train = group_median_train[['Sex','Pclass','Title', 'Age']]
#group_median_train = group_median_train.iloc[:,:4]

In [89]:
group_median_train

Unnamed: 0,Sex,Pclass,Title,Age
0,female,1,Miss,30.0
1,female,1,Mrs,40.0
2,female,1,Officer,49.0
3,female,1,Royalty,40.5
4,female,2,Miss,24.0
5,female,2,Mrs,31.5
6,female,3,Miss,18.0
7,female,3,Mrs,31.0
8,male,1,Master,4.0
9,male,1,Mr,40.0


In [90]:
# 내부 구조 이해
condition = (
            (group_median_train['Sex'] == combined.loc[0,'Sex']) &
            (group_median_train['Pclass'] == combined.loc[0,'Pclass']) &
            (group_median_train['Title'] == combined.loc[0,'Title'])
)

print(group_median_train[condition])
print("----"*10)
print(group_median_train[condition]['Age'].values[0])

     Sex  Pclass Title   Age
16  male       3    Mr  26.0
----------------------------------------
26.0


In [91]:
def fill_age(row):
    condition = (
                (group_median_train['Sex'] == row['Sex']) &
                (group_median_train['Pclass'] == row['Pclass']) &
                (group_median_train['Title'] == row['Title'])
    )
    if np.isnan(row['Age']):
        return group_median_train[condition]['Age']
    else:
        return row['Age']

In [92]:
combined['Age'] = combined.apply(fill_age, axis=1)
combined['Age']

0           22
1           38
2           26
3           35
4           35
         ...  
1304    [26.0]
1305        39
1306      38.5
1307    [26.0]
1308     [4.0]
Name: Age, Length: 1309, dtype: object

### Name 처리
- 이름 특성 삭제
- 카테고리화 되어있는 호칭을 Model이 계산할 수 있도록 one-hot-encoding

In [93]:
# 이름 특성 삭제
combined.drop('Name', axis=1, inplace=True)
combined.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,3,male,22,1,0,A/5 21171,7.25,,S,Mr
1,1,female,38,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,female,26,0,0,STON/O2. 3101282,7.925,,S,Miss
3,1,female,35,1,0,113803,53.1,C123,S,Mrs
4,3,male,35,0,0,373450,8.05,,S,Mr


In [94]:
X_one_hot = pd.get_dummies('Title')

In [95]:
# 'Title'특성을 원핫인코딩, prefix='Title'
#pd.get_dimmies()

title_dummies = pd.get_dummies(combined['Title'], prefix = 'Title')
title_dummies

Unnamed: 0,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
0,0,0,1,0,0,0
1,0,0,0,1,0,0
2,0,1,0,0,0,0
3,0,0,0,1,0,0
4,0,0,1,0,0,0
...,...,...,...,...,...,...
1304,0,0,1,0,0,0
1305,0,0,0,0,0,1
1306,0,0,1,0,0,0
1307,0,0,1,0,0,0


In [96]:
#concat 이용해서 병합
combined = pd.concat([combined, title_dummies], axis=1)
combined.drop('Title', axis=1, inplace=True)

In [97]:
combined.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
0,3,male,22,1,0,A/5 21171,7.25,,S,0,0,1,0,0,0
1,1,female,38,1,0,PC 17599,71.2833,C85,C,0,0,0,1,0,0
2,3,female,26,0,0,STON/O2. 3101282,7.925,,S,0,1,0,0,0,0
3,1,female,35,1,0,113803,53.1,C123,S,0,0,0,1,0,0
4,3,male,35,0,0,373450,8.05,,S,0,0,1,0,0,0


### 승선항(Embarked) 처리
- 결측치는 많은 사람들이 탑승한 S로 채운다.
- one-hot-encoding

In [98]:
# 891개의 행까지 combined에서 인덱싱하고 승선항의 빈도를 세기
# value_counts()
combined.loc[:891,'Embarked'].value_counts()

S    644
C    168
Q     78
Name: Embarked, dtype: int64

In [99]:
# 결측치가 있다면 'S'로 처리해주세요!
#combined(train,test)
embarked_dummies = pd.get_dummies(combined['Embarked']).fillna('S')

In [100]:
# 'Embarked' one-hot-encoding
embarked_dummies = pd.get_dummies(combined['Embarked'],prefix='Embarked')
embarked_dummies

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
1304,0,0,1
1305,1,0,0
1306,0,0,1
1307,0,0,1


In [101]:
#combined와 embarked_dummies 병합 후 Embarked 컬럼 삭제
combined = pd.concat([combined, embarked_dummies], axis=1)
combined.drop('Embarked', axis=1, inplace=True)
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Pclass         1309 non-null   int64  
 1   Sex            1309 non-null   object 
 2   Age            1309 non-null   object 
 3   SibSp          1309 non-null   int64  
 4   Parch          1309 non-null   int64  
 5   Ticket         1309 non-null   object 
 6   Fare           1309 non-null   float64
 7   Cabin          295 non-null    object 
 8   Title_Master   1309 non-null   uint8  
 9   Title_Miss     1309 non-null   uint8  
 10  Title_Mr       1309 non-null   uint8  
 11  Title_Mrs      1309 non-null   uint8  
 12  Title_Officer  1309 non-null   uint8  
 13  Title_Royalty  1309 non-null   uint8  
 14  Embarked_C     1309 non-null   uint8  
 15  Embarked_Q     1309 non-null   uint8  
 16  Embarked_S     1309 non-null   uint8  
dtypes: float64(1), int64(3), object(4), uint8(9)
memory 

### Cabin 처리
- 결측치 'U'로 대체
- 숫자를 제거한 맨 앞 글자만 추출해서 Deck 컬럼 생성 후 담아줌
- Cabin 컬럼 삭제
- 원핫인코딩 cabin_dummies
- concat 해서 combined, cabin_dummies 병합


In [102]:
# 결측치 'U'로 대체
combined['Cabin'] = combined['Cabin'].fillna('U')

In [103]:
#숫자를 제거한 맨 앞 글자만 추출해서 Deck 컬럼 생성 후 담아줌
combined['Deck'] = combined['Cabin'].str[0]
combined['Deck']
#Cabin 컬럼 삭제
combined.drop('Cabin', axis=1, inplace = True)

In [104]:
#원핫 인코딩 cabin_dummies
deck_dummies =  pd.get_dummies(combined['Deck'], prefix = 'Deck' )
#concat해서 combined, cabin_dummies 병합
combined = pd.concat([combined,deck_dummies], axis = 1)
#'Deck' 컬럼 삭제
combined.drop('Deck', axis=1, inplace = True)
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Pclass         1309 non-null   int64  
 1   Sex            1309 non-null   object 
 2   Age            1309 non-null   object 
 3   SibSp          1309 non-null   int64  
 4   Parch          1309 non-null   int64  
 5   Ticket         1309 non-null   object 
 6   Fare           1309 non-null   float64
 7   Title_Master   1309 non-null   uint8  
 8   Title_Miss     1309 non-null   uint8  
 9   Title_Mr       1309 non-null   uint8  
 10  Title_Mrs      1309 non-null   uint8  
 11  Title_Officer  1309 non-null   uint8  
 12  Title_Royalty  1309 non-null   uint8  
 13  Embarked_C     1309 non-null   uint8  
 14  Embarked_Q     1309 non-null   uint8  
 15  Embarked_S     1309 non-null   uint8  
 16  Deck_A         1309 non-null   uint8  
 17  Deck_B         1309 non-null   uint8  
 18  Deck_C  

### 성별처리

In [105]:
combined['Sex'].unique()

array(['male', 'female'], dtype=object)

In [106]:
combined['Sex'] = combined['Sex'].map({'male':1, 'female':0})
# 라벨, 레이블 인코딩

In [107]:
combined.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Title_Master,Title_Miss,Title_Mr,...,Embarked_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U
0,3,1,22,1,0,A/5 21171,7.25,0,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,0,38,1,0,PC 17599,71.2833,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,3,0,26,0,0,STON/O2. 3101282,7.925,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,0,35,1,0,113803,53.1,0,0,0,...,1,0,0,1,0,0,0,0,0,0
4,3,1,35,0,0,373450,8.05,0,0,1,...,1,0,0,0,0,0,0,0,0,1


### Pclass 등급 처리

In [108]:
# Pclass 원핫인코딩
pclass_dummies = pd.get_dummies(combined['Pclass'], prefix = 'Pclass')
pclass_dummies

#concat 병합
combined = pd.concat([combined,pclass_dummies], axis=1)

In [109]:
# 기존 Pclass 삭제
combined.drop('Pclass', axis = 1, inplace=True)

In [110]:
#combined.head()
combined.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Ticket,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,...,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U,Pclass_1,Pclass_2,Pclass_3
0,1,22,1,0,A/5 21171,7.25,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
1,0,38,1,0,PC 17599,71.2833,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,0,26,0,0,STON/O2. 3101282,7.925,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
3,0,35,1,0,113803,53.1,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
4,1,35,0,0,373450,8.05,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1


### 가족 관련 특성 처리
- 부모, 자녀, 형제, 배우자 모두 합친 특성 새롭게 만들자
- 가족 숫자에 따라 1인, 소규모, 대규모 가족으로 구분하자

In [111]:
combined.columns

Index(['Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Title_Master',
       'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Title_Royalty',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Deck_A', 'Deck_B', 'Deck_C',
       'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'Deck_U', 'Pclass_1',
       'Pclass_2', 'Pclass_3'],
      dtype='object')

In [112]:
#본인 포함해서 모든 가족수 특성 생성
combined['FamilySize'] = combined['SibSp']+combined['Parch'] + 1

# map 함수 이용
# map(함수
# lambda : 간단한 함수를 줄여서 쓰는 방식 , 함수이름x , 일시적으로 사용했다가 사라지는 함수
combined['Single'] = combined['FamilySize'].map(lambda s : 1 if s == 1 else 0)
combined['SmallFamily'] = combined['FamilySize'].map(lambda s : 1 if 2 <= s <= 4 else 0)
combined['LargeSingle'] = combined['FamilySize'].map(lambda s : 1 if 5 <= s else 0)

In [113]:
combined['FamilySize']

0       2
1       2
2       1
3       2
4       1
       ..
1304    1
1305    1
1306    1
1307    1
1308    3
Name: FamilySize, Length: 1309, dtype: int64

In [114]:
combined.drop(['FamilySize', 'SibSp', 'Parch'], axis = 1, inplace = True)

### 티켓 처리

In [115]:
def cleanTicket(ticket):
    # .과 / 를 없애줌
    ticket = ticket.replace('.','')
    ticket = ticket.replace('/','')
    # 공백 기준으로 자름
    ticket = ticket.split(' ')
    
    # 자른 리스트의 각 항목의 양쪽 공백을 없애줌
    ticket = map(lambda t: t.strip(), ticket)
    
    # 숫자가 아닌 것만 필터링해서 리스트로 만듦
    ticket = list(filter(lambda t : not t.isdigit(), ticket))
    if len(ticket)>0 : 
        return ticket[0]
    else : 
        return 'XXX' # 티켓 글자가 없으면 'XXX'로 표시

In [116]:
# isdigit() 예시
# 문자열 안에 숫자만 있으면 True, 아니면 False
'88a' .isdigit()
'88'.isdigit()

True

In [117]:
#filter() 예시
target = [1,2,3,4,5,6,7,8,9,10]
def is_even(n):
    return True if n % 2 == 0 else False

result = list(filter(is_even,target))
result
#-----
result = filter(lambda x: x%2 == 0, target)
print(result)

<filter object at 0x000001DD0F573748>


In [118]:
combined['Ticket'] = combined['Ticket'].map(cleanTicket)
combined['Ticket']

0            A5
1            PC
2        STONO2
3           XXX
4           XXX
         ...   
1304         A5
1305         PC
1306    SOTONOQ
1307        XXX
1308        XXX
Name: Ticket, Length: 1309, dtype: object

In [119]:
#원핫인코딩
ticket_dummies = pd.get_dummies(combined['Ticket'],prefix='Ticket')
# combined, ticket_dummies 병합
combined = pd.concat([combined,ticket_dummies],axis=1)
# drop -> Ticket
combined.drop('Ticket',axis=1,inplace=True)

In [120]:
combined.head() #살려줘

Unnamed: 0,Sex,Age,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Embarked_C,...,Ticket_SOTONO2,Ticket_SOTONOQ,Ticket_SP,Ticket_STONO,Ticket_STONO2,Ticket_STONOQ,Ticket_SWPP,Ticket_WC,Ticket_WEP,Ticket_XXX
0,1,22,7.25,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,38,71.2833,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,26,7.925,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,35,53.1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,35,8.05,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [121]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 64 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             1309 non-null   int64  
 1   Age             1309 non-null   object 
 2   Fare            1309 non-null   float64
 3   Title_Master    1309 non-null   uint8  
 4   Title_Miss      1309 non-null   uint8  
 5   Title_Mr        1309 non-null   uint8  
 6   Title_Mrs       1309 non-null   uint8  
 7   Title_Officer   1309 non-null   uint8  
 8   Title_Royalty   1309 non-null   uint8  
 9   Embarked_C      1309 non-null   uint8  
 10  Embarked_Q      1309 non-null   uint8  
 11  Embarked_S      1309 non-null   uint8  
 12  Deck_A          1309 non-null   uint8  
 13  Deck_B          1309 non-null   uint8  
 14  Deck_C          1309 non-null   uint8  
 15  Deck_D          1309 non-null   uint8  
 16  Deck_E          1309 non-null   uint8  
 17  Deck_F          1309 non-null   u

### 모델링

In [122]:
# X_train, y_train, X_test 분리
# X_train은 890row까지 분리
X_train = combined.iloc[:891]
y_train = targets #훈련 정답
X_test = combined.iloc[891:] #테스트 문제

In [123]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(891, 64)
(891,)
(418, 64)


#### 앙상블

In [124]:
from sklearn.ensemble import VotingClassifier #Voting : 같은 모델들을 앙상블 한다.
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

knn_model = KNeighborsClassifier()
tree_model = DecisionTreeClassifier()
logi_model = LogisticRegression()
forest_model = RandomForestClassifier()

In [125]:
# voting
voting_model = VotingClassifier(
    estimators = [
        ('knn1',knn_model),
        ('tree1', tree_model),
        ('logi1', logi_model),
        ('random1', forest_model)],
    voting = 'soft')

In [126]:
# 학습
voting_model.fit(X_train, y_train) 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


VotingClassifier(estimators=[('knn1',
                              KNeighborsClassifier(algorithm='auto',
                                                   leaf_size=30,
                                                   metric='minkowski',
                                                   metric_params=None,
                                                   n_jobs=None, n_neighbors=5,
                                                   p=2, weights='uniform')),
                             ('tree1',
                              DecisionTreeClassifier(ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity

In [127]:
pre = voting_model.predict(X_test)
pre

array([0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [128]:
result = pd.read_csv('data/gender_submission.csv') ##케글 제출 정답 파일 불러오기
result['Survived'] = pre
result.to_csv('kjw_submission_02.csv', index=False)

In [147]:
#그리드 서치
from sklearn.model_selection import GridSearchCV

In [148]:
param_grid={
    'max_depth' : [5,10,15,20],
    'n_estimators' :[1000,1500,2000,2500],
    'max_features' : [0.5, 0.7],
    'max_leaf_nodes' : [20,50,80]
}

In [152]:
forest_model = RandomForestClassifier(n_estimators=1000,
                                     max_features = 0.7,
                                     max_depth = 5,
                                     min_samples_leaf = 15, #샘플 최소 개수 15개
                                     max_leaf_nodes = 50
                                     )

In [153]:
grid = GridSearchCV(forest_model,
                   param_grid,
                   cv=3)

In [154]:
grid.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=5,
                                              max_features=0.7,
                                              max_leaf_nodes=50,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=15,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=1000, n_jobs=None,
                                              oob_score=False,
                                              random_stat

In [157]:
# 최적의 매게변수 찾기
print('정확도 : ',grid.best_score_)
print('최적 파라미터 :', grid.best_params_)

정확도 :  0.8249158249158248
최적 파라미터 : {'max_depth': 10, 'max_features': 0.5, 'max_leaf_nodes': 50, 'n_estimators': 1500}


In [158]:
final_forest_model = RandomForestClassifier(max_depth = 10, 
                                            max_features = 0.5, 
                                            max_leaf_nodes = 50, 
                                            n_estimators = 1500)

In [160]:
final_forest_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features=0.5,
                       max_leaf_nodes=50, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [161]:
pre = final_forest_model.predict(X_test)

In [162]:
result2 = pd.read_csv('data/gender_submission.csv')
result2['Survived'] = pre
result2.to_csv('krwd_summission_02.csv', index=False)