# Feature Engineering

## 데이터 전처리 및 특성 추출

In [1]:
import pandas as pd

In [2]:
# Load the training and test sets, then keep both DataFrames in one list so
# that each preprocessing step can be applied to both with a single loop.
# (The frames are not concatenated; the list holds references to each one.)
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

train_test_data = [train, test]

### Name Feature

In [3]:
# Extract the honorific from each passenger name: the run of ASCII letters
# that follows a space and is terminated by a literal period
# (e.g. "Mr" in "Braund, Mr. Owen Harris").
# The pattern is a raw string so that `\.` reaches the regex engine unchanged
# instead of being parsed as an (invalid) Python string escape.
for data in train_test_data:
    # expand=False returns a Series for the single capture group.
    data['Title'] = data.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss


In [4]:
# Count how many passengers carry each extracted Title, split by sex.
# pd.crosstab builds the frequency table.
pd.crosstab(train['Title'], train['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [5]:
# Collapse rare or variant titles into a small set of groups.
# Keys are the group labels; values are the raw titles folded into them.
title_groups = {
    'Other': ['Capt', 'Col', 'Don', 'Dona', 'Dr', 'Jonkheer', 'Major', 'Rev'],
    'Royal': ['Countess', 'Lady', 'Sir'],
    'Miss': ['Mlle', 'Ms'],
    'Mrs': ['Mme'],
}
# Invert to a flat {raw title: group label} lookup for a single replace call.
title_aliases = {
    raw: group
    for group, raw_titles in title_groups.items()
    for raw in raw_titles
}

for data in train_test_data:
    # Titles that are not keys of title_aliases are left unchanged.
    data['Title'] = data['Title'].replace(title_aliases)

In [6]:
# Frequency table of Title by Sex for the training set.
pd.crosstab(train['Title'], train['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Master,0,40
Miss,185,0
Mr,0,517
Mrs,126,0
Other,1,19
Royal,2,1


In [7]:
# Map each Title label to an integer code.
title_mapping = {
    'Master': 1, 'Miss': 2, 'Mr': 3,
    'Mrs': 4, 'Other': 5, 'Royal':6
}

for data in train_test_data:
    # Series.map yields NaN for any label that is not a key of title_mapping.
    data['Title'] = data['Title'].map(title_mapping)

train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,4
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2


### Sex feature

In [8]:
# Frequency table of Sex by Survived for the training set.
pd.crosstab(train['Sex'], train['Survived'])

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,81,233
male,468,109


In [9]:
# Encode Sex as an integer: male -> 0, female -> 1.
sex_mapping = {
    'male': 0, 'female': 1
}

for data in train_test_data:
    # Series.map yields NaN for any value that is not a key of sex_mapping.
    data['Sex'] = data['Sex'].map(sex_mapping)

In [10]:
# Show the first three training rows to inspect the encoded Sex column.
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,4
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,2


### Embarked Feature

In [11]:
# Check whether Embarked contains NaN values (dropna=False keeps NaN in the counts).
train.Embarked.value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

- 대부분의 값이 S이므로 NaN을 S로 간주한다.

In [12]:
# Fill missing Embarked values with 'S' in both datasets.
for data in train_test_data:
    data['Embarked'] = data['Embarked'].fillna('S')

In [13]:
# Encode Embarked as an integer: S -> 0, C -> 1, Q -> 2.
embarked_mapping = {
    'S': 0, 'C': 1, 'Q': 2
}
for data in train_test_data:
    # Series.map yields NaN for any value that is not a key of embarked_mapping.
    data['Embarked'] = data['Embarked'].map(embarked_mapping)

In [14]:
# Verify the result: count the encoded Embarked values in the training set.
train.Embarked.value_counts()

0    646
1    168
2     77
Name: Embarked, dtype: int64

### Age Feature

In [17]:
for data in train_test_data:
    # Impute missing ages with the mean age of the same dataset, then truncate
    # to whole years. The result is assigned back to the column rather than
    # using fillna(inplace=True) on the `data['Age']` selection: that chained
    # in-place form is deprecated and does not update the DataFrame when
    # pandas Copy-on-Write is enabled.
    data['Age'] = data['Age'].fillna(data['Age'].mean()).astype(int)

# Split the training ages into five equal-width bands. Computed once, after
# the loop, because it depends only on `train`.
train['AgeBand'] = pd.cut(train['Age'], 5)

In [19]:
# Mean of Survived within each AgeBand of the training set.
train[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean()

Unnamed: 0,AgeBand,Survived
0,"(-0.08, 16.0]",0.55
1,"(16.0, 32.0]",0.344762
2,"(32.0, 48.0]",0.403226
3,"(48.0, 64.0]",0.434783
4,"(64.0, 80.0]",0.090909


In [24]:
# Replace each age with the ordinal code of its band:
#   0: age <= 16
#   1: 16 < age <= 32
#   2: 32 < age <= 48
#   3: 48 < age <= 64
#   4: age > 64
age_edges = [float('-inf'), 16, 32, 48, 64, float('inf')]

for data in train_test_data:
    # pd.cut intervals are right-closed by default; labels=False returns the
    # integer index of the bin instead of an Interval label.
    data['Age'] = pd.cut(data['Age'], bins=age_edges, labels=False)


In [25]:
# Count how many training rows fall into each encoded Age value.
train.Age.value_counts()

1    525
2    186
0    100
3     69
4     11
Name: Age, dtype: int64