# Feature Engineering

## 데이터 전처리 및 특성 추출

In [2]:
import pandas as pd

In [3]:
# Load the training and test sets. The two frames stay separate but are
# collected in one list so each preprocessing step can loop over both.
train, test = map(pd.read_csv, ("../input/train.csv", "../input/test.csv"))

train_test_data = [train, test]

### Name Feature

In [4]:
# Pull the honorific out of each passenger name: the pattern captures a run
# of letters that follows a space and is terminated by a literal dot
# (e.g. " Mr.").
# The pattern is a raw string so "\." is not parsed as an (invalid) string
# escape, and expand=False makes str.extract return a Series for the single
# capture group instead of a one-column DataFrame.
for data in train_test_data:
    data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss


In [5]:
# Show how many passengers carry each extracted Title, split by sex.
# A pandas crosstab builds the frequency table.
pd.crosstab(train['Title'], train['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [6]:
# Collapse the extracted titles into a few groups: the listed rare titles
# become 'Other' or 'Royal', and 'Mlle'/'Ms'/'Mme' are folded into
# 'Miss'/'Mrs'. Titles not listed here are left untouched.
title_groups = {
    'Other': ['Capt', 'Col', 'Don', 'Dona', 'Dr', 'Jonkheer', 'Major', 'Rev'],
    'Royal': ['Countess', 'Lady', 'Sir'],
    'Miss': ['Mlle', 'Ms'],
    'Mrs': ['Mme'],
}

# Flatten to {original title: group} so one replace() call does all the work.
title_aliases = {}
for group, aliases in title_groups.items():
    title_aliases.update(dict.fromkeys(aliases, group))

for frame in train_test_data:
    frame['Title'] = frame['Title'].replace(title_aliases)

In [7]:
# Rebuild the Title-by-Sex frequency table to inspect the current Title values.
pd.crosstab(train['Title'], train['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Master,0,40
Miss,185,0
Mr,0,517
Mrs,126,0
Other,1,19
Royal,2,1


In [8]:
# Encode each title group as an integer code (1-6).
title_mapping = {
    title: code
    for code, title in enumerate(
        ['Master', 'Miss', 'Mr', 'Mrs', 'Other', 'Royal'], start=1
    )
}

for frame in train_test_data:
    frame['Title'] = frame['Title'].map(title_mapping)

train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,4
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2


### Sex feature

In [9]:
# Frequency table of Survived (0/1) for each sex.
pd.crosstab(train['Sex'], train['Survived'])

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,81,233
male,468,109


In [10]:
# Encode sex as an integer: male -> 0, female -> 1.
sex_mapping = dict(male=0, female=1)

for frame in train_test_data:
    frame['Sex'] = frame['Sex'].map(sex_mapping)

In [11]:
# Preview the first three training rows to check the encoded Sex column.
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,4
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,2


### Embarked Feature

In [12]:
# Check whether Embarked contains NaN values (dropna=False counts them too).
train.Embarked.value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

- 대부분의 값이 S이므로 NaN을 S로 간주한다.

In [13]:
# Impute missing embarkation ports with 'S'.
for frame in train_test_data:
    frame['Embarked'] = frame['Embarked'].fillna(value='S')

In [14]:
# Encode the embarkation port as an integer: S -> 0, C -> 1, Q -> 2.
embarked_mapping = dict(S=0, C=1, Q=2)

for frame in train_test_data:
    frame['Embarked'] = frame['Embarked'].map(embarked_mapping)

In [15]:
# Verify the encoded Embarked values.
train.Embarked.value_counts()

0    646
1    168
2     77
Name: Embarked, dtype: int64

### Age Feature

In [16]:
# Impute missing ages with the mean age of the same dataset, then truncate
# to whole years. The result is assigned back to the column instead of using
# fillna(inplace=True) on a column selection, which is chained assignment
# and does not update the frame under pandas Copy-on-Write.
for data in train_test_data:
    data['Age'] = data['Age'].fillna(data['Age'].mean()).astype(int)

# AgeBand (five equal-width bins) exists only on train, so it is computed
# once after the loop instead of on every iteration.
train['AgeBand'] = pd.cut(train['Age'], 5)

In [17]:
# Mean of Survived within each AgeBand.
train[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean()

Unnamed: 0,AgeBand,Survived
0,"(-0.08, 16.0]",0.55
1,"(16.0, 32.0]",0.344762
2,"(32.0, 48.0]",0.403226
3,"(48.0, 64.0]",0.434783
4,"(64.0, 80.0]",0.090909


In [18]:
# Replace raw ages with ordinal band codes 0-4, with edges at 16/32/48/64.
# (lower, upper, code) for the half-open middle bands: lower < age <= upper.
age_bands = [(16, 32, 1), (32, 48, 2), (48, 64, 3)]

for frame in train_test_data:
    # Snapshot the raw ages so each mask is evaluated on unbinned values.
    age = frame['Age'].copy()
    frame.loc[age <= 16, 'Age'] = 0
    for lower, upper, code in age_bands:
        frame.loc[(age > lower) & (age <= upper), 'Age'] = code
    frame.loc[age > 64, 'Age'] = 4


In [19]:
# Number of training rows per Age band code.
train.Age.value_counts()

1    525
2    186
0    100
3     69
4     11
Name: Age, dtype: int64

### Fare Feature

In [20]:
# Mean Fare for each Pclass in the training set.
train[['Pclass', 'Fare']].groupby(['Pclass'], as_index=False).mean()

Unnamed: 0,Pclass,Fare
0,1,84.154687
1,2,20.662183
2,3,13.67555


In [21]:
# Pclass of the test rows whose Fare is missing.
test[test['Fare'].isnull()]['Pclass']

152    3
Name: Pclass, dtype: int64

In [22]:
# Fill missing fares with the mean training-set fare of the passenger's own
# class. The means are computed from the data rather than hard-coding the
# rounded Pclass 3 mean (13.675550), so a missing fare in any class gets a
# sensible value. This must run while Fare still holds raw fares, i.e.
# before Fare is replaced by band codes.
fare_by_class = train.groupby('Pclass')['Fare'].mean()

for data in train_test_data:
    # fillna aligns the replacement Series on the row index, so each missing
    # fare receives the mean looked up from that row's Pclass.
    data['Fare'] = data['Fare'].fillna(data['Pclass'].map(fare_by_class))

In [23]:
# Replace raw fares with ordinal band codes 0-4.
# (lower, upper, code) for the half-open middle bands: lower < fare <= upper.
fare_bands = [(7.854, 10.5, 1), (10.5, 21.679, 2), (21.679, 39.688, 3)]

for frame in train_test_data:
    # Snapshot the raw fares so each mask is evaluated on unbinned values.
    fare = frame['Fare'].copy()
    frame.loc[fare <= 7.854, 'Fare'] = 0
    for lower, upper, code in fare_bands:
        frame.loc[(fare > lower) & (fare <= upper), 'Fare'] = code
    frame.loc[fare > 39.688, 'Fare'] = 4

In [24]:
# Number of training rows per Fare band code.
train.Fare.value_counts()

1.0    197
3.0    181
4.0    176
2.0    171
0.0    166
Name: Fare, dtype: int64

### SibSp and Parch Feature (= Family)

- 형제, 자매, 배우자, 부모님, 자녀의 수가 많을수록 생존한 경우가 많다.

- 두개의 Feature를 합쳐서 Family라는 새로운 Feature로 만들기

In [25]:
# Family = Parch + SibSp, combining the two relative counts into one feature.
for frame in train_test_data:
    frame["Family"] = frame["Parch"].add(frame["SibSp"])
    

In [26]:
# Distribution of the combined Family count in the training set.
train.Family.value_counts()

0     537
1     161
2     102
3      29
5      22
4      15
6      12
10      7
7       6
Name: Family, dtype: int64

### 특성 추출 및 나머지 전처리

- 학습 시킬때 제외시킬 Feature들은 Drop

In [27]:
# List the columns currently present in each frame before dropping any.
for frame in (train, test):
    print(frame.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'AgeBand',
       'Family'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'Family'],
      dtype='object')


- 사용 안하는 feature

    - Name, Ticket, Cabin, SibSp, Parch

In [28]:
# Columns excluded from training in both frames.
features_drop = ['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']

# train additionally loses AgeBand and PassengerId; test keeps PassengerId.
train = train.drop(columns=[*features_drop, 'AgeBand', 'PassengerId'])
test = test.drop(columns=features_drop)


In [29]:
# Preview the training frame after dropping the unused columns.
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Family
0,0,3,0,1,0.0,0,3,1
1,1,1,1,2,4.0,1,4,1
2,1,3,1,1,1.0,0,2,0
3,1,1,1,2,4.0,0,4,1
4,0,3,0,2,1.0,0,3,0


In [30]:
# Preview the test frame after dropping the unused columns.
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,Family
0,892,3,0,2,0.0,2,3,0
1,893,3,1,2,0.0,0,4,1
2,894,2,0,3,1.0,2,3,0
3,895,3,0,1,1.0,0,3,0
4,896,3,1,1,2.0,0,4,2


- Categorical Feature에 대해 one-hot encoding

In [31]:
# One-hot encode categorical variables in both frames.
# NOTE(review): without `columns=`, get_dummies only expands object/string/
# category columns. Sex, Embarked and Title were already mapped to numbers,
# and the previews printed after this step show the same column set, so this
# call appears to change nothing. Confirm whether real one-hot encoding of
# those columns was intended before altering the saved CSV schema.
train, test = pd.get_dummies(train), pd.get_dummies(test)

- train data와 label을 분리

In [32]:
# Separate the target column (Survived) from the feature columns.
train_label = train.loc[:, 'Survived']
train_data = train.drop(columns='Survived')

In [33]:
# Take a copy of the processed test frame under the name used for export.
test_data = test.copy()

In [34]:
# Preview the test features that will be saved.
test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,Family
0,892,3,0,2,0.0,2,3,0
1,893,3,1,2,0.0,0,4,1
2,894,2,0,3,1.0,2,3,0
3,895,3,0,1,1.0,0,3,0
4,896,3,1,1,2.0,0,4,2


In [35]:
# Preview the training features (Survived removed).
train_data.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Title,Family
0,3,0,1,0.0,0,3,1
1,1,1,2,4.0,1,4,1
2,3,1,1,1.0,0,2,0
3,1,1,2,4.0,0,4,1
4,3,0,2,1.0,0,3,0


In [36]:
# Preview the training labels (the Survived column).
train_label.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

### DataFrame을 csv로 저장하기

- data의 index는 저장하지 않기 (`index=False`)

- csv로 저장하기(input 폴더에)

In [37]:
# Write the labels to the input folder; mode='w' overwrites an existing
# file and index=False leaves the row index out of the CSV.
train_label.to_csv('../input/train_label.csv', mode='w', index=False)

In [38]:
# Write the training features to the input folder; mode='w' overwrites an
# existing file and index=False leaves the row index out of the CSV.
train_data.to_csv('../input/train_data.csv', mode='w', index=False)

In [39]:
# Write the test features to the input folder; mode='w' overwrites an
# existing file and index=False leaves the row index out of the CSV.
test_data.to_csv('../input/test_data.csv', mode='w', index=False)