## 과제.
타이타닉 생존자 예측 모델 개발을 위해 전처리와 통계적·시각적 탐색을 수행하여 최적의 분석용 데이터셋을 작성하세요.

#### Titanic data 전처리
- 분석 데이터 : titanic3.csv
- 재사용 가능한 전처리 사용자 함수를 작성하여 전처리
    - Null 값 처리 : Age는 평균나이, 나머지 칼럼은 'N'값으로 변경
    - 불필요한 속성 칼럼 삭제
    - 문자열 칼럼 레이블 인코딩
- 통계적, 시각적 탐색을 통한 다양한 인사이트 도출
- 탐색적 분석을 통한 feature engineering 및 파생변수 생성

#### 컬럼 정보

- survived : 생존여부(1: 생존, 0 : 사망)
- pclass : 승선권 클래스(1 : 1st, 2 : 2nd ,3 : 3rd)
- name : 승객 이름
- sex : 승객 성별
- age : 승객 나이
- sibsp : 동반한 형제자매, 배우자 수
- parch : 동반한 부모, 자식 수
- ticket : 티켓의 고유 넘버
- fare : 티켓의 요금
- cabin : 객실 번호
- embarked : 승선한 항구명(C : Cherbourg, Q : Queenstown, S : Southampton)
- boat : 탑승한 구명보트 번호
- body : 수습된 시신의 식별 번호
- home.dest : 거주지 / 목적지

In [1]:
import pandas as pd
import re
import matplotlib as plt

# Load the raw passenger table.
df = pd.read_csv('../판다스/dataset/dataset_base/titanic3.csv')

# Remove the columns that are not used in this analysis in a single call.
# 'boat' is deliberately left in place at this stage.
df.drop(columns=['embarked', 'home.dest', 'ticket', 'cabin', 'body'], inplace=True)

df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,fare,boat
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,211.3375,2.0
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,151.55,11.0
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,151.55,
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,151.55,
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,151.55,


In [2]:
# Compiled once; matches ", <title>." and captures the title lazily.
_TITLE_PATTERN = re.compile(r',\s(.+?)\.')


def extract_word(name):
    """Return the honorific (title) embedded in a passenger name.

    The title is the text between the first comma-plus-whitespace and the
    period that follows it, e.g. "Allen, Miss. Elisabeth Walton" -> "Miss".

    Args:
        name: Passenger name. Non-string values (e.g. NaN for a missing
            name, or None) are accepted and yield None instead of raising
            TypeError inside the regex search.

    Returns:
        The title string, or None when ``name`` is not a string or does not
        contain the ", <title>." pattern.
    """
    # Missing names arrive as float NaN when this is applied to a column
    # before any null handling, so guard before touching the regex.
    if not isinstance(name, str):
        return None
    match = _TITLE_PATTERN.search(name)
    return match.group(1) if match else None

In [3]:
# Put the title extracted from each name into a new 'word' column at position 2.
df.insert(
    loc=2,
    column='word',
    value=df['name'].apply(extract_word),
)

In [4]:
# The title has already been copied into 'word', so the raw name is removed.
df.drop(columns=['name'], inplace=True)

In [5]:
# Missing ages take the mean age; every other missing value becomes the
# placeholder string 'N'.
df = df.fillna({'age': df['age'].mean()}).fillna('N')
# Keep ages at two decimal places.
df['age'] = df['age'].round(2)

df.head()

Unnamed: 0,pclass,survived,word,sex,age,sibsp,parch,fare,boat
0,1,1,Miss,female,29.0,0,0,211.3375,2
1,1,1,Master,male,0.92,1,2,151.55,11
2,1,0,Miss,female,2.0,1,2,151.55,N
3,1,0,Mr,male,30.0,1,2,151.55,N
4,1,0,Mrs,female,25.0,1,2,151.55,N


In [6]:
# Show the first rows again to check the frame after null handling.
df.head()

Unnamed: 0,pclass,survived,word,sex,age,sibsp,parch,fare,boat
0,1,1,Miss,female,29.0,0,0,211.3375,2
1,1,1,Master,male,0.92,1,2,151.55,11
2,1,0,Miss,female,2.0,1,2,151.55,N
3,1,0,Mr,male,30.0,1,2,151.55,N
4,1,0,Mrs,female,25.0,1,2,151.55,N


In [7]:
# Correlate the numeric columns only. After fillna('N') several columns hold
# strings; pandas >= 2.0 no longer drops those silently in DataFrame.corr()
# (numeric_only defaults to False and the call raises), so the numeric
# columns are selected explicitly, which works on every pandas version.
df.select_dtypes(include='number').corr()

Unnamed: 0,pclass,survived,age,sibsp,parch
pclass,1.0,-0.312469,-0.366378,0.060832,0.018322
survived,-0.312469,1.0,-0.050195,-0.027825,0.08266
age,-0.366378,-0.050195,1.0,-0.190747,-0.130869
sibsp,0.060832,-0.027825,-0.190747,1.0,0.373587
parch,0.018322,0.08266,-0.130869,0.373587,1.0


In [8]:
# List the distinct titles found in the 'word' column.
df['word'].unique()

array(['Miss', 'Master', 'Mr', 'Mrs', 'Col', 'Mme', 'Dr', 'Major', 'Capt',
       'Lady', 'Sir', 'Mlle', 'Dona', 'Jonkheer', 'the Countess', 'Don',
       'Rev', 'Ms'], dtype=object)

In [9]:
# Label-encode the two string columns with fixed integer codes.
sex_codes = {'male': 0, 'female': 1}

# Each title is numbered by its position in this list (Miss=0 ... Ms=17).
title_order = [
    'Miss', 'Master', 'Mr', 'Mrs', 'Col', 'Mme', 'Dr', 'Major', 'Capt',
    'Lady', 'Sir', 'Mlle', 'Dona', 'Jonkheer', 'the Countess', 'Don',
    'Rev', 'Ms',
]
title_codes = {title: code for code, title in enumerate(title_order)}

# replace() leaves any value without a mapping entry untouched.
df['sex'] = df['sex'].replace(sex_codes)
df['word'] = df['word'].replace(title_codes)
df

Unnamed: 0,pclass,survived,word,sex,age,sibsp,parch,fare,boat
0,1,1,0,1,29.00,0,0,211.3375,2
1,1,1,1,0,0.92,1,2,151.55,11
2,1,0,0,1,2.00,1,2,151.55,N
3,1,0,2,0,30.00,1,2,151.55,N
4,1,0,3,1,25.00,1,2,151.55,N
...,...,...,...,...,...,...,...,...,...
1304,3,0,0,1,14.50,1,0,14.4542,N
1305,3,0,0,1,29.88,1,0,14.4542,N
1306,3,0,2,0,26.50,0,0,7.225,N
1307,3,0,2,0,27.00,0,0,7.225,N


In [10]:
# 'boat' is removed rather than encoded: the abandoned experiment re-coded it
# to 1 for every survivor and 0 for everyone else, i.e. a copy of 'survived'.
df.drop(columns=['boat'], inplace=True)

In [11]:
# Missing fares were turned into the placeholder 'N' by the earlier blanket
# fillna('N'); give those rows a fare that depends on the ticket class.
# NOTE(review): 87.5 / 21.7 / 13.3 are presumably the per-class mean fares
# (the original scratch notes list them next to each class) — confirm.
fare_by_pclass = {1: 87.5, 2: 21.7, 3: 13.3}
for class_id, class_fare in fare_by_pclass.items():
    missing_in_class = (df['pclass'] == class_id) & (df['fare'] == 'N')
    df.loc[missing_in_class, 'fare'] = class_fare

# The 'N' placeholder left the column as object dtype; convert it back to
# float so fare behaves as a numeric feature (correlation, comparisons,
# model input). This raises if a placeholder survived, i.e. a row whose
# pclass is not 1, 2 or 3.
df['fare'] = df['fare'].astype(float)

df

Unnamed: 0,pclass,survived,word,sex,age,sibsp,parch,fare
0,1,1,0,1,29.00,0,0,211.3375
1,1,1,1,0,0.92,1,2,151.55
2,1,0,0,1,2.00,1,2,151.55
3,1,0,2,0,30.00,1,2,151.55
4,1,0,3,1,25.00,1,2,151.55
...,...,...,...,...,...,...,...,...
1304,3,0,0,1,14.50,1,0,14.4542
1305,3,0,0,1,29.88,1,0,14.4542
1306,3,0,2,0,26.50,0,0,7.225
1307,3,0,2,0,27.00,0,0,7.225


In [12]:
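# NOTE: fare binning is disabled. Applied in this order the rules overwrite
# each other: the codes 1 and 2 written by the first two lines fall inside
# 0..13.3, so the last line would re-code them to 3.
# df.loc[(df['fare'] > 21.7), 'fare'] = 1
# df.loc[(df['fare'] > 13.3) & (df['fare'] <= 21.7), 'fare'] = 2
# df.loc[(df['fare'] >= 0) & (df['fare'] <= 13.3), 'fare'] = 3
# df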

In [13]:
# Display the frame; the fare-binning cell before this one is commented out.
df

Unnamed: 0,pclass,survived,word,sex,age,sibsp,parch,fare
0,1,1,0,1,29.00,0,0,211.3375
1,1,1,1,0,0.92,1,2,151.55
2,1,0,0,1,2.00,1,2,151.55
3,1,0,2,0,30.00,1,2,151.55
4,1,0,3,1,25.00,1,2,151.55
...,...,...,...,...,...,...,...,...
1304,3,0,0,1,14.50,1,0,14.4542
1305,3,0,0,1,29.88,1,0,14.4542
1306,3,0,2,0,26.50,0,0,7.225
1307,3,0,2,0,27.00,0,0,7.225


In [14]:
# family = 1 when the passenger has any sibling/spouse or parent/child aboard,
# otherwise 0.
df['family'] = df[['sibsp', 'parch']].ne(0).any(axis=1).astype(int)


In [15]:
# Remove the two count columns that the 'family' flag was built from.
df.drop(columns=['sibsp', 'parch'], inplace=True)


In [16]:
# Display the frame after sibsp/parch were replaced by the 'family' flag.
df

Unnamed: 0,pclass,survived,word,sex,age,fare,family
0,1,1,0,1,29.00,211.3375,0
1,1,1,1,0,0.92,151.55,1
2,1,0,0,1,2.00,151.55,1
3,1,0,2,0,30.00,151.55,1
4,1,0,3,1,25.00,151.55,1
...,...,...,...,...,...,...,...
1304,3,0,0,1,14.50,14.4542,1
1305,3,0,0,1,29.88,14.4542,1
1306,3,0,2,0,26.50,7.225,0
1307,3,0,2,0,27.00,7.225,0


In [18]:
# Save the prepared frame and load it straight back from the same file.
PICKLE_PATH = '../판다스/dataset/dataset_base/tdf.pkl'
df.to_pickle(PICKLE_PATH)
titanic_df = pd.read_pickle(PICKLE_PATH)