# Prepare Package

In [377]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Check and Preprocess Data

In [417]:
'''
pclass: A proxy for socio-economic status (SES)
 - 1st = Upper
 - 2nd = Middle
 - 3rd = Lower
age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5
sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)
parch: The dataset defines family relations in this way...
 - Parent = mother, father
 - Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.
'''
train_data = pd.read_csv("/home/ines/titanic/dataset/train.csv")
test_data = pd.read_csv("/home/ines/titanic/dataset/test.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [419]:
train_data.info()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass  

## Title column 만들기
[이곳을 참고하였음](https://teddylee777.github.io/kaggle/kaggle(%EC%BA%90%EA%B8%80)-Titanic-%EC%83%9D%EC%A1%B4%EC%9E%90%EC%98%88%EC%B8%A1-81-%EC%9D%B4%EC%83%81-%EB%8B%AC%EC%84%B1%ED%95%98%EA%B8%B0)

In [378]:
def makeTitle(data):
    data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand = False)

makeTitle(test_data)
makeTitle(train_data)

In [281]:
train_data = train_data.drop(['Survived'], axis=1)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
 11  Title        891 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 83.7+ KB


In [283]:
# 데이터 합쳐서 가공하기
all_data = pd.concat([train_data, test_data], axis=0)

In [284]:
# Age, Fare, Embark, Cabin에서 결측값이 있는것을 확인!
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Name         1309 non-null   object 
 3   Sex          1309 non-null   object 
 4   Age          1046 non-null   float64
 5   SibSp        1309 non-null   int64  
 6   Parch        1309 non-null   int64  
 7   Ticket       1309 non-null   object 
 8   Fare         1308 non-null   float64
 9   Cabin        295 non-null    object 
 10  Embarked     1307 non-null   object 
 11  Title        1309 non-null   object 
dtypes: float64(2), int64(4), object(6)
memory usage: 132.9+ KB


In [382]:
# 결측값을 채우기위한 평균값 준비
mr_mean = all_data[train_data['Title'] == 'Mr']['Age'].mean()
miss_mean = all_data[train_data['Title'] == 'Miss']['Age'].mean()
mrs_mean = all_data[train_data['Title'] == 'Mrs']['Age'].mean()
master_mean = all_data[train_data['Title'] == 'Master']['Age'].mean()
dr_mean = all_data[train_data['Title'] == 'Dr']['Age'].mean()
rev_mean = all_data[train_data['Title'] == 'Rev']['Age'].mean()
major_mean = all_data[train_data['Title'] == 'Major']['Age'].mean()
mlle_mean = all_data[train_data['Title'] == 'Mlle']['Age'].mean()
col_mean = all_data[train_data['Title'] == 'Col']['Age'].mean()
age_mean = all_data['Age'].mean()

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  
  import sys
  
  if __name__ == '__main__':


In [383]:
all_array_data = np.array(all_data)

In [384]:
for data in all_array_data:
    if np.isnan(data[4]):
        if data[11] == 'Mr':
            data[4] = round(mr_mean)
            #print("OK")
        elif data[11] == 'Miss':
            data[4] = round(miss_mean)
            #print("OK")
        elif data[11] == 'Mrs':
            data[4] = round(mrs_mean)
            #print("OK")
        elif data[11] == 'Master':
            data[4] = round(master_mean)
            #print("OK")
        elif data[11] == 'Dr':
            data[4] = round(dr_mean)
            #print("OK")
        elif data[11] == 'Rev':
            data[4] = round(rev_mean)
            #print("OK")
        elif data[11] == 'Major':
            data[4] = round(major_mean)
            #print("OK")
        elif data[11] == 'Mlle':
            data[4] = round(mlle_mean)
            #print("OK")
        elif data[11] == 'Col':
            data[4] = round(col_mean)
            #print("OK")
        else:
            data[4] = round(age_mean)
            #print("OK")

In [385]:
column_list = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title']

In [386]:
new_all_data = pd.DataFrame(all_array_data, columns=column_list)

In [387]:
# 나머지 남아있는 결측값은 결측값 위치의 바로 위의 값으로 결측값을 채우는 방법으로!
new_all_data['Embarked'] = new_all_data['Embarked'].fillna(method='pad')
new_all_data['Fare'] = new_all_data['Fare'].fillna(method='pad')

In [388]:
# 전부 채워진걸 확인
new_all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   object 
 1   Pclass       1309 non-null   object 
 2   Name         1309 non-null   object 
 3   Sex          1309 non-null   object 
 4   Age          1309 non-null   object 
 5   SibSp        1309 non-null   object 
 6   Parch        1309 non-null   object 
 7   Ticket       1309 non-null   object 
 8   Fare         1309 non-null   float64
 9   Cabin        295 non-null    object 
 10  Embarked     1309 non-null   object 
 11  Title        1309 non-null   object 
dtypes: float64(1), object(11)
memory usage: 122.8+ KB


# 사용할 데이터들 매핑하기

In [389]:
# 연령대 별로 나누기
for data in [new_all_data]:
    data.loc[data['Age'] <= 10, 'Age'] = 0,
    data.loc[(data['Age'] > 10) & (data['Age'] <= 20), 'Age'] = 1,
    data.loc[(data['Age'] > 20) & (data['Age'] <= 30), 'Age'] = 2,
    data.loc[(data['Age'] > 30) & (data['Age'] <= 40), 'Age'] = 3,
    data.loc[(data['Age'] > 40) & (data['Age'] <= 50), 'Age'] = 4,
    data.loc[(data['Age'] > 50) & (data['Age'] <= 60), 'Age'] = 5,
    data.loc[(data['Age'] > 60) & (data['Age'] <= 70), 'Age'] = 6,
    data.loc[data['Age'] > 70, 'Age'] = 7

In [390]:
# 클래스별 Fare의 평균값을 이용해서 경계만들어서 매핑
p1 = new_all_data[new_all_data['Pclass']==1]
p2 = new_all_data[new_all_data['Pclass']==2]
p3 = new_all_data[new_all_data['Pclass']==3]
p1_mean = p1['Fare'].mean()
p2_mean = p2['Fare'].mean()
p3_mean = p3['Fare'].mean()
r1 = (p2_mean - p3_mean) / 2
r2 = (p1_mean - p2_mean) / 2

In [391]:
for data in [new_all_data]:
    data.loc[data['Fare'] <= p3_mean+r1, 'Fare'] = 0,
    data.loc[(data['Fare'] > p3_mean+r1) & (data['Fare'] <= p2_mean+r2), 'Fare'] = 1,
    data.loc[data['Fare'] > p2_mean+r2, 'Fare'] = 2

In [392]:
# 성별 매핑
for data in [new_all_data]:
    data.loc[data['Sex'] == 'male', 'Sex'] = 0,
    data.loc[data['Sex'] == 'female', 'Sex'] = 1

In [393]:
# Embarked 매핑
for data in [new_all_data]:
    data.loc[data['Embarked'] == 'S', 'Embarked'] = 0,
    data.loc[data['Embarked'] == 'C', 'Embarked'] = 1,
    data.loc[data['Embarked'] == 'Q', 'Embarked'] = 2

In [394]:
# 타이틀을 매핑, 개수가 작은 Title들은 나머지와 묶어서 처리
for data in [new_all_data]:
    data.loc[data['Title'] == 'Mr', 'Title'] = 0,
    data.loc[data['Title'] == 'Miss', 'Title'] = 1,
    data.loc[data['Title'] == 'Mrs', 'Title'] = 2
    data.loc[data['Title'] == 'Master', 'Title'] = 3,
    data.loc[data['Title'] == 'Dr', 'Title'] = 4,
    data.loc[data['Title'] == 'Rev', 'Title'] = 4,
    data.loc[data['Title'] == 'Col', 'Title'] = 4,
    data.loc[data['Title'] == 'Major', 'Title'] = 4,
    data.loc[data['Title'] == 'Sir', 'Title'] = 4,
    data.loc[data['Title'] == 'Countess', 'Title'] = 4,
    data.loc[data['Title'] == 'Don', 'Title'] = 4,
    data.loc[data['Title'] == 'Jonkheer', 'Title'] = 4,
    data.loc[data['Title'] == 'Lady', 'Title'] = 4,
    data.loc[data['Title'] == 'Ms', 'Title'] = 4,
    data.loc[data['Title'] == 'Capt', 'Title'] = 4,
    data.loc[data['Title'] == 'Mme', 'Title'] = 4
    data.loc[data['Title'] == 'Mlle', 'Title'] = 4
    data.loc[data['Title'] == 'Dona', 'Title'] = 4

In [395]:
new_all_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,3,"Braund, Mr. Owen Harris",0,2,1,0,A/5 21171,0.0,,0,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,3,1,0,PC 17599,2.0,C85,1,2
2,3,3,"Heikkinen, Miss. Laina",1,2,0,0,STON/O2. 3101282,0.0,,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,3,1,0,113803,1.0,C123,0,2
4,5,3,"Allen, Mr. William Henry",0,3,0,0,373450,0.0,,0,0


In [396]:
# 매핑을 마쳤으니 나머지 못쓸것같은 데이터들은 다 버린다.
drop_list = ['Ticket', 'SibSp', 'Parch', 'Name', 'Cabin', 'PassengerId']

In [397]:
final_train_data = new_all_data[new_all_data['PassengerId']<=891]
final_test_data = new_all_data[new_all_data['PassengerId']>891]

In [398]:
final_train_data = final_train_data.drop(drop_list, axis=1)
final_test_data = final_test_data.drop(drop_list, axis=1)

In [399]:
final_train_data.info()
final_test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    object 
 1   Sex       891 non-null    object 
 2   Age       891 non-null    object 
 3   Fare      891 non-null    float64
 4   Embarked  891 non-null    object 
 5   Title     891 non-null    object 
dtypes: float64(1), object(5)
memory usage: 48.7+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 891 to 1308
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    object 
 1   Sex       418 non-null    object 
 2   Age       418 non-null    object 
 3   Fare      418 non-null    float64
 4   Embarked  418 non-null    object 
 5   Title     418 non-null    object 
dtypes: float64(1), object(5)
memory usage: 22.9+ KB


In [400]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [401]:
label = train_data['Survived']

In [402]:
# knn은 hyperparameter를 바꿔가며 해봤으니 그저그런 서능
knn_clf = KNeighborsClassifier(n_neighbors = 8)
score = cross_val_score(knn_clf, final_train_data, label, cv=5, scoring='accuracy')
print(score)

[0.81005587 0.78651685 0.76404494 0.78651685 0.80898876]


KNeighborsClassifier(n_neighbors=8)

In [403]:
# estimator의 개수가 60개를 넘어가면서 점차 성능이 감소
rf_clf = RandomForestClassifier(n_estimators=60)
score = cross_val_score(rf_clf, final_train_data, label, cv=5, scoring='accuracy')
print(score)

[0.81564246 0.78651685 0.82022472 0.7752809  0.83146067]


RandomForestClassifier(n_estimators=60)

In [415]:
# C가 1~300까지 어느정도 일정하게 유지되다가 그이상 더 커지거나 작아지면 내려감
# C가 40 즈음에서 가장 좋은 성적
svm_clf = SVC(C=40)
score = cross_val_score(svm_clf, final_train_data, label, cv=5, scoring='accuracy')
print(score)

[0.83798883 0.80898876 0.83707865 0.78089888 0.86516854]


# 최종으로 사용할 모델 SVM!

In [411]:
svm_clf.fit(final_train_data, label)

SVC(C=40)

In [412]:
predictions = svm_clf.predict(final_test_data)

In [413]:
submission = pd.DataFrame({"PassengerId" : test_data['PassengerId'],
                          "Survived" : predictions})
submission.to_csv('submission.csv', index=False)

In [414]:
submission = pd.read_csv('submission.csv')
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [408]:
def makeTitle(data):
    data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand = False)

In [409]:
def TrainAndTestModel(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    score = model.score(predictions)
    f1_score(y_test, predictions)

In [410]:
def makeTitle(data):
    data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand = False)