In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Date PreProcessing fucntion

In [3]:
# Null 처리
def fillna(df) :
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    # df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna(method='bfill', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

# 머신러닝 알고리즘에 불필요한 속성 제거
def drop_feature(df):
    df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
    return df

# 레이블 인코딩 수행.
# 문자열 데이터 -> 숫자
from sklearn import preprocessing
def format_feature(df):
    # df['Cabin'] = df['Cabin'].str[:1]
    features = ['Sex', 'Embarked']
    for feature in features :
        le = preprocessing.LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

# 앞에서 설정한 데이터 전처리 함수 호출
def transform_feature(df):
    df = fillna(df)
    df = drop_feature(df)
    df = format_feature(df)
    return df

# Data Load

In [4]:
titanic_df = pd.read_csv(r'C:\jupyter_home\ML\titanic\train.csv')
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
titanic_df = pd.read_csv(r'C:\jupyter_home\ML\titanic\train.csv')
# test_titanic_df = pd.read_csv(r'C:\jupyter_home\ML\titanic\test.csv')
y_titanic_df = titanic_df['Survived']
x_titanic_df = titanic_df.drop('Survived', axis=1)

# 데이터 전처리
x_titanic_df = transform_feature(x_titanic_df)

# 테스트 데이터 셋 분리
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_titanic_df, y_titanic_df, test_size=0.2, random_state=11)

# Desicion Tree Clasifier

In [6]:
dt_clf = DecisionTreeClassifier(random_state=11)

dt_clf.fit(x_train, y_train)
dt_pred = dt_clf.predict(x_test)
print('Decision Tree Classifier 정확도 : {0:.4f}'.format(accuracy_score(y_test, dt_pred)))

Decision Tree Classifier 정확도 : 0.7821


# Random Forest Classifier

In [7]:
rf_clf = RandomForestClassifier(random_state=11)

rf_clf.fit(x_train, y_train)
rf_pred = rf_clf.predict(x_test)
print('Random Forest Classifier 정확도 : {0:.4f}'.format(accuracy_score(y_test, rf_pred)))

Random Forest Classifier 정확도 : 0.8436


# Logistic Regression

In [8]:
lr_clf = LogisticRegression(max_iter=200)

lr_clf.fit(x_train, y_train)
lr_pred = lr_clf.predict(x_test)
print('Logistic Regression 정확도 : {0:.4f}'.format(accuracy_score(y_test, lr_pred)))

Logistic Regression 정확도 : 0.8659


# KFOLD

In [9]:
from sklearn.model_selection import KFold

def exec_kfold(clf, folds=5):
    kfold = KFold(n_splits=folds)
    scores = []

    # KFold 교차 검증 수행
    for iter_count, (train_index, test_index) in enumerate(kfold.split(x_titanic_df)):
        # X_titanic_df 데이터에서 교차 검증별로 학습과 검증데이터를 가리키는 index 생성
        x_train, x_test = x_titanic_df.values[train_index], x_titanic_df.values[test_index]
        y_train, y_test = y_titanic_df.values[train_index], y_titanic_df.values[test_index]

        #학습, 예측, 정확도 계산
        clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        acc = accuracy_score(y_test, pred)
        scores.append(acc)
        print('교차 검증 {0} 정확도: {1:.4f}'.format(iter_count, acc))

    # 평균 정확도
    mean_score = np.mean(scores)
    print('평균 정확도 : {0:.4f}'.format(mean_score))

In [10]:
print('Decision Tree')
exec_kfold(dt_clf, folds=5)
print('Random Forest')
exec_kfold(rf_clf)
print('Logistic Regression')
exec_kfold(lr_clf)

Decision Tree
교차 검증 0 정확도: 0.7318
교차 검증 1 정확도: 0.7753
교차 검증 2 정확도: 0.7978
교차 검증 3 정확도: 0.7697
교차 검증 4 정확도: 0.7978
평균 정확도 : 0.7745
Random Forest
교차 검증 0 정확도: 0.7430
교차 검증 1 정확도: 0.8034
교차 검증 2 정확도: 0.8539
교차 검증 3 정확도: 0.7809
교차 검증 4 정확도: 0.8427
평균 정확도 : 0.8048
Logistic Regression
교차 검증 0 정확도: 0.7877
교차 검증 1 정확도: 0.7865
교차 검증 2 정확도: 0.7753
교차 검증 3 정확도: 0.7528
교차 검증 4 정확도: 0.8371
평균 정확도 : 0.7879


### DataSet이 작아서 교차검증이 효과적이지 않다

# GBM

In [11]:
from sklearn.ensemble import GradientBoostingClassifier
import time
start_time = time.time()

gb_clf = GradientBoostingClassifier(random_state=0)
gb_clf.fit(x_train, y_train)
gb_pred = gb_clf.predict(x_test)
gb_acc = accuracy_score(y_test, gb_pred)

print('GBM 정확도 : {0:.4f}'.format(gb_acc))
print('GBM 수행시간 : {0:.1f}초'.format(time.time()-start_time))

GBM 정확도 : 0.8715
GBM 수행시간 : 0.1초


# GridSearchCv

In [12]:
from sklearn.model_selection import GridSearchCV

# Decision Tree
parameters = {
    'max_depth' : [2, 3, 5, 10],
    'min_samples_split':[2, 3, 5],
    'min_samples_leaf': [1, 5, 8]
}

grid_dclf = GridSearchCV(dt_clf , param_grid=parameters , scoring='accuracy' , cv=5)
grid_dclf.fit(x_train , y_train)
print('Decision Tree')
print('GridSearchCV 최적 하이퍼 파라미터 :',grid_dclf.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_dclf.best_score_))
best_dclf = grid_dclf.best_estimator_

# GridSearchCV의 최적 하이퍼파라미터로 학습된 Estimator로 예측 및 평가 수행
dpredictions = best_dclf.predict(x_test)
accuracy = accuracy_score(y_test , dpredictions)
print('테스트 세트에서의 DecisionTreeClassifier 정확도 : {0:.4f}'.format(accuracy))

Decision Tree
GridSearchCV 최적 하이퍼 파라미터 : {'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 2}
GridSearchCV 최고 정확도: 0.8062
테스트 세트에서의 DecisionTreeClassifier 정확도 : 0.8771


# 최고 정확도 - GridSearch CV
# 87.71%

---

# Data 전처리 추가
- Age Categroy
- min_max Scaler
- drop Fare columns

In [13]:
titanic_df = pd.read_csv(r'C:\jupyter_home\ML\titanic\train.csv')
y_titanic_df = titanic_df['Survived']
x_titanic_df = titanic_df.drop('Survived', axis=1)

# 데이터 전처리
x_titanic_df = transform_feature(x_titanic_df)

# 나이 카테고리화
bin_divider = [0,5,12,18,25,35,60,90]
bin_name =  ['Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Elderly']
x_titanic_df['age_cat'] = pd.cut(x=x_titanic_df['Age'],
                                 bins=bin_divider,
                                 labels=bin_name,
                                 include_lowest=True)

le = preprocessing.LabelEncoder()
le = le.fit(x_titanic_df['age_cat'])
x_titanic_df['age_cat'] = le.transform(x_titanic_df['age_cat'])
x_titanic_df.drop(['Age'], axis=1, inplace=True)


# 스케일링
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_titanic_df)
x_titanic_df_scaled = scaler.transform(x_titanic_df)

# transform시 스케일 변환된 데이터 셋이 ndarray로 변환되어 다시 dataframe으로 변환.
x_titanic_df = pd.DataFrame(data=x_titanic_df_scaled,
                                 columns=['Pclass', 'Sex', 'Sibsp', 'Parch', 'Fare', 'Embarked', 'age_cat'])
y_titanic_df = y_titanic_df.astype('float64')

x_titanic_df

Unnamed: 0,Pclass,Sex,Sibsp,Parch,Fare,Embarked,age_cat
0,1.0,1.0,0.125,0.000000,0.014151,1.0,0.666667
1,0.0,0.0,0.125,0.000000,0.139136,0.0,0.000000
2,1.0,0.0,0.000,0.000000,0.015469,1.0,1.000000
3,0.0,0.0,0.125,0.000000,0.103644,1.0,1.000000
4,1.0,1.0,0.000,0.000000,0.015713,1.0,1.000000
...,...,...,...,...,...,...,...
886,0.5,1.0,0.000,0.000000,0.025374,1.0,1.000000
887,0.0,0.0,0.000,0.000000,0.058556,1.0,0.666667
888,1.0,0.0,0.125,0.333333,0.045771,1.0,1.000000
889,0.0,1.0,0.000,0.000000,0.058556,0.0,1.000000


In [14]:
x_titanic_df.drop(['Fare'], axis=1, inplace=True)

In [15]:
# 테스트 데이터 셋 분리
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_titanic_df, y_titanic_df, test_size=0.2, random_state=11)

In [16]:
dt_clf = DecisionTreeClassifier(random_state=11)

dt_clf.fit(x_train, y_train)
dt_pred = dt_clf.predict(x_test)
print('Decision Tree Classifier 정확도 : {0:.4f}'.format(accuracy_score(y_test, dt_pred)))

Decision Tree Classifier 정확도 : 0.8547


In [17]:
rf_clf = RandomForestClassifier(random_state=11)

rf_clf.fit(x_train, y_train)
rf_pred = rf_clf.predict(x_test)
print('Random Forest Classifier 정확도 : {0:.4f}'.format(accuracy_score(y_test, rf_pred)))

Random Forest Classifier 정확도 : 0.8715


In [18]:
lr_clf = LogisticRegression(max_iter=200)

lr_clf.fit(x_train, y_train)
lr_pred = lr_clf.predict(x_test)
print('Logistic Regression 정확도 : {0:.4f}'.format(accuracy_score(y_test, lr_pred)))

Logistic Regression 정확도 : 0.8436


In [19]:
from sklearn.model_selection import KFold

def exec_kfold(clf, folds=5):
    kfold = KFold(n_splits=folds)
    scores = []

    # KFold 교차 검증 수행
    for iter_count, (train_index, test_index) in enumerate(kfold.split(x_titanic_df)):
        # X_titanic_df 데이터에서 교차 검증별로 학습과 검증데이터를 가리키는 index 생성
        x_train, x_test = x_titanic_df.values[train_index], x_titanic_df.values[test_index]
        y_train, y_test = y_titanic_df.values[train_index], y_titanic_df.values[test_index]

        #학습, 예측, 정확도 계산
        clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        acc = accuracy_score(y_test, pred)
        scores.append(acc)
        print('교차 검증 {0} 정확도: {1:.4f}'.format(iter_count, acc))

    # 평균 정확도
    mean_score = np.mean(scores)
    print('평균 정확도 : {0:.4f}'.format(mean_score))

In [20]:
print('Decision Tree')
exec_kfold(dt_clf, folds=5)
print('Random Forest')
exec_kfold(rf_clf)
print('Logistic Regression')
exec_kfold(lr_clf)

Decision Tree
교차 검증 0 정확도: 0.7486
교차 검증 1 정확도: 0.7865
교차 검증 2 정확도: 0.7978
교차 검증 3 정확도: 0.7584
교차 검증 4 정확도: 0.8090
평균 정확도 : 0.7801
Random Forest
교차 검증 0 정확도: 0.7765
교차 검증 1 정확도: 0.7921
교차 검증 2 정확도: 0.7978
교차 검증 3 정확도: 0.7697
교차 검증 4 정확도: 0.8483
평균 정확도 : 0.7969
Logistic Regression
교차 검증 0 정확도: 0.7654
교차 검증 1 정확도: 0.8034
교차 검증 2 정확도: 0.7640
교차 검증 3 정확도: 0.7472
교차 검증 4 정확도: 0.8258
평균 정확도 : 0.7812


# GBM

In [21]:
from sklearn.ensemble import GradientBoostingClassifier
import time
start_time = time.time()

gb_clf = GradientBoostingClassifier(random_state=0)
gb_clf.fit(x_train, y_train)
gb_pred = gb_clf.predict(x_test)
gb_acc = accuracy_score(y_test, gb_pred)

print('GBM 정확도 : {0:.4f}'.format(gb_acc))
print('GBM 수행시간 : {0:.1f}초'.format(time.time()-start_time))

GBM 정확도 : 0.8659
GBM 수행시간 : 0.1초


In [22]:
from sklearn.model_selection import GridSearchCV

# Decision Tree
parameters = {
    'max_depth' : [2, 3, 5, 10],
    'min_samples_split':[2, 3, 5],
    'min_samples_leaf': [1, 5, 8]
}

grid_dclf = GridSearchCV(dt_clf , param_grid=parameters , scoring='accuracy' , cv=5)
grid_dclf.fit(x_train , y_train)
print('Decision Tree')
print('GridSearchCV 최적 하이퍼 파라미터 :',grid_dclf.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_dclf.best_score_))
best_dclf = grid_dclf.best_estimator_

# GridSearchCV의 최적 하이퍼파라미터로 학습된 Estimator로 예측 및 평가 수행
dpredictions = best_dclf.predict(x_test)
accuracy = accuracy_score(y_test , dpredictions)
print('테스트 세트에서의 DecisionTreeClassifier 정확도 : {0:.4f}'.format(accuracy))

Decision Tree
GridSearchCV 최적 하이퍼 파라미터 : {'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 2}
GridSearchCV 최고 정확도: 0.7964
테스트 세트에서의 DecisionTreeClassifier 정확도 : 0.8380


In [23]:
grid_cv = GridSearchCV(gb_clf, param_grid=parameters, n_jobs=-1, scoring='accuracy', cv=5)
grid_cv.fit(x_train, y_train)

print('GridSearchCV 최적 하이퍼 파라미터 :',grid_cv.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_cv.best_score_))
best_cv = grid_cv.best_estimator_

# GridSearchCV의 최적 하이퍼파라미터로 학습된 Estimator로 예측 및 평가 수행
pred = best_cv.predict(x_test)
accuracy = accuracy_score(y_test , pred)
print('테스트 세트에서의 GBM 정확도 : {0:.4f}'.format(accuracy))

GridSearchCV 최적 하이퍼 파라미터 : {'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 2}
GridSearchCV 최고 정확도: 0.8076
테스트 세트에서의 GBM 정확도 : 0.8659
