# Train the models

In [None]:
# Import our libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import display # Allows the use of display() for DataFrames
%matplotlib inline


In [None]:
# Seed Python's built-in `random` module so that anything drawing from it is
# repeatable. NOTE: this call seeds only the standard-library generator.
import random
random.seed(42) # do not change this value


In [None]:
# Load the Titanic passenger data into a DataFrame.
# NOTE(review): the rendered head() output shows an 'Unnamed: 0' column, which
# suggests the CSV carries a saved row index; as written that column stays in
# the frame and ends up among the model features — confirm whether
# `index_col=0` is wanted here.
full_data = pd.read_csv('titanic_data.csv') # always use the CSV file; do not convert it to another file format

# Show the first few rows of the RMS Titanic data
display(full_data.head())


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**欄位敘述**

1. Survived - Survival (0 = No; 1 = Yes). Not included in test.csv file.
2. Pclass - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
3. Name - Name
4. Sex - Sex
5. Age - Age
6. SibSp - Number of Siblings/Spouses Aboard
7. Parch - Number of Parents/Children Aboard
8. Ticket - Ticket Number
9. Fare - Passenger Fare
10. Cabin - Cabin
11. Embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

In [None]:
# Split the prediction target off from the predictor columns.
outcomes = full_data['Survived'] # prediction target
features_raw = full_data.drop(columns='Survived')

# Drop the 'Name' column before encoding.
features_no_name = features_raw.drop(columns=['Name'])

# One-hot encode with pd.get_dummies, then replace every remaining missing
# value with 0.0 to obtain the processed feature matrix.
features = pd.get_dummies(features_no_name).fillna(0.0)


In [None]:
# TODO
# Hold out 20% of the rows as the test set, leaving 80% as the training set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, outcomes, test_size=0.2, random_state=42) # do not modify random_state

# Sanity check: show the shapes of the training features and training labels.
print(X_train.shape, y_train.shape)


(712, 839) (712,)


In [None]:
# Import the classifier from sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC


In [None]:
# Instantiate the six classifiers that are compared in this notebook.
# Every estimator that is given a random_state here uses 42.
dt_mod = DecisionTreeClassifier(random_state=42)
naive_bayes = MultinomialNB()
bag_mod = BaggingClassifier(n_estimators=200, random_state=42)
rf_mod = RandomForestClassifier(n_estimators=200, random_state=42)
ada_mod = AdaBoostClassifier(n_estimators=300, learning_rate=0.2, random_state=42)
svm_mod = SVC(random_state=42)

# Do not modify the parameters above


In [None]:
# TODO
# Fit the models using the training set.
# (The original note said "fit linear model"; the estimators fitted here are a
# decision tree, naive Bayes, bagging, random forest, AdaBoost and an SVC.)
# Fit each of the 6 models
dt_mod.fit(X_train, y_train)
naive_bayes.fit(X_train, y_train)
bag_mod.fit(X_train, y_train)
rf_mod.fit(X_train, y_train)
ada_mod.fit(X_train, y_train)
svm_mod.fit(X_train, y_train)


SVC(random_state=42)

# Test the models

In [None]:
# TODO
# Predict on the test set.
# Make predictions using each of your models; every result is stored under a
# `preds_<model>` name for the scoring cells that follow.
preds_dt = dt_mod.predict(X_test)
preds_nb = naive_bayes.predict(X_test)
preds_bag = bag_mod.predict(X_test)
preds_rf = rf_mod.predict(X_test)
preds_ada = ada_mod.predict(X_test)
preds_svm = svm_mod.predict(X_test)


請參考下方程式，完成accuracy, precision, recall, f1的定義

* (preds==0).sum()
* (preds==1).sum()
* (actual==0).sum()
* (actual==1).sum()
* np.sum(preds == actual)/len(actual)
* len(np.intersect1d(np.where(preds==0), np.where(actual==0)))
* len(np.intersect1d(np.where(preds==0), np.where(actual==1)))
* len(np.intersect1d(np.where(preds==1), np.where(actual==0)))
* len(np.intersect1d(np.where(preds==1), np.where(actual==1)))


In [None]:
# TODO
def accuracy(actual, preds):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are converted to flat NumPy arrays and compared position by
    position, so Python lists, NumPy arrays and pandas Series (whatever their
    index) are all handled the same way.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    preds : array-like
        Predicted labels, in the same order as ``actual``.

    Returns
    -------
    float
        Number of matching positions divided by the number of samples;
        ``nan`` when the inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # Without this coercion, `list == list` collapses to a single bool and the
    # score would silently be 0 or 1/len(actual).
    actual = np.asarray(actual).ravel()
    preds = np.asarray(preds).ravel()
    if actual.size != preds.size:
        raise ValueError("actual and preds must have the same number of elements")
    if actual.size == 0:
        return float('nan')

    return np.sum(preds == actual) / len(actual)


In [None]:
# TODO
# Of the samples predicted positive, the proportion that are truly positive.
def precision(actual, preds):
    """Return the precision of binary predictions, treating label 1 as positive.

    Precision is ``TP / (number of samples predicted as 1)``. Inputs are
    converted to flat NumPy arrays and compared position by position.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    preds : array-like
        Predicted labels, in the same order as ``actual``.

    Returns
    -------
    float
        The precision, or ``0.0`` when nothing was predicted positive (instead
        of dividing by zero).

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    actual = np.asarray(actual).ravel()
    preds = np.asarray(preds).ravel()
    if actual.size != preds.size:
        raise ValueError("actual and preds must have the same number of elements")

    predicted_pos = np.sum(preds == 1)
    if predicted_pos == 0:
        # No positive predictions: the ratio is undefined, report 0.0.
        return 0.0

    true_pos = np.sum((preds == 1) & (actual == 1))
    return true_pos / predicted_pos


In [None]:
# TODO
# Of the truly positive samples, the proportion that were predicted positive.
def recall(actual, preds):
    """Return the recall of binary predictions, treating label 1 as positive.

    Recall is ``TP / (number of samples whose true label is 1)``. Inputs are
    converted to flat NumPy arrays and compared position by position.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    preds : array-like
        Predicted labels, in the same order as ``actual``.

    Returns
    -------
    float
        The recall, or ``0.0`` when ``actual`` contains no positive sample
        (instead of raising ``ZeroDivisionError``).

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    actual = np.asarray(actual).ravel()
    preds = np.asarray(preds).ravel()
    if actual.size != preds.size:
        raise ValueError("actual and preds must have the same number of elements")

    # Count the actual positives directly rather than as TP + (preds == 0 and
    # actual == 1), so a positive sample is never dropped from the denominator.
    actual_pos = np.sum(actual == 1)
    if actual_pos == 0:
        return 0.0

    true_pos = np.sum((preds == 1) & (actual == 1))
    return true_pos / actual_pos


In [None]:
# TODO
# Harmonic mean of precision and recall, which simplifies to
# "twice their product divided by their sum".
def f1(preds, actual):
    """Return the F1 score of binary predictions, treating label 1 as positive.

    Note the parameter order: ``preds`` comes first here. Swapping the two
    arguments only swaps the precision and recall terms, and the harmonic
    mean of the two is the same either way, so the result does not depend on
    which array is passed first.

    Parameters
    ----------
    preds : array-like
        Predicted labels.
    actual : array-like
        Ground-truth labels, in the same order as ``preds``.

    Returns
    -------
    float
        ``2 * (precision * recall) / (precision + recall)``, or ``0.0`` when
        there is no true positive (which also covers the cases of no predicted
        or no actual positives, where the ratio would divide by zero).

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    preds = np.asarray(preds).ravel()
    actual = np.asarray(actual).ravel()
    if preds.size != actual.size:
        raise ValueError("preds and actual must have the same number of elements")

    true_pos = np.sum((preds == 1) & (actual == 1))
    if true_pos == 0:
        return 0.0

    predicted_pos = np.sum(preds == 1)
    actual_pos = np.sum(actual == 1)

    # Local names avoid shadowing the module-level `recall` function.
    prec = true_pos / predicted_pos
    rec = true_pos / actual_pos

    return 2*((prec*rec)/(prec+rec))


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def print_metrics(y_true, preds):
    """Print the hand-written metrics next to scikit-learn's reference values.

    For accuracy, precision, recall and F1 this prints the value from the
    notebook's own function followed by the value from the matching
    ``sklearn.metrics`` function, so the two can be compared line by line.
    A blank separator is printed at the end.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    preds : array-like
        Predicted labels, in the same order as ``y_true``.
    """
    print('Accuracy score: ', accuracy(y_true, preds))
    print('True Accuracy: ', accuracy_score(y_true, preds))
    print('Precision score: ', precision(y_true, preds))
    print('True Precision: ', precision_score(y_true, preds))
    print('Recall score: ', recall(y_true, preds))
    print('True Recall: ', recall_score(y_true, preds))
    # f1 declares its parameters as (preds, actual); pass by keyword so the
    # call does not rely on F1 being symmetric in its arguments.
    print('F1 score: ', f1(preds=preds, actual=y_true))
    print('True F1: ', f1_score(y_true, preds))
    print('\n')


In [None]:
# TODO
# Print scores: for each model's test-set predictions, report the hand-written
# metrics alongside scikit-learn's values (order: decision tree, naive Bayes,
# bagging, random forest, AdaBoost, SVC).
print_metrics(y_test, preds_dt)
print_metrics(y_test, preds_nb)
print_metrics(y_test, preds_bag)
print_metrics(y_test, preds_rf)
print_metrics(y_test, preds_ada)
print_metrics(y_test, preds_svm)



Accuracy score:  0.8156424581005587
True Accuracy:  0.8156424581005587
Precision score:  0.8059701492537313
True Precesion:  0.8059701492537313
Recall score:  0.7297297297297297
True Recall:  0.7297297297297297
F1 score:  0.7659574468085106
True F1:  0.7659574468085106


Accuracy score:  0.6927374301675978
True Accuracy:  0.6927374301675978
Precision score:  0.6461538461538462
True Precesion:  0.6461538461538462
Recall score:  0.5675675675675675
True Recall:  0.5675675675675675
F1 score:  0.6043165467625901
True F1:  0.6043165467625901


Accuracy score:  0.8100558659217877
True Accuracy:  0.8100558659217877
Precision score:  0.8225806451612904
True Precesion:  0.8225806451612904
Recall score:  0.6891891891891891
True Recall:  0.6891891891891891
F1 score:  0.75
True F1:  0.75


Accuracy score:  0.8212290502793296
True Accuracy:  0.8212290502793296
Precision score:  0.85
True Precesion:  0.85
Recall score:  0.6891891891891891
True Recall:  0.6891891891891891
F1 score:  0.7611940298507462