In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('titanic_dataset.csv')
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Cleaning data

In [3]:
embarked_missing = data[data['Embarked'].isna()].index
embarked_missing

Index([61, 829], dtype='int64')

In [4]:
embarked_input = data['Embarked'].mode()[0]
embarked_input

'S'

In [5]:
data['Embarked'] = data['Embarked'].fillna(embarked_input)
data['Embarked']

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

Dropping unused features

In [6]:
features = ['PassengerId', 'Name', 'Cabin', 'Fare']
newData = data.drop(features, axis=1)
newData

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Embarked
0,0,3,male,22.0,1,0,A/5 21171,S
1,1,1,female,38.0,1,0,PC 17599,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,S
3,1,1,female,35.0,1,0,113803,S
4,0,3,male,35.0,0,0,373450,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,211536,S
887,1,1,female,19.0,0,0,112053,S
888,0,3,female,,1,2,W./C. 6607,S
889,1,1,male,26.0,0,0,111369,C


In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
newData['Sex'] = le.fit_transform(newData['Sex'])
newData

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Embarked
0,0,3,1,22.0,1,0,A/5 21171,S
1,1,1,0,38.0,1,0,PC 17599,C
2,1,3,0,26.0,0,0,STON/O2. 3101282,S
3,1,1,0,35.0,1,0,113803,S
4,0,3,1,35.0,0,0,373450,S
...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,211536,S
887,1,1,0,19.0,0,0,112053,S
888,0,3,0,,1,2,W./C. 6607,S
889,1,1,1,26.0,0,0,111369,C


In [8]:
newData.drop('Ticket', axis=1, inplace=True)

In [9]:
embarked = newData.Embarked
newData.drop('Embarked', axis=1, inplace=True)

Imputer untuk missing value Age

In [10]:
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

In [11]:
from sklearn.ensemble import RandomForestRegressor

impute = IterativeImputer(max_iter=5)
newData = impute.fit_transform(newData)

In [12]:
newData

array([[ 0.        ,  3.        ,  1.        , 22.        ,  1.        ,
         0.        ],
       [ 1.        ,  1.        ,  0.        , 38.        ,  1.        ,
         0.        ],
       [ 1.        ,  3.        ,  0.        , 26.        ,  0.        ,
         0.        ],
       ...,
       [ 0.        ,  3.        ,  0.        , 23.24638617,  1.        ,
         2.        ],
       [ 1.        ,  1.        ,  1.        , 26.        ,  0.        ,
         0.        ],
       [ 0.        ,  3.        ,  1.        , 32.        ,  0.        ,
         0.        ]])

In [13]:
print(type(newData))

<class 'numpy.ndarray'>


In [14]:
data = pd.DataFrame(newData, columns=['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch'])
data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0.0,3.0,1.0,22.000000,1.0,0.0
1,1.0,1.0,0.0,38.000000,1.0,0.0
2,1.0,3.0,0.0,26.000000,0.0,0.0
3,1.0,1.0,0.0,35.000000,1.0,0.0
4,0.0,3.0,1.0,35.000000,0.0,0.0
...,...,...,...,...,...,...
886,0.0,2.0,1.0,27.000000,0.0,0.0
887,1.0,1.0,0.0,19.000000,0.0,0.0
888,0.0,3.0,0.0,23.246386,1.0,2.0
889,1.0,1.0,1.0,26.000000,0.0,0.0


In [15]:
data = data.join(embarked)
data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0.0,3.0,1.0,22.000000,1.0,0.0,S
1,1.0,1.0,0.0,38.000000,1.0,0.0,C
2,1.0,3.0,0.0,26.000000,0.0,0.0,S
3,1.0,1.0,0.0,35.000000,1.0,0.0,S
4,0.0,3.0,1.0,35.000000,0.0,0.0,S
...,...,...,...,...,...,...,...
886,0.0,2.0,1.0,27.000000,0.0,0.0,S
887,1.0,1.0,0.0,19.000000,0.0,0.0,S
888,0.0,3.0,0.0,23.246386,1.0,2.0,S
889,1.0,1.0,1.0,26.000000,0.0,0.0,C


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    891 non-null    float64
 2   Sex       891 non-null    float64
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    float64
 5   Parch     891 non-null    float64
 6   Embarked  891 non-null    object 
dtypes: float64(6), object(1)
memory usage: 48.9+ KB


In [17]:
data.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.322727,0.523008,0.381594
std,0.486592,0.836071,0.47799,13.688843,1.102743,0.806057
min,0.0,1.0,0.0,-5.076426,0.0,0.0
25%,0.0,2.0,0.0,21.0,0.0,0.0
50%,0.0,3.0,1.0,29.227487,0.0,0.0
75%,1.0,3.0,1.0,36.576223,1.0,0.0
max,1.0,3.0,1.0,80.0,8.0,6.0


In [18]:
minus = data[data['Age'] < 0].index
minus

Index([159, 180, 201, 324, 792, 846, 863], dtype='int64')

In [19]:
data = data.drop(minus, axis=0)
data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0.0,3.0,1.0,22.000000,1.0,0.0,S
1,1.0,1.0,0.0,38.000000,1.0,0.0,C
2,1.0,3.0,0.0,26.000000,0.0,0.0,S
3,1.0,1.0,0.0,35.000000,1.0,0.0,S
4,0.0,3.0,1.0,35.000000,0.0,0.0,S
...,...,...,...,...,...,...,...
886,0.0,2.0,1.0,27.000000,0.0,0.0,S
887,1.0,1.0,0.0,19.000000,0.0,0.0,S
888,0.0,3.0,0.0,23.246386,1.0,2.0,S
889,1.0,1.0,1.0,26.000000,0.0,0.0,C


In [20]:
data.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
count,884.0,884.0,884.0,884.0,884.0,884.0
mean,0.386878,2.303167,0.64819,29.595018,0.463801,0.368778
std,0.487311,0.837101,0.477805,13.394849,0.8826,0.79621
min,0.0,1.0,0.0,0.42,0.0,0.0
25%,0.0,2.0,0.0,21.785934,0.0,0.0
50%,0.0,3.0,1.0,29.227487,0.0,0.0
75%,1.0,3.0,1.0,36.576223,1.0,0.0
max,1.0,3.0,1.0,80.0,5.0,6.0


In [21]:
data['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [22]:
data['Embarked'] = le.fit_transform(data['Embarked'])
data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0.0,3.0,1.0,22.000000,1.0,0.0,2
1,1.0,1.0,0.0,38.000000,1.0,0.0,0
2,1.0,3.0,0.0,26.000000,0.0,0.0,2
3,1.0,1.0,0.0,35.000000,1.0,0.0,2
4,0.0,3.0,1.0,35.000000,0.0,0.0,2
...,...,...,...,...,...,...,...
886,0.0,2.0,1.0,27.000000,0.0,0.0,2
887,1.0,1.0,0.0,19.000000,0.0,0.0,2
888,0.0,3.0,0.0,23.246386,1.0,2.0,2
889,1.0,1.0,1.0,26.000000,0.0,0.0,0


Splitting dataset

In [23]:
X = data.drop('Survived', axis=1)
y = data['Survived']

Defining function for classification algorrithms

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split

In [25]:
def classification(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    knn = KNeighborsClassifier()
    dt = DecisionTreeClassifier()
    rf = RandomForestClassifier()
    svc = SVC()
    lr = LogisticRegression()
    
    accuracy = []
    precision = []
    recall = []
    f1 = []
    
    algo = [knn, dt, rf, svc, lr]
    algo_names = ['KNN', 'SVC', 'DecisionTree', 'RandomForest', 'LogisticRegression']
    columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
    
    result = pd.DataFrame(columns=columns, index=algo_names)
    
    for al in algo:
        predict = al.fit(X_train, y_train).predict(X_test)
        
        accuracy.append(accuracy_score(y_test, predict))
        precision.append(precision_score(y_test, predict))
        recall.append(recall_score(y_test, predict, average='weighted'))
        f1.append(f1_score(y_test, predict, average='weighted'))
        
    result['Accuracy'] = accuracy
    result['Precision'] = precision
    result['Recall'] = recall
    result['F1 Score'] = f1
    
    return result.sort_values('F1 Score', ascending=False)

In [26]:
classification(X, y)

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
DecisionTree,0.838346,0.848485,0.838346,0.836639
SVC,0.827068,0.823529,0.827068,0.825746
LogisticRegression,0.827068,0.836735,0.827068,0.825063
KNN,0.793233,0.776699,0.793233,0.791841
RandomForest,0.590226,0.6,0.590226,0.482809


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
classification(X_train, y_train)

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
KNN,0.795699,0.807018,0.795699,0.78988
DecisionTree,0.768817,0.727273,0.768817,0.766469
LogisticRegression,0.768817,0.734375,0.768817,0.765651
SVC,0.763441,0.704225,0.763441,0.762836
RandomForest,0.634409,0.777778,0.634409,0.53208


In [28]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)

In [29]:
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

In [30]:
print('Accuracy score from GradientBoostingClassifier:', accuracy)

Accuracy score from GradientBoostingClassifier: 0.8421052631578947
