In [214]:
import pandas  as pd
import numpy as np
np.random.seed(23)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

titanic_df = pd.read_csv('https://raw.githubusercontent.com/Kritsana135/Ml-Final/main/titanic.csv')

In [215]:
titanic_df.shape

(891, 12)

In [216]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data Exploration
    ทำการตรวจสอบข้อมูลก่อนว่ามีลักษณะเป็นอย่างไร มี missing value หรือเปล่า 
    ซึ่งจากการตรวจสอบพบว่ามี Columns
        - Cabin
        - Embarked
        - Age

    มีสอง column ที่ไม่ใช่ตัวเลข คือ Sex และ Embarked

In [217]:
total = titanic_df.isnull().sum().sort_values(ascending=False)
print(total)

Cabin          687
Age            177
Embarked         2
Fare             0
Ticket           0
Parch            0
SibSp            0
Sex              0
Name             0
Pclass           0
Survived         0
PassengerId      0
dtype: int64


# Data Cleaning
    จะทำการ ดรอปข้อมูลที่คิดว่าไม่เกี่ยวข้อง คือ ***PassengerId***, ***Name*** **Ticket** และ ***Cabin*** 
สำหรับ แอทริบิวต์ Cabin(ห้องโดยสาร) นั้นน่าจะมีความสอดคล้องกับแอทริบิวต์ Fare(ค่าโดยสาร) และเนื่องจาก Cabin มี missing value ที่มากเกินไปจึงเลือกใช้  แอทริบิวต์ Fare ซึ่งมี missing value น้อยกว่า

In [218]:
drop_columns = ['Cabin', 'PassengerId', 'Ticket', 'Name']
titanic_df = titanic_df.drop(drop_columns, axis=1)

หลังจากนั้นเราจะเติม missing value ใน Columns ต่อไปนี้ ด้วยข้อมูลที่เหลืออยู่ในแต่ละ Column
* Age ใช้ Mean เพราะข้อมูลเป็นแบบ Interval/Ratio (skewed เล็กน้อย)
* Embarked ใช้ Mode เพราะข้อมูลเป็นแบบ Nominal

In [219]:
# Fill empty values in age column
titanic_df['Age'].fillna(titanic_df['Age'].median(), inplace=True)
# df_test['Age'].fillna(df_test['Age'].median(), inplace=True)

# Fill empty data in embarked column
titanic_df['Embarked'].fillna(titanic_df['Embarked'].mode()[0], inplace=True)
# df_test['Embarked'].fillna(df_test['Embarked'].mode()[0], inplace=True)

เพื่อที่จะใช้ค่าที่ไม่ใช่ตัวเลขใน knn เราต้องแปลงมาเป็นตัวเลขก่อน

In [220]:
# change non-numerical value to numerical values
titanic_df['Sex'] = titanic_df['Sex'].map({'female': 0, 'male': 1}).astype(int)

embarked_dummies = pd.get_dummies(titanic_df['Embarked'] ,prefix="E")
titanic_df = pd.concat([titanic_df, embarked_dummies], axis=1)

#drop Embarked
titanic_df = titanic_df.drop(['Embarked'], axis=1)

titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,E_C,E_Q,E_S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


Scale Continuous Variables

In [221]:
from mlxtend.preprocessing import minmax_scaling
from sklearn.preprocessing import StandardScaler
def scale(X):
  # Normalized Data
  X_scaled = minmax_scaling(X, columns=X.columns)
  X_normalized = pd.DataFrame(X_scaled)
  scaler = StandardScaler()
  X_normalized = scaler.fit_transform(X_normalized)
  return X_normalized

Split Data

In [222]:
#split data to train and test
selected_feature = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'E_C', 'E_Q', 'E_S']
target = 'Survived'

y = titanic_df[target]
X = titanic_df[selected_feature]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=23)
X_scale = scale(X)
X_train_scale, X_test_scale, y_train, y_test = train_test_split(X_scale, y, test_size=0.25,random_state=23)


Train && Test (Defalut parameter)

In [223]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier()
clf.fit(X_train, y_train) 
print("Accuract with out sclae : " ,clf.score(X_test, y_test))

Accuract with out sclae :  0.8026905829596412


In [224]:
clf = MLPClassifier()
clf.fit(X_train_scale, y_train) 



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [225]:
print("Accuract with  sclae : " ,clf.score(X_test_scale, y_test))

Accuract with  sclae :  0.8161434977578476


In [226]:
mlp_gs = MLPClassifier(max_iter=100)
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate':['constant', 'invscaling', 'adaptive'],
}
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train_scale, y_train) # X is train samples and y is the corresponding labels



GridSearchCV(cv=3, error_score=nan,
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_fun=15000,
                                     max_iter=100, momentum=0.9,
                                     n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_state...
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'activation': ['relu', 'tanh', 'logistic'],
                         'alpha': [0.0

In [227]:
print('Best parameters found:\n', clf.best_params_)

Best parameters found:
 {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant', 'solver': 'adam'}


In [228]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
# Predict Output
y_predict = clf.predict(X_test_scale)

# Evolution Model
print("\nConfusion Matrix: ")
print(confusion_matrix(y_test, y_predict))
print("classification Report : ")
print(classification_report(y_test, y_predict))
print("Accuracy:", metrics.accuracy_score(y_test, y_predict))



Confusion Matrix: 
[[132  13]
 [ 32  46]]
classification Report : 
              precision    recall  f1-score   support

           0       0.80      0.91      0.85       145
           1       0.78      0.59      0.67        78

    accuracy                           0.80       223
   macro avg       0.79      0.75      0.76       223
weighted avg       0.80      0.80      0.79       223

Accuracy: 0.7982062780269058


Best Accuracy

Accuract with  sclae :  0.8161434977578476