In [17]:
import pandas  as pd
import numpy as np
np.random.seed(23)

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif

# Load the Titanic passenger table from the remote CSV (requires network access).
DATA_URL = 'https://raw.githubusercontent.com/Kritsana135/Ml-Final/main/titanic.csv'
titanic_df = pd.read_csv(DATA_URL)

In [18]:
# Show the (rows, columns) shape of the loaded frame.
titanic_df.shape

(891, 12)

In [19]:
# Preview the first rows of the raw data.
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data Exploration
ทำการตรวจสอบข้อมูลก่อนว่ามีลักษณะเป็นอย่างไร และมี missing value หรือไม่

จากการตรวจสอบพบว่า column ที่มี missing value ได้แก่
- Cabin
- Embarked
- Age

นอกจาก column ที่จะถูกดรอปในขั้นตอนถัดไป (Name, Ticket, Cabin) แล้ว มีสอง column ที่ไม่ใช่ตัวเลข คือ Sex และ Embarked

In [20]:
# Count the missing values in each column and list the worst columns first.
null_counts = titanic_df.isnull().sum()
total = null_counts.sort_values(ascending=False)
print(total)

Cabin          687
Age            177
Embarked         2
Fare             0
Ticket           0
Parch            0
SibSp            0
Sex              0
Name             0
Pclass           0
Survived         0
PassengerId      0
dtype: int64


# Data Cleaning
จะทำการดรอป column ที่คิดว่าไม่เกี่ยวข้อง คือ ***PassengerId***, ***Name***, ***Ticket*** และ ***Cabin***

สำหรับแอทริบิวต์ Cabin (ห้องโดยสาร) นั้นน่าจะมีความสอดคล้องกับแอทริบิวต์ Fare (ค่าโดยสาร) และเนื่องจาก Cabin มี missing value มากเกินไป จึงเลือกใช้แอทริบิวต์ Fare ซึ่งไม่มี missing value แทน

In [21]:
# Columns removed before modelling.
drop_columns = ['Cabin', 'PassengerId', 'Ticket', 'Name']
titanic_df = titanic_df.drop(columns=drop_columns)

หลังจากนั้นเราจะเติม missing value ใน Columns ต่อไปนี้ ด้วยข้อมูลที่เหลืออยู่ในแต่ละ Column
* Age ใช้ median เพราะข้อมูลเป็นแบบ Interval/Ratio (skewed เล็กน้อย)
* Embarked ใช้ Mode เพราะข้อมูลเป็นแบบ Nominal

In [22]:
# Fill missing ages with the column median.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the selected column: that chained in-place form emits a FutureWarning on
# pandas 2.x and leaves the frame unchanged when Copy-on-Write is enabled.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].median())

# Fill missing embarkation ports with the most frequent value.
titanic_df['Embarked'] = titanic_df['Embarked'].fillna(titanic_df['Embarked'].mode()[0])

เพื่อที่จะใช้ค่าที่ไม่ใช่ตัวเลขใน knn เราต้องแปลงมาเป็นตัวเลขก่อน

In [23]:
# Convert the non-numerical columns to numerical values.
titanic_df['Sex'] = titanic_df['Sex'].map({'female': 0, 'male': 1}).astype(int)

# One-hot encode the embarkation port. dtype=int keeps the indicator columns
# as 0/1 integers; pandas >= 2.0 would otherwise return booleans.
embarked_dummies = pd.get_dummies(titanic_df['Embarked'], prefix="E", dtype=int)

# Replace the original Embarked column with its indicator columns.
titanic_df = pd.concat([titanic_df.drop(columns=['Embarked']), embarked_dummies], axis=1)

titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,E_C,E_Q,E_S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


Normalization

In [24]:
# Min-max scale every column of the cleaned frame and keep the column labels.
# NOTE(review): the scaler is fitted on all rows (and on the Survived column)
# before the train/test split, so test rows influence the scaling. Consider
# fitting it on the training features only.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(titanic_df)
titanic_normalized = pd.DataFrame(scaled_values, columns=titanic_df.columns)
titanic_normalized.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,E_C,E_Q,E_S
0,0.0,1.0,1.0,0.271174,0.125,0.0,0.014151,0.0,0.0,1.0
1,1.0,0.0,0.0,0.472229,0.125,0.0,0.139136,1.0,0.0,0.0
2,1.0,1.0,0.0,0.321438,0.0,0.0,0.015469,0.0,0.0,1.0
3,1.0,0.0,0.0,0.434531,0.125,0.0,0.103644,0.0,0.0,1.0
4,0.0,1.0,1.0,0.434531,0.0,0.0,0.015713,0.0,0.0,1.0


Split Data

In [25]:
# Split the data into train and test sets.
# The column list is named `feature_columns` because a later cell defines a
# function called `selected_feature`; sharing that name makes the two cells
# overwrite each other whenever either one is re-run.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'E_C', 'E_Q', 'E_S']
target = 'Survived'

X = titanic_normalized[feature_columns]
y = titanic_normalized[target]

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)

Train & Test (Default parameter)
- ไม่ Scale  Accuracy =  0.7309417040358744
- Scale      Accuracy =  0.7937219730941704

In [26]:
# Baseline: KNN with default hyperparameters, scored on the held-out test set.
knn = KNeighborsClassifier().fit(X_train, y_train)
score = knn.score(X_test, y_test)
print("Accuracy = ",score)

Accuracy =  0.7937219730941704


Feature selection - SelectKBest

In [27]:
def selected_feature(n_k):
  """Return the names of the n_k best features according to SelectKBest.

  Fits SelectKBest with the f_classif score function on the notebook-level
  X_train / y_train and returns the selected column labels of X_train.

  Args:
    n_k: number of features to keep (passed to SelectKBest as ``k``).

  Returns:
    pandas Index holding the labels of the selected columns.
  """
  kbest_feats = SelectKBest(f_classif, k=n_k)
  kbest_feats.fit(X_train, y_train)

  # get_support() is the selector's own boolean mask over the input columns.
  # Recovering the selection via inverse_transform and a variance check would
  # drop any selected column that happens to be constant.
  return X_train.columns[kbest_feats.get_support()]

In [28]:
def brute_select():
  """Print the KNN test accuracy for every SelectKBest feature count.

  For k = 1 .. number of columns in X_train, fits a classifier on the k
  columns returned by selected_feature(k) and prints its accuracy on X_test.
  One line is printed per k, in increasing order of k.
  """
  # Work on an unfitted copy with the same hyperparameters so the
  # notebook-level `knn` is not refitted as a side effect.
  model = KNeighborsClassifier(**knn.get_params())

  # Derive the upper bound from the data instead of hard-coding 9 features.
  for n_features in range(1, X_train.shape[1] + 1):
    feature = selected_feature(n_features)
    model.fit(X_train[feature], y_train)
    score = model.score(X_test[feature], y_test)
    print("Accuracy = ",score)

brute_select()

Accuracy =  0.7982062780269058
Accuracy =  0.7937219730941704
Accuracy =  0.8071748878923767
Accuracy =  0.7937219730941704
Accuracy =  0.8026905829596412
Accuracy =  0.7892376681614349
Accuracy =  0.8071748878923767
Accuracy =  0.7937219730941704
Accuracy =  0.7937219730941704


ผลไม่ต่างกันเยอะ เลยใช้ feature ทั้งหมดน่าจะดีกว่า

Hyperparameter Tuning

In [29]:
def tune(X_in,y_in):
  """Grid-search KNN hyperparameters with 10-fold cross-validation.

  Searches leaf_size in 1..49, n_neighbors in 1..29 and p in {1, 2} on the
  given data and prints the values of the best estimator found.

  Args:
    X_in: feature matrix passed to GridSearchCV.fit.
    y_in: target values passed to GridSearchCV.fit.
  """
  # NOTE(review): per the scikit-learn docs, leaf_size affects tree
  # construction/query speed rather than predictions. Confirm whether it
  # belongs in the grid, since it multiplies the number of fits by 49.
  search_space = {
      'leaf_size': list(range(1, 50)),
      'n_neighbors': list(range(1, 30)),
      'p': [1, 2],
  }

  search = GridSearchCV(KNeighborsClassifier(), search_space, cv=10)
  search.fit(X_in, y_in)

  # Report the winning settings in the same order as before.
  best = search.best_estimator_.get_params()
  print('Best leaf_size:', best['leaf_size'])
  print('Best p:', best['p'])
  print('Best n_neighbors:', best['n_neighbors'])


# tune(X,y)  # call left disabled; the output shown below was recorded from an earlier run

Best leaf_size: 1
Best p: 2
Best n_neighbors: 28

In [30]:
# Refit KNN with the hyperparameters reported by the grid search and score it
# on the held-out test set.
tuned_params = {'leaf_size': 1, 'p': 2, 'n_neighbors': 28}
knn = KNeighborsClassifier(**tuned_params)
knn.fit(X_train, y_train)
score = knn.score(X_test, y_test)
print("Accuracy = ",score)

Accuracy =  0.8026905829596412


Final parameter

In [31]:
# Display the final estimator together with its parameters.
knn

KNeighborsClassifier(algorithm='auto', leaf_size=1, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=28, p=2,
                     weights='uniform')

# Summary
Accuracy =  0.8026905829596412