### Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings

In [2]:
warnings.filterwarnings("ignore")

In [56]:
def load_data():
    test_df = pd.read_csv("./data/titanic/train.csv")
    train_df = pd.read_csv("./data/titanic/test.csv")
    return test_df , train_df

In [57]:
test_df , train_df = load_data()

In [58]:
def fill_null(test_df):
    # check the number of missing value in each column
    test_df.drop('Cabin', axis=1, inplace=True)
    test_df['Age'].fillna(test_df['Age'].mean(), inplace = True)
    test_df['Embarked'].fillna(test_df['Embarked'].mode()[0], inplace= True)
    return test_df

In [59]:
test_df = fill_null(test_df)

In [73]:
test_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,28,1,0,A/5 21171,7.25,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,52,1,0,PC 17599,71.2833,0
2,3,1,3,"Heikkinen, Miss. Laina",0,34,0,0,STON/O2. 3101282,7.925,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,48,1,0,113803,53.1,2
4,5,0,3,"Allen, Mr. William Henry",1,48,0,0,373450,8.05,2


In [74]:
def encode_data(test_df):
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    cols = ['Sex','Age','Embarked', 'Name','Ticket']
    test_df[cols]=test_df[cols].apply(le.fit_transform)
    return test_df

In [75]:
test_df = encode_data(test_df)

### Splitting the data

In [98]:
from sklearn.feature_selection import SelectKBest, f_classif


def split_data(test_df):
    X = test_df.loc[:, test_df.columns != 'Survived']
    # X = test_df[['Pclass','Sex','Age','SibSp','Parch','Embarked','Fare']]
    y = test_df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2)
    fs = SelectKBest(score_func=f_classif, k=3)  # type: ignore
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    for i in range(len(fs.scores_)):
	    print('Feature %d: %f' % (i, fs.scores_[i]))
    return X_train_fs, X_test_fs, y_train, y_test # type:ignore

In [99]:
X_train , X_test, y_train, y_test = split_data(test_df)

Feature 0: 0.005865
Feature 1: 86.867193
Feature 2: 1.159261
Feature 3: 308.777775
Feature 4: 3.311049
Feature 5: 2.337112
Feature 6: 5.735932
Feature 7: 17.025002
Feature 8: 55.063276
Feature 9: 16.182056


### Training the models

In [104]:
def model_train_result(X_train, X_test, y_train, y_test):
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier()
    lr = LogisticRegression()
    knn.fit(X_train, y_train)
    lr.fit(X_train, y_train)
    knn_y_pred = knn.predict(X_test)
    lr_y_pred = lr.predict(X_test)
    print(f'Score of KNN is {accuracy_score(y_test,knn_y_pred,normalize=True)} and score of Logistic Regression is {accuracy_score(y_test,lr_y_pred)}')

In [105]:
model_train_result(X_train, X_test, y_train, y_test)

Score of KNN is 0.7597765363128491 and score of Logistic Regression is 0.7653631284916201
