In [56]:
import seaborn as sns
import sklearn as sk
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split #We need this to split the data
from sklearn.preprocessing import normalize #get the function needed to normalize our data.
from sklearn.neighbors import KNeighborsClassifier #the object class we need
from sklearn.metrics import confusion_matrix

In [57]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,C123
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,


In [58]:
dummies = pd.get_dummies(df["Sex"])
df = pd.concat([df,dummies], axis=1)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin,female,male
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,C123,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,,0,1


In [59]:
df_subset = df[["Survived", "Pclass", "Age", "SibSp", "Parch", "female"]]
df_subset.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,female
0,0,3,22.0,1,0,0
1,1,1,38.0,1,0,1
2,1,3,26.0,0,0,1
3,1,1,35.0,1,0,1
4,0,3,35.0,0,0,0


In [60]:
df_subset = df_subset.dropna()
df_subset.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,female
0,0,3,22.0,1,0,0
1,1,1,38.0,1,0,1
2,1,3,26.0,0,0,1
3,1,1,35.0,1,0,1
4,0,3,35.0,0,0,0


In [61]:
X = df_subset[['Pclass', 'Age', 'SibSp', 'Parch', 'female']] #create the X matrix
X = normalize(X) #normalize the matrix to put everything on the same scale
y = df_subset['Survived'] #create the y-variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) #split the data, store it into different variables

In [69]:
knn = KNeighborsClassifier(n_neighbors=5) #create a KNN-classifier with 5 neighbors (default)
knn = knn.fit(X_train, y_train) #this fits the k-nearest neigbor model with the train data
knn.score(X_test, y_test) #calculate the fit on the test data

0.8325581395348837

In [70]:
y_test_pred = knn.predict(X_test) #the predicted values
cm = confusion_matrix(y_test, y_test_pred) #creates a "confusion matrix"
cm

array([[117,  17],
       [ 19,  62]])

In [71]:
conf_matrix = pd.DataFrame(cm, index=['Not Survived', 'Survived'], columns = ['Not Survived_p', 'Survived_p']) 
conf_matrix

Unnamed: 0,Not Survived_p,Survived_p
Not Survived,117,17
Survived,19,62


n_neighbors=5 works the best, I get the highest score.

In [73]:
#how many of the cases are correctly predicted
accuracy = (117+62)/(117+19+17+62)
#How much of the predicted ‘survived’ actually survived?
precision = 62/(19+62)
#How much of the real survived is predicted as survived?
recall = 117/(117+19)

In [79]:
print(f"The accuracy of the predictions of survivors is {accuracy}. So {accuracy} *100 % of the cases is correctly predicted.")
print(f"The precision of the survivors is {precision}. So {precision} *100% of the predicted 'survived' actually survived.")
print(f"The recall of the survivors is {recall}. So {recall} *100% of the real survived is predicted as survived.")

The accuracy of the predictions of survivors is 0.8325581395348837. So 0.8325581395348837 *100 % of the cases is correctly predicted.
The precision of the survivors is 0.7654320987654321. So 0.7654320987654321 *100% of the predicted 'survived' actually survived.
The recall of the survivors is 0.8602941176470589. So 0.8602941176470589 *100% of the real survived is predicted as survived.
