In [None]:
# importing libraries, etc...

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# apply seaborn's default theme to the matplotlib figures drawn below
sns.set_theme()

# base URL that is prefixed to the CSV file names loaded in this notebook
path = "https://raw.githubusercontent.com/LennardVaarten/ML-Workshops/main/data/"

# **Loading and Viewing the Data**

In [None]:
# loading the Titanic CSV from the data URL into a DataFrame

titanic_url = path + "titanic.csv"
titanic = pd.read_csv(titanic_url)

In [None]:
# viewing the loaded DataFrame (as the cell's last expression it is rendered as the cell output)

titanic

In [None]:
# summarizing the features and target: per-column summary statistics of the loaded data

titanic.describe()

# **Scaling the Features**

In [None]:
# scaling every column of the DataFrame (features and target alike),
# so that the values in each column range from 0 to 1
#
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split further down, so the rows that end up in the test set also influence
# the scaling parameters.

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(titanic)

# build a fresh float DataFrame instead of writing the scaled values into the
# existing columns with `titanic[:] = ...`: assigning floats in place into any
# integer-typed column relies on implicit upcasting, which pandas has deprecated
titanic = pd.DataFrame(scaler.transform(titanic),
                       columns=titanic.columns,
                       index=titanic.index)

titanic

# **Splitting into Training Set and Test Set**

In [None]:
from sklearn.model_selection import train_test_split

# column 0 is used as the target, every remaining column as a feature;
# random_state is pinned so the same split is produced on every run
features_train, features_test, target_train, target_test = train_test_split(
    titanic.iloc[:, 1:],
    titanic.iloc[:, 0],
    random_state=99,
)

In [None]:
# feature columns of the rows that train_test_split assigned to the training set
features_train

In [None]:
# target column (column 0 of `titanic`) for the training-set rows
target_train

In [None]:
# feature columns of the rows that train_test_split assigned to the test set
features_test

In [None]:
# target column (column 0 of `titanic`) for the test-set rows
target_test

# **Plotting Some Graphs**

In [None]:
# joining each feature set with its target column (features first, target last)
# to make plotting easier, then renumbering the rows from 0

train = pd.concat([features_train, target_train], axis=1).reset_index(drop=True)
test = pd.concat([features_test, target_test], axis=1).reset_index(drop=True)

In [None]:
# plotting each feature against the target, one subplot per feature

fig, axes = plt.subplots(3, figsize=(10,15))

# the feature columns come first in `train`; the target was concatenated last
for ax, feature in zip(axes, train.columns[:3]):
    # more than three distinct values -> histogram, otherwise one bar per value
    if train[feature].nunique() > 3:
        sns.histplot(ax=ax, x=feature, data=train, hue="survived")
    else:
        sns.countplot(ax=ax, x=feature, data=train, hue="survived")

In [None]:
# plotting sex against survival counts, one figure per (scaled) passenger class

for pclass in [0, 0.5, 1]:
    # the class values are floats produced by the scaler, so match them with a
    # tolerance rather than relying on exact float equality
    in_class = np.isclose(train["passengerClass"], pclass)
    grid = sns.catplot(x="sexMale", hue="survived", kind="count", aspect=1.33, data=train[in_class])
    # title each figure; without it the three plots cannot be told apart
    grid.set(title="passengerClass (scaled) = {}".format(pclass))

# **Fit, Predict, Evaluate**

In [None]:
# fitting a k-NN classifier with a single neighbour (k=1) on the training set

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(features_train, target_train)

In [None]:
# what would our model predict for a female passenger who travelled 2nd class and
# whose age lies halfway between the youngest and the oldest passenger?
# (on min-max-scaled data 0.5 is the midpoint of a feature's range, not its mean)
# NOTE(review): assumes the feature columns are ordered age, class, sex - confirm

# pass the query as a DataFrame carrying the training column names: the model was
# fitted on a DataFrame, and a bare array makes scikit-learn warn that
# "X does not have valid feature names"
query = pd.DataFrame([[0.5, 0.5, 0]], columns=features_train.columns)

print("Test prediction: {}".format(knn.predict(query)[0]))

In [None]:
# our model's predictions for the entire test set (one predicted target value per test row)

knn.predict(features_test)

In [None]:
# refit the k=1 model, then report its score on the training set and on the test set
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(features_train, target_train)

train_score = knn.score(features_train, target_train)
test_score = knn.score(features_test, target_test)

print("Training set score: {:.2f}".format(train_score))
print("Test set score: {:.2f}".format(test_score))

In [None]:
# score k-NN on the test set for every odd k from 1 to 13
# NOTE(review): k is compared using test-set scores, so the test set doubles as
# the model-selection set here
neighbors_settings = list(range(1, 14, 2))
scores = []

for k in neighbors_settings:
    knn = KNeighborsClassifier(n_neighbors=k).fit(features_train, target_train)
    scores.append(knn.score(features_test, target_test))

# line plot of score against k, with one x tick per evaluated k
settingsPlot = sns.lineplot(x=neighbors_settings, y=scores)
settingsPlot.set(xticks=neighbors_settings, xlabel="k", ylabel="Score")

settingsPlot

## If We Hadn't Scaled The Features...

In [None]:
# reload the raw (unscaled) data and split it with the same random_state as before
titanic_unscaled = pd.read_csv(path+"titanic.csv")

# note: target_train / target_test are rebound here to the targets of the unscaled frame
features_train_unscaled, features_test_unscaled, target_train, target_test = train_test_split(
    titanic_unscaled.iloc[:, 1:],
    titanic_unscaled.iloc[:, 0],
    random_state=99,
)

# fit k-NN with k=5 on the unscaled features and report the score on both splits
knn = KNeighborsClassifier(n_neighbors=5).fit(features_train_unscaled, target_train)

unscaled_train_score = knn.score(features_train_unscaled, target_train)
unscaled_test_score = knn.score(features_test_unscaled, target_test)

print("Training set score: {:.2f}".format(unscaled_train_score))
print("Test set score: {:.2f}".format(unscaled_test_score))

## If We Hadn't Made The Train/Test Split...

In [None]:
# fit k=1 on the complete (scaled) dataset and score it on those very same rows
all_features = titanic.iloc[:, 1:]
all_targets = titanic.iloc[:, 0]

knn = KNeighborsClassifier(n_neighbors=1).fit(all_features, all_targets)
print("Training set score: {:.2f}".format(knn.score(all_features, all_targets)))