In [83]:
""" Import Statements """

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

def euclidean_d(p1, p2):
    d = np.sqrt(np.sum(p1-p2)**2)
    return d

def euclidean_d_scipy(p1, p2):
    d = distance.euclidean(p1, p2)
    return d     

def manhattan_d(p1, p2): 
    sum = 0
    n = len(p1)  
    # for each point, finding distance 
    # to rest of the point 
    for i in range(n): 
        for j in range(i+1,n): 
            sum += (abs(p1[i] - p1[j]) +
                        abs(p2[i] - p2[j])) 
      
    return sum

In [122]:
class K_Nearest_Neigh:
    # Create a constructor method, where we pass a value for k. We will assign the default value to 5
    # to match with sklearn default k value
    def __init__(self, k=5):
        self.k = k
   
   # Create fit method and pass training data
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
    
    # Create predict method 
    def main_predict_func(self, X):
        predicted_classes = []
        for x in X:
            sample_predict = self.predict_per_sample_helper(x)
            predicted_classes.append(sample_predict)
        return np.array(predicted_classes)
        
    # Create a helper method, where we pass each sample value
    def predict_per_sample_helper(self, x):
        # Find the distances between the target and X_train values
        find_dist = []
        for x_train in self.X_train:
            euc_dist = euclidean_d(x, x_train) 
            find_dist.append(euc_dist)
        # Find the k-nearest neighbors samples 
        # We will sort the distances and return the indices of k neighbors 
        k_neigh_ind = np.argsort(find_dist)[0:self.k] 
        # Find the k-nearest neighbors labels 
        kn_class = [self.y_train[i] for i in k_neigh_ind]
        # Find the most common class, which new data will be assigned to 
        majority_votes = Counter(kn_class).most_common(1) 
        return majority_votes[0][0]

### Titanic

In [100]:
# Load data
titanic_df = pd.read_csv('titanic_clean.csv', index_col=0)
titanic_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,has_cabin_number
1,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO",1
2,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON",1
3,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",1
4,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",1
5,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",1


In [101]:
titanic_df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest',
       'has_cabin_number'],
      dtype='object')

In [102]:
titanic_df.dtypes

pclass              float64
survived            float64
name                 object
sex                  object
age                 float64
sibsp               float64
parch               float64
ticket               object
fare                float64
cabin                object
embarked             object
boat                 object
body                float64
home.dest            object
has_cabin_number      int64
dtype: object

In [103]:
# Remove unimportant columns and crate a copy of the dataframe with reduced number of columns
titanic_df.columns = ['Pclass', 'Survived', 'name', 'Sex', 'Age', 'Siblings_Spouse', 'parch', 'Ticket', 
                    'Fare', 'Cabin', 'embarked', 'boat', 'Body', 'Home_Dest', 'has_cabin_number']

titanic_df = titanic_df[['Pclass', 'Survived', 'Sex', 'Age', 'Siblings_Spouse', 'Fare', 'Home_Dest']]

titanic_df.head()

Unnamed: 0,Pclass,Survived,Sex,Age,Siblings_Spouse,Fare,Home_Dest
1,1.0,1.0,female,29.0,0.0,211.3375,"St Louis, MO"
2,1.0,1.0,male,0.9167,1.0,151.55,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,female,2.0,1.0,151.55,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,male,30.0,1.0,151.55,"Montreal, PQ / Chesterville, ON"
5,1.0,0.0,female,25.0,1.0,151.55,"Montreal, PQ / Chesterville, ON"


In [104]:
# Check for nulls (missing values)
income_df.isnull().sum()

Age               0
Education         0
Edu_num           0
Occupation        0
Race              0
Gender            0
Hours_per_week    0
Country           0
Income            0
dtype: int64

In [105]:
titanic_df = titanic_df.fillna(0)

In [106]:
titanic_df = titanic_df.astype({"Pclass":'int', "Survived":'int', "Age": 'int', "Siblings_Spouse":'int', "Fare":'int'})  
print(titanic_df.dtypes)
titanic_df.head(3)

Pclass              int32
Survived            int32
Sex                object
Age                 int32
Siblings_Spouse     int32
Fare                int32
Home_Dest          object
dtype: object


Unnamed: 0,Pclass,Survived,Sex,Age,Siblings_Spouse,Fare,Home_Dest
1,1,1,female,29,0,211,"St Louis, MO"
2,1,1,male,0,1,151,"Montreal, PQ / Chesterville, ON"
3,1,0,female,2,1,151,"Montreal, PQ / Chesterville, ON"


In [107]:
# Split data into training and testing
train, test = train_test_split(titanic_df, train_size=0.80, test_size=0.20, 
                              stratify=titanic_df['Survived'], random_state=42)

# Print shape of training and testing data
train.shape, test.shape

((1048, 7), (262, 7))

In [108]:
# Arrange data into X features matrix and y target vector 
target = 'Survived'
X_train = train.drop(columns=target)
y_train = train[target]
X_test = test.drop(columns=target)
y_test = test[target]

# Transform the categorical variables
encoder = ce.ordinal.OrdinalEncoder()
X_train_enc = encoder.fit_transform(X_train)
X_test_enc = encoder.transform(X_test)

X_train_enc.head(3)

Unnamed: 0,Pclass,Sex,Age,Siblings_Spouse,Fare,Home_Dest
143,1,1,46,0,79,1
1161,3,1,50,0,8,2
630,3,1,39,1,31,3


In [109]:
# Normalize the data
# Convert data into numpy array
X_train_enc = np.array(X_train_enc)
X_test_enc = np.array(X_test_enc)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_enc)
X_test = scaler.transform(X_test_enc)
y_train = np.array(y_train)
y_train = np.array(y_train)

X_train, X_test

(array([[-1.57314108, -0.73874698,  1.30190904, -0.48165448,  0.96833146,
         -0.76787569],
        [ 0.82489798, -0.73874698,  1.61527621, -0.48165448, -0.4840291 ,
         -0.75770798],
        [ 0.82489798, -0.73874698,  0.75351649,  0.50809826, -0.0135461 ,
         -0.74754026],
        ...,
        [ 0.82489798,  1.34245419, -1.59673732,  2.48760372, -0.09536923,
         -0.75770798],
        [-1.57314108, -0.73874698, -0.34326862, -0.48165448, -0.11582501,
          0.10654753],
        [ 0.82489798,  1.34245419, -0.42161041, -0.48165448, -0.4840291 ,
         -0.75770798]]),
 array([[ 0.82489798,  1.34245419, -0.02990145,  0.50809826, -0.15673658,
         -0.75770798],
        [ 0.82489798, -0.73874698, -0.02990145,  0.50809826, -0.19764814,
         -0.75770798],
        [ 0.82489798, -0.73874698,  0.83185828, -0.48165448, -0.3612944 ,
         -0.75770798],
        ...,
        [-1.57314108, -0.73874698, -0.02990145, -0.48165448, -0.64767536,
         -0.75770798],
  

## Accuracy score using K_Nearest_Neigh algorithm

In [124]:
# Create an object neigh for our algorithm K_Nearest_Neigh 
neigh = K_Nearest_Neigh(k=11)
# Fit and predict data 
neigh.fit(X_train, y_train)
pred = neigh.main_predict_func(X_test)
print(pred)
acc_score = np.sum(pred == y_test) / len(y_test)
print(f'Accuracy score using KNN algorithm: {acc_score}')

[0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 1 0 1 0 0 0 1 0 1 1
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 1 0 0 1 1 1 1 0 0 0 0 0 1
 1 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 1 1
 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0
 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 1 1 1 0 1 1 1 1 0 0 0 1 0 1 1 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 1 1 1 1 0 0 0 1 0 0 0 0 0
 0 0 0]
Accuracy score using KNN algorithm: 0.6297709923664122


## KNN accuracy score using Scikit-Learn library

In [126]:
print('KNN using Scikit-Learn library: /n')
               
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train, y_train)

print(neigh.predict(X_test))

print(neigh.predict_proba(X_test))
accuracy_score = neigh.score(X_test, y_test)

print(f'Accuracy score using Scikit Learn KNN algorithm: {accuracy_score}')

KNN using Scikit-Learn library: /n
[1 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0
 0 0 0 0 1 0 1 0 1 1 1 0 0 0 0 0 1 1 1 1 0 1 1 0 1 0 0 1 1 0 1 0 1 0 0 1 0
 1 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1
 1 0 1 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 1 0 1 0
 1 1 0 0 1 0 0 1 1 0 1 1 0 0 1 1 1 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0 0 0
 1 1 1 0 1 1 0 1 1 0 0 0 1 1 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0 0 1 1 0 1 1 1 1
 0 1 0 0 0 0 0 0 1 1 0 1 0 1 1 0 0 0 0 1 1 1 0 1 1 1 1 1 1 0 0 1 1 0 1 0 0
 0 0 0]
[[0.4 0.6]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [1.  0. ]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.  1. ]
 [1.  0. ]
 [0.2 0.8]
 [1.  0. ]
 [0.8 0.2]
 [0.2 0.8]
 [0.2 0.8]
 [1.  0. ]
 [0.  1. ]
 [1.  0. ]
 [0.8 0.2]
 [0.4 0.6]
 [0.2 0.8]
 [0.2 0.8]
 [1.  0. ]
 [0.8 0.2]
 [1.  0. ]
 [0.6 0.4]
 [0.4 0.6]
 [0.4 0.6]
 [1.  0. ]
 [0.4 0.6]
 [0.  1. ]
 [1.  0. ]
 [0.6 0.4]
 [1.  0. ]
 [0.6 0.4]
 [0.2 0.8]
 [1.  0. ]
 [1.  0. ]
 [0.8 0.2]
 [0