# **Imports**

In [None]:
import pandas as pd
from random import randint
import numpy as np
import math

# **Reading the data**

In [None]:
# Load the labelled training set from train.csv (path is relative to the working directory).
data = pd.read_csv('train.csv')
# Number of training rows; used further down to split the combined frame back into train and test parts.
dl = len(data)
# Bare last expression so the notebook displays the frame.
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# Load the rows to be predicted from test.csv; the displayed frame has no Survived column.
test = pd.read_csv('test.csv')
# Bare last expression so the notebook displays the frame.
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [None]:
# Stack the training rows on top of the test rows so both go through the same
# encoding and scaling. DataFrame.append was removed in pandas 2.0, so use
# pd.concat; ignore_index=True gives the combined frame a unique 0..n-1 index
# instead of repeating the row labels of the two inputs.
w_d = pd.concat([data, test], ignore_index=True)

# **Preprocessing**

In [None]:
def preprocessing(data):
  """Encode, impute and reorder a passenger frame and return it as a float array.

  Steps performed on a copy of ``data`` (the caller's frame is not modified):
    1. Ticket, Sex, Cabin, Embarked and Name are replaced by the integer codes
       produced by ``factorize`` (missing values receive the code -1).
    2. One indicator column per distinct Embarked code is appended; each of
       these columns is labelled with the integer code itself.
    3. NaNs in every column except Survived are replaced by the column mean.
    4. The Embarked code column is dropped and Survived is moved to the end.

  Args:
    data: DataFrame with at least the columns Ticket, Sex, Cabin, Embarked,
      Name and Survived; all other columns must be numeric.

  Returns:
    Tuple ``(dn, s)``: ``dn`` is a 2-D float ndarray whose columns follow the
    order of ``s``; ``s`` is the list of column labels with 'Survived' last.

  Raises:
    ValueError: if ``data`` has no 'Survived' or no 'Embarked' column.
  """
  # Work on a copy so the frame passed in by the caller stays untouched.
  data = data.copy()

  # Categorical columns to numeric columns!
  for column in ('Ticket', 'Sex', 'Cabin', 'Embarked', 'Name'):
    data[column] = data[column].factorize()[0]

  # dtype=float keeps the indicators numeric; newer pandas would otherwise emit
  # bool columns, which turns the final array into an object array.
  embarked_indicators = pd.get_dummies(data['Embarked'], dtype=float)
  data = pd.concat([data, embarked_indicators], axis=1)

  # Replace remaining NaN cells (e.g. in Age) with the mean of their column.
  # Survived is skipped so unlabeled rows keep NaN as their label.
  for column in data.columns:
    if column != 'Survived':
      data[column] = data[column].fillna(data[column].mean())

  # Final column order: every feature except the raw Embarked code, then the label.
  s = list(data.columns)
  s.remove('Survived')
  s.remove('Embarked')
  s.append('Survived')

  dn = np.array(data[s], dtype=float)
  return dn, s

In [None]:
def normalize(X):
  """Mean-centre every column of X and scale it by the column's value range.

  Each column c becomes ``(c - mean(c)) / (max(c) - min(c))``. A constant
  column has a zero range; it is divided by 1 instead, so it comes out as all
  zeros rather than NaN.

  Args:
    X: array-like of numbers, normalised along axis 0.

  Returns:
    A new float ndarray with the same shape as X.
  """
  # Coerce up front so object-dtype input is handled as plain floats.
  X = np.asarray(X, dtype=float)
  value_range = np.amax(X, axis=0) - np.amin(X, axis=0)
  # Guard against division by zero for columns where every value is identical.
  safe_range = np.where(value_range == 0, 1.0, value_range)
  return (X - np.mean(X, axis=0)) / safe_range

In [None]:
# Encode the combined frame, then wrap the returned array in a DataFrame again
# so columns can be addressed by label.
w_d, s = preprocessing(w_d)
w_d = pd.DataFrame(w_d, columns=s)

# Every column except the label is normalised.
tt = [label for label in s if label != 'Survived']
w_d[tt] = normalize(w_d[tt].values)

# The first `dl` rows came from the training file, the remaining rows from the test file.
dn = np.array(w_d.iloc[:dl])
test = np.array(w_d.iloc[dl:])

# **KNN Algorithm**

In [None]:
# Euclidean distance between two rows: sqrt(sum((row1 - row2)^2)) over the feature positions
def euclidean_distance(row1, row2):
  """Return the Euclidean distance between two rows, ignoring row1's last entry.

  Only positions 0 .. len(row1) - 2 are compared; the final position of row1 is
  skipped (callers in this notebook keep the label there).

  Args:
    row1: indexable sequence of numbers; its length fixes how many positions are compared.
    row2: indexable sequence with at least len(row1) - 1 entries.

  Returns:
    The distance as a float.
  """
  feature_count = len(row1) - 1
  squared_total = 0
  for position in range(feature_count):
    delta = row1[position] - row2[position]
    squared_total += delta * delta
  return math.sqrt(squared_total)

In [None]:
# Collect the labels of the training rows closest to a query row
def get_neighbors(train, test_row, num_neighbors):
  """Return the last-column values of the `num_neighbors` rows of `train` nearest to `test_row`.

  Distances come from euclidean_distance(test_row, train_row). Rows are ranked
  by ascending distance with a stable sort, so equally distant rows keep their
  order from `train`.

  Args:
    train: iterable of rows; the last entry of each row is the value returned for it.
    test_row: the query row.
    num_neighbors: how many nearest rows to report.

  Returns:
    A list with `num_neighbors` last-column values, nearest first.

  Raises:
    IndexError: if `num_neighbors` exceeds the number of rows in `train`.
  """
  scored_rows = [(candidate, euclidean_distance(test_row, candidate)) for candidate in train]
  ranked_rows = sorted(scored_rows, key=lambda pair: pair[1])

  # Index explicitly (rather than slicing) so asking for too many neighbours still raises.
  return [ranked_rows[rank][0][-1] for rank in range(num_neighbors)]

In [None]:
# Accuracy of the model: the share of (actual, predicted) pairs whose two entries agree.
def calc_accuracy(actual_predicted_values):
  """Return the fraction of pairs whose first and second entries are equal.

  Args:
    actual_predicted_values: sized collection of pairs; entry 0 is the true
      label and entry 1 the predicted label.

  Returns:
    Number of matching pairs divided by the number of pairs.

  Raises:
    ZeroDivisionError: if the collection is empty.
  """
  matches = sum(1 for pair in actual_predicted_values if pair[0] == pair[1])
  return matches / len(actual_predicted_values)

In [None]:
# Majority vote over the neighbour labels: returns 1 when the labels sum to at least half of k, otherwise 0.
def calc_majority(neighbors, k):
  """Turn a list of neighbour labels into a single 0/1 prediction.

  The labels are summed and divided by `k`; the result is 1 when that share is
  at least 0.5 (so an exact tie goes to class 1) and 0 otherwise.

  Args:
    neighbors: iterable of numeric labels.
    k: the divisor for the vote share (the number of neighbours requested).

  Returns:
    1 or 0.

  Raises:
    ZeroDivisionError: if `k` is 0.
  """
  vote_total = 0
  for label in neighbors:
    vote_total = vote_total + label
  positive_share = vote_total / k
  if positive_share >= 0.5:
    return 1
  return 0

In [None]:
# Leave-one-out evaluation: each row is predicted from all other rows, and the resulting accuracy is returned.
def train_KNN_algorithm(train, k = 3):
  """Return the accuracy of KNN when every row is predicted from the remaining rows.

  For each row, that row is removed from `train`, its `k` nearest neighbours
  among the remaining rows are looked up, and their majority label is compared
  with the row's own last entry.

  Args:
    train: 2-D array of rows whose last column holds the label.
    k: number of neighbours used for each prediction.

  Returns:
    The fraction of rows whose predicted label equals their actual label.
  """
  label_pairs = []
  for held_out_index, held_out_row in enumerate(train):
    # Exclude the row itself so it cannot be its own nearest neighbour.
    other_rows = np.delete(train, held_out_index, axis=0)
    neighbor_labels = get_neighbors(other_rows, held_out_row, k)
    label_pairs.append((held_out_row[-1], calc_majority(neighbor_labels, k)))

  return calc_accuracy(label_pairs)

In [None]:
# Run the KNN evaluation for several values of K and report the one with the highest accuracy.
def select_best_k(train, max_k, k_step = 2):
  """Evaluate K = 1, 1 + k_step, ... (below `max_k`) and pick the most accurate.

  Args:
    train: labelled rows passed on to train_KNN_algorithm.
    max_k: exclusive upper bound for the candidate values of K.
    k_step: distance between consecutive candidates.

  Returns:
    Tuple ``(best_k, accuracies)``: the K with the highest accuracy (the
    smallest such K on ties; 0 when there are no candidates) and the list of
    accuracies in the order the candidates were tried.
  """
  accuracies = []
  top_k, top_accuracy = 0, -1
  for candidate_k in range(1, max_k, k_step):
    accuracy = train_KNN_algorithm(train, candidate_k)
    accuracies.append(accuracy)
    # Strict comparison: a later K only wins if it is strictly better.
    if accuracy > top_accuracy:
      top_k, top_accuracy = candidate_k, accuracy
    print('When K is : ', candidate_k, ' Accuracy is : ', accuracy)
  return top_k, accuracies

In [None]:
# NOTE(review): n_k is assigned here but the search below passes a literal 30 instead — confirm which bound is intended.
n_k = 35
# Candidates are K = 1, 3, ..., 29 (default step of 2, upper bound exclusive).
best_k, t_a = select_best_k(dn, max_k=30)

When K is :  1  Accuracy is :  0.755331088664422
When K is :  3  Accuracy is :  0.7755331088664422
When K is :  5  Accuracy is :  0.7890011223344556
When K is :  7  Accuracy is :  0.7845117845117845
When K is :  9  Accuracy is :  0.7901234567901234
When K is :  11  Accuracy is :  0.7901234567901234
When K is :  13  Accuracy is :  0.7912457912457912
When K is :  15  Accuracy is :  0.7968574635241302
When K is :  17  Accuracy is :  0.7957351290684624
When K is :  19  Accuracy is :  0.797979797979798
When K is :  21  Accuracy is :  0.7957351290684624
When K is :  23  Accuracy is :  0.8047138047138047
When K is :  25  Accuracy is :  0.8024691358024691
When K is :  27  Accuracy is :  0.8058361391694725
When K is :  29  Accuracy is :  0.8103254769921436


In [None]:
# Show the K that achieved the highest accuracy in the search above.
print(best_k)

29


In [None]:
# This function predicts the label of a single input record.
def KNN_algorithm(to_be_predicted, train, k = 4):
  """Predict the 0/1 label of one record from its `k` nearest training rows.

  Args:
    to_be_predicted: the record to classify, laid out like a row of `train`.
    train: labelled rows whose last entry is the label.
    k: number of neighbours that vote.

  Returns:
    1 or 0, the majority label of the nearest neighbours.
  """
  # get_neighbors expects (train, test_row, num_neighbors); the record and the
  # training set were previously passed in the wrong order.
  current_neighbors = get_neighbors(train, to_be_predicted, k)

  return calc_majority(current_neighbors, k)

# **Prediction on the test data**

In [None]:
# Predict a label for every row of an unlabeled set, using a labelled set and K.
def test_KNN_algorithm(train, val, k = 3):
  """Return the predicted 0/1 label for each row of `val`.

  Args:
    train: labelled rows whose last entry is the label.
    val: sized, integer-indexable collection of rows to classify.
    k: number of neighbours that vote on each prediction.

  Returns:
    A list of predictions, one per row of `val`, in the same order.
  """
  return [
    calc_majority(get_neighbors(train, val[row_index], k), k)
    for row_index in range(len(val))
  ]

In [None]:
# Classify every test row with the best K found above, then print the predicted labels.
p = test_KNN_algorithm(train=dn, val=test, k=best_k)
print(p)

[0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 