In [1]:
import numpy as np
import pandas as pd

#### Load and preprocess data

In [2]:
# Load the training split; keep the raw frame untouched so this cell can be
# re-run without re-reading state produced by a previous execution.
train_raw = pd.read_csv('train.csv')

# Encode the two categorical features with explicit per-column mappings.
# A frame-wide DataFrame.replace(str -> int) depends on pandas silently
# downcasting the resulting object columns to numbers, which is deprecated
# since pandas 2.2; it also scans every column instead of just these two.
train_data = train_raw.assign(
    Sex=train_raw['Sex'].map({'female': 0, 'male': 1}),
    Embarked=train_raw['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}),
)

survived_frame = train_data[['Survived']]
print(survived_frame.shape)

# Missing feature values are imputed with 0. Only the model features are
# filled; the remaining columns of train_data keep their original values.
X_train = (
    train_data[['Pclass', 'Sex', 'Fare', 'Age', 'SibSp', 'Parch', 'Embarked']]
    .fillna(0)
    .to_numpy(dtype=float)
)

# Flatten the single-column label frame into a 1-D label vector.
Y_train = survived_frame.to_numpy().reshape(-1)

(891, 1)


In [3]:
# Load the test split; keep the raw frame untouched so this cell is idempotent.
test_raw = pd.read_csv('test.csv')

# Encode the two categorical features with explicit per-column mappings.
# A frame-wide DataFrame.replace(str -> int) depends on pandas silently
# downcasting the resulting object columns to numbers, which is deprecated
# since pandas 2.2; it also scans every column instead of just these two.
test_data = test_raw.assign(
    Sex=test_raw['Sex'].map({'female': 0, 'male': 1}),
    Embarked=test_raw['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}),
)

# Missing feature values are imputed with 0. Only the model features are
# filled; the remaining columns of test_data keep their original values.
X_test = (
    test_data[['Pclass', 'Sex', 'Fare', 'Age', 'SibSp', 'Parch', 'Embarked']]
    .fillna(0)
    .to_numpy(dtype=float)
)

# NOTE(review): in the Kaggle Titanic download, gender_submission.csv is a
# sample submission rather than ground-truth labels -- confirm before
# treating scores computed against Y_test as real accuracy.
test_label = pd.read_csv('gender_submission.csv')

# Flatten the single-column label frame into a 1-D label vector.
Y_test = test_label[['Survived']].to_numpy().reshape(-1)
print(Y_test.shape)

# Rows are paired with labels by position, so the two files must line up.
assert len(X_test) == len(Y_test), "test features and labels differ in length"


(418,)


In [4]:
class KNearestNeighbore():
    """
    Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Every prediction computes the distance from the query to all stored
    training samples and takes a majority vote among the k closest ones.
    """

    def __init__(self, k):
        """
        Initialize the K Nearest Neighbor classifier.

        Args:
        k (int): Number of neighbors to consider; must be a positive integer.

        Raises:
        TypeError: If k is not an integer.
        ValueError: If k is smaller than 1.
        """
        if isinstance(k, bool) or not isinstance(k, (int, np.integer)):
            raise TypeError("k must be an integer, got %r" % (k,))
        # A negative k would silently slice "all but the last |k|" neighbours.
        if k < 1:
            raise ValueError("k must be at least 1, got %r" % (k,))
        self.k = k

    def fit(self, X_train, y_train):
        """
        Fit the model to the training data.

        Args:
        X_train (array-like): Training features, shape (n_samples, n_features).
        y_train (array-like): Training labels; flattened to one label per sample.

        Raises:
        ValueError: If the training set is empty, X_train is not 2-D, or the
            number of samples and labels differ.
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train).reshape(-1)

        if X_train.ndim != 2:
            raise ValueError("X_train must be 2-D (n_samples, n_features)")
        if len(X_train) == 0:
            raise ValueError("cannot fit on an empty training set")
        if len(X_train) != len(y_train):
            raise ValueError(
                "X_train has %d samples but y_train has %d labels"
                % (len(X_train), len(y_train))
            )

        self.X_train = X_train
        self.y_train = y_train

        # Encode labels as 0..n_classes-1 so voting with np.bincount works for
        # any label type (strings, floats, negative ints), not only for
        # non-negative integers. np.unique returns the classes sorted, so
        # vote ties still resolve to the smallest label.
        self._classes, codes = np.unique(y_train, return_inverse=True)
        self._y_codes = np.asarray(codes).reshape(-1)
        self.number_classes = len(self._classes)

    def nearNeighbors(self, x_test):
        """
        Find the indices of the k nearest neighbors for a given test sample.

        Args:
        x_test (array-like): Feature vector of one sample, shape (n_features,).

        Returns:
        numpy.ndarray: Indices into the training set of the k nearest
            neighbors, closest first (fewer if the training set has fewer
            than k samples).
        """
        x_test = np.asarray(x_test, dtype=float)
        distance = np.sqrt(np.sum((x_test - self.X_train) ** 2, axis=1))
        near_neighbors = np.argsort(distance)[:self.k]

        return near_neighbors

    def predict(self, x_test):
        """
        Predict the label for a given test sample.

        Args:
        x_test (array-like): Feature vector of one sample, shape (n_features,).

        Returns:
        The most common training label among the k nearest neighbors; on a
        tie the smallest label wins.
        """
        near_neighbors = self.nearNeighbors(x_test)
        votes = np.bincount(
            self._y_codes[near_neighbors], minlength=self.number_classes
        )

        return self._classes[np.argmax(votes)]

    def evaluate(self, X_test, y_test):
        """
        Evaluate the performance of the model on the test data.

        Args:
        X_test (array-like): Test features, shape (n_samples, n_features).
        y_test (array-like): True labels; flattened to one label per sample.

        Returns:
        float: Accuracy of the model on the test data, in percent (0-100).

        Raises:
        ValueError: If the test set is empty or the number of samples and
            labels differ.
        """
        X_test = np.asarray(X_test, dtype=float)
        # Flatten so (n, 1) label columns cannot broadcast against the (n,)
        # predictions into an n x n comparison, and so plain lists are
        # compared element-wise instead of as a single list == list bool.
        y_test = np.asarray(y_test).reshape(-1)

        if len(X_test) == 0:
            raise ValueError("cannot evaluate on an empty test set")
        if len(X_test) != len(y_test):
            raise ValueError(
                "X_test has %d samples but y_test has %d labels"
                % (len(X_test), len(y_test))
            )

        self.X_test = X_test
        self.Y_test = y_test

        y_pred = np.array([self.predict(sample) for sample in X_test])

        true_label = np.count_nonzero(y_pred == y_test)
        accuracy = (true_label / len(y_pred)) * 100

        return accuracy


In [5]:
# Train a 5-nearest-neighbour model on the training split, then report the
# percentage of test samples whose predicted label matches Y_test.
knn = KNearestNeighbore(k=5)
knn.fit(X_train, Y_train)

accuracy = knn.evaluate(X_test, Y_test)
print(accuracy)

65.55023923444976
