In [30]:
import numpy as np
import pandas as pd

In [31]:
# Define the KNN class
class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples that vote on each prediction.
    distance_metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Metric used to rank training samples by closeness.

    Notes
    -----
    Vote ties are resolved in favour of the smallest label (sorted order),
    and equal distances are resolved in favour of the earlier training row.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Memorise the training set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features. A 1-D input is treated as a single feature.
        y : array-like of shape (n_samples,)
            Training labels; any dtype that ``np.unique`` can sort.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` have different lengths.
        """
        # Convert to plain arrays so that later integer indexing is always
        # positional. A pandas Series would otherwise be indexed by label.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Predict a label for every row of ``X`` by majority vote.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)
            Samples to classify. A 1-D input is treated as a single feature.

        Returns
        -------
        list
            One predicted label per query row, in input order.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or the distance metric is unsupported.
        """
        if self.k < 1:
            raise ValueError("k must be a positive integer")

        # np.asarray makes iteration row-wise even for a DataFrame, whose
        # default iteration yields column names instead of rows.
        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        predictions = []
        for x in X:
            # Distances from this query to every training row in one shot
            distances = self._reduce_difference(self.X_train - x, axis=1)
            # Stable sort keeps neighbour selection deterministic on ties
            k_indices = np.argsort(distances, kind='stable')[:self.k]
            # np.unique handles arbitrary label types (strings, negatives),
            # and returns labels sorted, so argmax picks the smallest on ties
            labels, counts = np.unique(self.y_train[k_indices], return_counts=True)
            predictions.append(labels[np.argmax(counts)])
        return predictions

    def compute_distance(self, X1, X2):
        """Return the distance between ``X1`` and ``X2`` as a single scalar.

        Raises
        ------
        ValueError
            If ``distance_metric`` is neither 'euclidean' nor 'manhattan'.
        """
        return self._reduce_difference(X1 - X2, axis=None)

    def _reduce_difference(self, diff, axis=None):
        """Collapse a difference array into distances along ``axis``."""
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(diff ** 2, axis=axis))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(diff), axis=axis)
        else:
            raise ValueError("Unsupported distance metric")

In [32]:
# Define data preprocessing function
def preprocess_data(train_path, test_path,
                    drop_columns=('id', 'CustomerId', 'Surname'),
                    categorical_columns=('Geography', 'Gender'),
                    target_column='Exited'):
    """Load the train/test CSV files and return model-ready arrays.

    Steps: drop identifier columns, label-encode the categorical columns
    (encoders are fitted on the training file only), split off the target,
    and standardise every feature with statistics from the training file.

    Parameters
    ----------
    train_path, test_path : str or path-like
        Locations of the training and test CSV files.
    drop_columns : sequence of str
        Columns removed from both files before modelling.
    categorical_columns : sequence of str
        Columns converted to integer codes with ``LabelEncoder``.
    target_column : str
        Name of the label column in the training file.

    Returns
    -------
    X_train_scaled : numpy.ndarray
        Standardised training features.
    y_train : pandas.Series
        Training labels, taken unchanged from ``target_column``.
    X_test_scaled : numpy.ndarray
        Test features scaled with the training statistics.

    Notes
    -----
    A category that appears only in the test file makes
    ``LabelEncoder.transform`` raise ``ValueError``.
    """
    # Kept function-local so this definition works on its own; the notebook's
    # import cell only provides numpy and pandas.
    from sklearn.preprocessing import StandardScaler, LabelEncoder

    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Drop identifier columns that carry no predictive signal
    drop_columns = list(drop_columns)
    train_data = train_data.drop(columns=drop_columns)
    test_data = test_data.drop(columns=drop_columns)

    # Encode each categorical column; fit on train, reuse the mapping on test
    for column in categorical_columns:
        encoder = LabelEncoder()
        train_data[column] = encoder.fit_transform(train_data[column])
        test_data[column] = encoder.transform(test_data[column])

    # Separate features and target variable for the train set
    X_train = train_data.drop(columns=[target_column])
    y_train = train_data[target_column]

    # Align the test columns with the training columns so the scaler sees
    # features in the order it was fitted on
    X_test = test_data[X_train.columns]

    # Standardise using statistics from the training set only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, y_train, X_test_scaled



In [33]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5, shuffle=False, random_state=None):
    """Estimate accuracy with k-fold cross-validation.

    Every sample is used for validation exactly once. When the number of
    samples is not divisible by ``n_splits``, the first folds receive one
    extra sample each instead of the remainder being left out of validation.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Labels aligned with the rows of ``X``.
    knn : object
        Model exposing ``fit(X, y)`` and ``predict(X)``; it is refitted on
        every fold.
    n_splits : int, default 5
        Number of folds; must be between 2 and ``n_samples``.
    shuffle : bool, default False
        Shuffle sample order before splitting. The default keeps the original
        contiguous, in-order folds.
    random_state : int or None, default None
        Seed for the shuffle; ignored when ``shuffle`` is False.

    Returns
    -------
    list of float
        Validation accuracy for each fold, in fold order.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``n_splits`` is out of range.
    """
    # Plain arrays give positional indexing and element-wise comparison even
    # when the caller passes lists or pandas objects.
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError("X and y must contain the same number of samples")
    if not 2 <= n_splits <= n_samples:
        raise ValueError("n_splits must be between 2 and the number of samples")

    indices = np.arange(n_samples)
    if shuffle:
        indices = np.random.default_rng(random_state).permutation(n_samples)

    scores = []
    for val_idx in np.array_split(indices, n_splits):
        # Everything outside the validation fold is training data
        train_mask = np.ones(n_samples, dtype=bool)
        train_mask[val_idx] = False

        # Train and evaluate the model
        knn.fit(X[train_mask], y[train_mask])
        predictions = np.asarray(knn.predict(X[val_idx]))
        scores.append(np.mean(predictions == y[val_idx]))

    return scores


In [34]:
# Build the scaled train/test feature matrices and the training labels
X, y, X_test = preprocess_data(train_path='train.csv', test_path='test.csv')

# 5-nearest-neighbour classifier ranked by Euclidean distance
knn = KNN(k=5, distance_metric='euclidean')

# Estimate accuracy with cross-validation (default number of folds)
cv_scores = cross_validate(X, y, knn)
print("Cross-validation scores:", cv_scores)

# Refit on the complete training set, then label the test rows.
# No hyperparameter search is performed; k and the metric are fixed above.
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Preprocessing drops the id column, so re-read it from the raw test file,
# pair each id with its prediction, and write the submission file
test_ids = pd.read_csv('test.csv')['id']
submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
submission.to_csv('sample_submission.csv', index=False)


Cross-validation scores: [0.87, 0.8723333333333333, 0.8673333333333333, 0.8816666666666667, 0.8703333333333333]
