In [55]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [56]:
# Define the KNN class
class KNN:
    """Brute-force k-nearest-neighbours scorer.

    For every query row, ``predict`` returns the mean of the training labels
    of the ``k`` closest training rows. With 0/1 labels this is the fraction
    of positive neighbours, which is what the ROC AUC evaluation in this
    notebook consumes.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours averaged per query row.
    distance_metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Distance used to rank training rows. Any other value makes
        ``compute_distance`` raise ``ValueError``.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Memorise the training set and return ``self``.

        Inputs are coerced with ``np.asarray`` so that pandas objects are
        indexed by position later on. Without this, ``y_train[k_indices]`` on
        a Series performs a *label* lookup, and iterating a DataFrame yields
        column names instead of rows.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of rows.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y have inconsistent lengths: {len(X)} != {len(y)}"
            )
        self.X_train = X
        self.y_train = y
        return self

    def predict(self, X):
        """Return one score per row of ``X`` as a 1-D ``np.ndarray``.

        Each score is the mean label of the ``k`` nearest training rows. If
        fewer than ``k`` training rows exist, all of them are averaged.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If ``k`` is smaller than 1 or the distance metric is unsupported.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNN.predict called before fit")
        if self.k < 1:
            # A non-positive k would select no neighbours and yield NaN scores.
            raise ValueError("k must be a positive integer")

        # Positional row iteration, also for DataFrame input.
        X = np.asarray(X)

        predictions = []
        for sample in X:
            distances = self.compute_distance(sample, self.X_train)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            predictions.append(np.mean(k_nearest_labels))
        return np.array(predictions)

    def compute_distance(self, X1, X2):
        """Distance between ``X1`` and every row of ``X2``.

        ``X1`` is broadcast against ``X2`` and the per-feature differences
        are reduced along axis 1, giving one distance per row of ``X2``.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not 'euclidean' or 'manhattan'.
        """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2)**2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError("Unsupported distance metric")


In [57]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and return scaled feature matrices.

    Steps:
      1. Read both CSV files.
      2. One-hot encode 'Geography' and 'Gender' (first level dropped).
      3. Drop the identifier columns ('id', 'CustomerId', 'Surname') and
         split the 'Exited' target off the training frame.
      4. Align the test columns to the training columns (missing ones are
         filled with 0, extra ones are discarded).
      5. Standardise with a ``StandardScaler`` fitted on the training
         features only and applied to both sets.

    Parameters
    ----------
    train_path : path to the training CSV; must contain 'Exited'.
    test_path : path to the test CSV.

    Returns
    -------
    (X_scaled, y, X_test_scaled)
        Scaled training features, the target as a NumPy array
        (``y.values``), and the scaled test features.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Handle categorical variables.
    # Both frames are pinned to the categories observed in the TRAINING data
    # before encoding. Encoding them independently with drop_first=True lets
    # each frame drop its own "first" level: if the test file lacks the
    # alphabetically first category, a different level is dropped there and
    # those rows are silently encoded as the baseline.
    categorical_cols = ['Geography', 'Gender']
    for col in categorical_cols:
        categories = pd.Categorical(train_data[col]).categories
        train_data[col] = pd.Categorical(train_data[col], categories=categories)
        # Test values never seen in train become NaN and encode as all zeros.
        test_data[col] = pd.Categorical(test_data[col], categories=categories)

    train_data = pd.get_dummies(train_data, columns=categorical_cols, drop_first=True)
    test_data = pd.get_dummies(test_data, columns=categorical_cols, drop_first=True)

    # Separate features and target for train data
    id_cols = ['id', 'CustomerId', 'Surname']
    X = train_data.drop(['Exited'] + id_cols, axis=1)
    y = train_data['Exited']

    # Prepare test data: same columns in the same order as the training
    # features; columns absent from the test frame are created as zeros.
    X_test = test_data.drop(id_cols, axis=1)
    X_test = X_test.reindex(columns=X.columns, fill_value=0)

    # Scale features (statistics come from the training set only)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_test_scaled = scaler.transform(X_test)

    return X_scaled, y.values, X_test_scaled


In [58]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    """Score ``knn`` with stratified k-fold cross-validation.

    The data is split by a shuffled ``StratifiedKFold`` (fixed
    ``random_state=42``). For every fold the model is re-fitted on the
    training part and its predictions on the held-out part are scored with
    ``roc_auc_score``.

    Parameters
    ----------
    X, y : feature matrix and labels, indexed with the integer index arrays
        produced by the splitter.
    knn : object exposing ``fit(X, y)`` and ``predict(X)``; it is re-fitted
        in place on every fold.
    n_splits : number of folds.

    Returns
    -------
    np.ndarray holding one ROC AUC value per fold, in fold order.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    def _fold_auc(fit_idx, held_out_idx):
        # Fit on the fold's training rows, then score the held-out rows.
        knn.fit(X[fit_idx], y[fit_idx])
        held_out_scores = knn.predict(X[held_out_idx])
        return roc_auc_score(y[held_out_idx], held_out_scores)

    fold_aucs = [
        _fold_auc(fit_idx, held_out_idx)
        for fit_idx, held_out_idx in splitter.split(X, y)
    ]
    return np.array(fold_aucs)

In [69]:
# Read both CSVs and turn them into scaled feature matrices
X, y, X_test = preprocess_data('datasets/train.csv', 'datasets/test.csv')

# Grid of hyperparameters evaluated by cross-validation
k_values = [21, 23, 25, 27]
distance_metrics = ['euclidean', 'manhattan']
best_score, best_k, best_metric = 0, 0, ''

# Walk the grid k-major (every metric for one k before moving to the next k)
for k, metric in [(kk, mm) for kk in k_values for mm in distance_metrics]:
    knn = KNN(distance_metric=metric, k=k)
    scores = cross_validate(X, y, knn)
    avg_score = scores.mean()
    print(f"k={k}, metric={metric}, Average ROC AUC: {avg_score}")
    if not avg_score > best_score:
        # Only a strictly better mean AUC replaces the current best.
        continue
    best_score, best_k, best_metric = avg_score, k, metric

print(f"Best parameters: k={best_k}, metric={best_metric}")

# Refit on the complete training set using the winning configuration
knn = KNN(distance_metric=best_metric, k=best_k)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Write the submission file: the test ids alongside the predicted scores
test_data = pd.read_csv('datasets/test.csv')
test_data[['id']].assign(Exited=test_predictions).to_csv('submissions.csv', index=False)

print("Predictions saved to submissions.csv")

k=21, metric=euclidean, Average ROC AUC: 0.9023181512474663
k=21, metric=manhattan, Average ROC AUC: 0.9026619698411809
k=23, metric=euclidean, Average ROC AUC: 0.9024231593471855
k=23, metric=manhattan, Average ROC AUC: 0.9027298893996537
k=25, metric=euclidean, Average ROC AUC: 0.9032117717050794
k=25, metric=manhattan, Average ROC AUC: 0.9040758489452555
k=27, metric=euclidean, Average ROC AUC: 0.9038545440350971
k=27, metric=manhattan, Average ROC AUC: 0.9049324856296138
Best parameters: k=27, metric=manhattan
Predictions saved to submissions.csv
