In [28]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from collections import Counter

In [29]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean', p=2):
        self.k = k
        self.distance_metric = distance_metric
        self.p = p  

    def fit(self, X, y):
        """Store the training data."""
        self.X_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y, dtype=int)

    def compute_distance(self, X_train, x):
        """Compute distances between a given point and training data."""
        X_train = np.asarray(X_train, dtype=float)
        x = np.asarray(x, dtype=float)

        if self.distance_metric == 'euclidean':
            distances = np.linalg.norm(X_train - x, axis=1)
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(X_train - x), axis=1)
        elif self.distance_metric == 'chebyshev':
            distances = np.max(np.abs(X_train - x), axis=1)
        elif self.distance_metric == 'minkowski':
            distances = np.sum(np.abs(X_train - x) ** self.p, axis=1) ** (1 / self.p)
        else:
            raise ValueError("Unknown distance metric")
        return distances

    def predict(self, X):
        """Predict class labels for the input data."""
        y_pred = []
        for x in X:
            distances = self.compute_distance(self.X_train, x)
            k_indices = distances.argsort()[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            
            # Weighted voting based on inverse distances
            k_nearest_distances = distances[k_indices]
            weights = 1 / (k_nearest_distances + 1e-5)
            weighted_vote = Counter()
            for label, weight in zip(k_nearest_labels, weights):
                weighted_vote[label] += weight
            y_pred.append(weighted_vote.most_common(1)[0][0])
        return np.array(y_pred)

    def predict_proba(self, X):
        """Predict probabilities for input data using weighted distances."""
        y_proba = []
        for x in X:
            distances = self.compute_distance(self.X_train, x)
            k_indices = distances.argsort()[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            k_nearest_distances = distances[k_indices]

            # Using inverse distances as weights
            weights = 1 / (k_nearest_distances + 1e-5)
            class_probs = np.zeros(len(np.unique(self.y_train)))
            for label, weight in zip(k_nearest_labels, weights):
                class_probs[label] += weight

            class_probs /= class_probs.sum()  # Normalize to probabilities
            y_proba.append(class_probs)
        return np.array(y_proba)

In [30]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    """Preprocess data by cleaning, scaling, and transforming categorical features."""
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    test_ids = test_data['id'].values

    combined_data = pd.concat([train_data, test_data], keys=['train', 'test'])
    combined_data.reset_index(level=0, inplace=True)
    combined_data.drop(['CustomerId', 'Surname'], axis=1, inplace=True)
    combined_data = pd.get_dummies(combined_data, columns=['Geography', 'Gender'], drop_first=True)

    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
    combined_data[numerical_features] = combined_data[numerical_features].fillna(combined_data[numerical_features].mean())

    scaler = StandardScaler()
    combined_data[numerical_features] = scaler.fit_transform(combined_data[numerical_features])

    train_data_processed = combined_data[combined_data['level_0'] == 'train'].drop('level_0', axis=1)
    test_data_processed = combined_data[combined_data['level_0'] == 'test'].drop('level_0', axis=1)
    y_train = train_data_processed['Exited'].values.astype(int)
    X_train = train_data_processed.drop(['id', 'Exited'], axis=1).values
    X_test = test_data_processed.drop(['id', 'Exited'], axis=1, errors='ignore').values

    return X_train, y_train, X_test, test_ids


In [31]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    """Cross-validate the KNN model and return AUC scores."""
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    auc_scores = []

    for train_index, val_index in skf.split(X, y):
        X_train_fold, X_val_fold = X[train_index], X[val_index]
        y_train_fold, y_val_fold = y[train_index], y[val_index]

        knn.fit(X_train_fold, y_train_fold)
        y_val_proba = knn.predict_proba(X_val_fold)
        auc = roc_auc_score(y_val_fold, y_val_proba[:, 1])  # AUC for binary classification
        auc_scores.append(auc)

    return auc_scores

In [32]:
X, y, X_test, test_ids = preprocess_data('train.csv', 'test.csv')

# Hyperparameter tuning
k_values = [3, 5, 11, 22]
distance_metrics = ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
p_values = [2, 3]

best_score = 0
best_params = {}

for k in k_values:
    for distance_metric in distance_metrics:
        if distance_metric == 'minkowski':
            for p in p_values:
                knn = KNN(k=k, distance_metric=distance_metric, p=p)
                cv_scores = cross_validate(X, y, knn)
                mean_score = np.mean(cv_scores)
                print(f"k={k}, distance_metric={distance_metric}, p={p}, CV Score={mean_score:.4f}")
                if mean_score > best_score:
                    best_score = mean_score
                    best_params = {'k': k, 'distance_metric': distance_metric, 'p': p}
        else:
            knn = KNN(k=k, distance_metric=distance_metric)
            cv_scores = cross_validate(X, y, knn)
            mean_score = np.mean(cv_scores)
            print(f"k={k}, distance_metric={distance_metric}, CV Score={mean_score:.4f}")
            if mean_score > best_score:
                best_score = mean_score
                best_params = {'k': k, 'distance_metric': distance_metric}

# Train final model with best hyperparameters
print("\nBest Parameters:", best_params)
print(f"Best CV Score: {best_score:.4f}")

if best_params['distance_metric'] == 'minkowski':
    knn = KNN(k=best_params['k'], distance_metric=best_params['distance_metric'], p=best_params['p'])
else:
    knn = KNN(k=best_params['k'], distance_metric=best_params['distance_metric'])

knn.fit(X, y)
test_predictions = knn.predict_proba(X_test)[:, 1]  # Output probabilities for the positive class

                
# Save test predictions
submission = pd.DataFrame({
    'id': test_ids, 
    'Exited': test_predictions
})
submission.to_csv('submissions.csv', index=False)

# Read the saved CSV file to check for duplicates
submission_csv = pd.read_csv('submissions.csv')

# Check for duplicates
print("Number of submission IDs in 'submissions.csv':", len(submission_csv['id']))
print("Number of unique submission IDs in 'submissions.csv':", submission_csv['id'].nunique())

if len(submission_csv['id']) != submission_csv['id'].nunique():
    duplicates = submission_csv[submission_csv['id'].duplicated(keep=False)]
    print("Duplicate IDs found in 'submissions.csv':")
    print(duplicates)
else:
    print("No duplicate IDs found in 'submissions.csv'.")

k=3, distance_metric=euclidean, CV Score=0.8531
k=3, distance_metric=manhattan, CV Score=0.8498
k=3, distance_metric=chebyshev, CV Score=0.8517
k=3, distance_metric=minkowski, p=2, CV Score=0.8531
k=3, distance_metric=minkowski, p=3, CV Score=0.8537
k=5, distance_metric=euclidean, CV Score=0.8776
k=5, distance_metric=manhattan, CV Score=0.8784
k=5, distance_metric=chebyshev, CV Score=0.8770
k=5, distance_metric=minkowski, p=2, CV Score=0.8776
k=5, distance_metric=minkowski, p=3, CV Score=0.8770
k=11, distance_metric=euclidean, CV Score=0.9001
k=11, distance_metric=manhattan, CV Score=0.8996
k=11, distance_metric=chebyshev, CV Score=0.8997
k=11, distance_metric=minkowski, p=2, CV Score=0.9001
k=11, distance_metric=minkowski, p=3, CV Score=0.9008
k=22, distance_metric=euclidean, CV Score=0.9110
k=22, distance_metric=manhattan, CV Score=0.9096
k=22, distance_metric=chebyshev, CV Score=0.9070
k=22, distance_metric=minkowski, p=2, CV Score=0.9110
k=22, distance_metric=minkowski, p=3, CV Sco