In [42]:
import numpy as np
import pandas as pd

In [43]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.features_train = np.array(X)
        self.labels_train = np.array(y)

    def predict(self, X):
        #batch processing for system memory reasons
        batch_size = 1000
        n_samples = X.shape[0]
        predictions = []
        for start in range(0, n_samples, batch_size):
            end = min(start + batch_size, n_samples)
            batch_X = X[start:end]
            distances = self.compute_distance(batch_X)  # Compute distances in batches

            knearest = np.argpartition(distances, self.k, axis=1)[:, :self.k]  # Get the k nearest neighbors (indices)
            nearestLabels = self.labels_train[knearest].astype(int)  # Get the labels of the k-nearest neighbors

            # Majority vote to determine the prediction for each sample
            batch_predictions = np.array([np.bincount(labels).argmax() for labels in nearestLabels])
            predictions.extend(batch_predictions)

        return np.array(predictions)
    
    #predict method but with the probaility estimates not just the classification
    def predict_proba(self, X):
        X = np.array(X)
        distances = self.compute_distance(X)
        knearest = np.argsort(distances, axis=1)[:, :self.k]
        nearestLabels = self.labels_train[knearest]
            
        #probability
        prob_class_1 = np.sum(nearestLabels == 1, axis=1) / self.k
        probabilities = np.column_stack((1 - prob_class_1, prob_class_1))        
        return probabilities  # Return probabilities as an array
            
    def compute_distance(self, X):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        if self.distance_metric == 'euclidean':
            squared_diff = ((X[:, np.newaxis, :] - self.features_train[np.newaxis, :, :]) ** 2)
            return np.sqrt(squared_diff.sum(axis=2))

In [44]:
# Define data preprocessing function

# weight the features based on correlation
def get_feature_weights(X, y):
    n_features = X.shape[1]
    correlations = np.zeros(n_features)
    for i in range(n_features):
        corr = np.abs(np.corrcoef(X[:, i], y)[0, 1])
        correlations[i] = corr
    return correlations  

def preprocess_data(train_path, test_path):
    print("preprocess begins")
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    print("csv read")
    
    
    # Drop columns which don't contribute any meaningful info
    columns_to_drop = ['id', 'CustomerId', 'Surname', 'EstimatedSalary', 'HasCrCard', 'Tenure', 'CreditScore' ]  # Removing columns that have a low correlation based on a seperate script I ran to calc correlations
    train_data = train_data.drop(columns=columns_to_drop, axis=1)
    test_data = test_data.drop(columns=columns_to_drop, axis=1)
    
    
    X_train = train_data.drop('Exited', axis=1)  # drop label column
    y_train = train_data['Exited']
    X_test = test_data
    
    #turn categorical values into numerical ones
    X_train = pd.get_dummies(X_train, columns=['Geography', 'Gender'], drop_first=True)
    X_test = pd.get_dummies(X_test, columns=['Geography', 'Gender'], drop_first=True)
    
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
        
    print("data processed and weighted")
    
    X_train_np = X_train.to_numpy()
    X_test_np = X_test.to_numpy()
    
    return X_train_np, y_train, X_test_np

In [45]:
# Define cross-validation function

#help function for the ROC AUC
def compute_roc_auc(y_true, y_probs):
    
    y_true = np.array(y_true)
    y_probs = np.array(y_probs)
    sorted_indices = np.argsort(y_probs)[::-1]
    y_true = y_true[sorted_indices]
    y_probs = y_probs[sorted_indices]
    tpr_list = []
    fpr_list = []
    n_pos = np.sum(y_true)  
    n_neg = len(y_true) - n_pos  
    tp = 0  
    fp = 0  
    for i in range(len(y_true)):
        if y_true[i] == 1: 
            tp += 1
        else:  
            fp += 1
        tpr = tp / n_pos 
        fpr = fp / n_neg  
        tpr_list.append(tpr)
        fpr_list.append(fpr)
    auc = np.trapz(tpr_list, fpr_list)
    return auc

#balanced fold splitting
def stratified_k_fold_split(X, y, n_splits=5):
    y = np.array(y)
    unique_classes, y_indices = np.unique(y, return_inverse=True)
    class_counts = np.bincount(y_indices)
    folds = [[] for _ in range(n_splits)]

    for cls in unique_classes:
        cls_indices = np.where(y == cls)[0]
        np.random.shuffle(cls_indices)
        cls_fold_sizes = np.full(n_splits, len(cls_indices) // n_splits)
        cls_fold_sizes[:len(cls_indices) % n_splits] += 1
        start = 0
        for fold_idx, fold_size in enumerate(cls_fold_sizes):
            end = start + fold_size
            folds[fold_idx].extend(cls_indices[start:end])
            start = end

    return folds

#cross validation
def cross_validate(X, y, knn, n_splits=5):
    X = np.array(X)
    y = np.array(y)
    np.random.seed(42) 
    #set up folds using stratified folds to ensure correct splits
    folds = stratified_k_fold_split(X, y, n_splits)
    auc_scores = []
    for i in range(n_splits):
        test_indices = folds[i]
        train_indices = np.hstack([folds[j] for j in range(n_splits) if j != i])
        
        X_train_fold, y_train_fold = X[train_indices], y[train_indices]
        X_valid_fold, y_valid_fold = X[test_indices], y[test_indices]
        
        
        feature_weights = get_feature_weights(X_train_fold, y_train_fold)
        max_weight = np.max(feature_weights)
        feature_weights = feature_weights / max_weight
        
        #scale seperately for each fold to avoid accidently applying data which shouldn't be avalaible
        means = X_train_fold.mean(axis=0)
        stds = X_train_fold.std(axis=0)
        stds[stds == 0] = 1

        # Scale training and validation data
        X_train_fold_scaled = (X_train_fold - means) / stds
        X_valid_fold_scaled = (X_valid_fold - means) / stds
        
        #weight scaled data
        X_train_fold_weighted = X_train_fold_scaled * feature_weights
        X_valid_fold_weighted = X_valid_fold_scaled * feature_weights
        
        # Train and evaluate the model
        knn.fit(X_train_fold_scaled, y_train_fold)
        y_probs = knn.predict_proba(X_valid_fold_scaled)[:, 1]
        auc_score = compute_roc_auc(y_valid_fold, y_probs)
        auc_scores.append(auc_score)
    
    # Return the average AUC score across all folds
    return np.mean(auc_scores)

In [46]:
# Load and preprocess data
print("starting preprocess")
X, y, X_test = preprocess_data('train.csv', 'test.csv')

print("done with preprocess")

k_values = list(range(1, 21)) + list(range(21, 102, 2))

best_k = None
best_score = -1
print("Starting cross validation")
for k in k_values:
    # Create and evaluate model
    knn = KNN(k=k, distance_metric='euclidean')

    # Perform cross-validation
    cv_scores = cross_validate(X, y, knn)
    print("K:", k)
    print("Score:", cv_scores)
    #keep track of the best performing k val
    if cv_scores > best_score:
        best_score = cv_scores
        best_k = k

print(f"Optimal k value: {best_k} with cross-validation score: {best_score}")

# compute feature weights on the full training data
feature_weights = get_feature_weights(X, y)
max_weight = np.max(feature_weights)
feature_weights = feature_weights / max_weight  # normalize weights
means = X.mean(axis=0)
stds = X.std(axis=0)
stds[stds == 0] = 1

# scale and weight the features
X_scaled = ((X - means) / stds) * feature_weights
X_test_scaled = ((X_test - means) / stds) * feature_weights

# train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(X_scaled, y)
test_predictions = knn.predict(X_test_scaled)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

starting preprocess
preprocess begins
csv read
data processed and weighted
done with preprocess
Starting cross validation
K: 1
Score: 0.7057555705063823
K: 2
Score: 0.8011159663473872
K: 3
Score: 0.8497125872796436
K: 4
Score: 0.8723657465626042
K: 5
Score: 0.8801135232694657
K: 6
Score: 0.8911781097440828
K: 7
Score: 0.9008754655405289
K: 8
Score: 0.9023092531464385
K: 9
Score: 0.9047624110979726
K: 10
Score: 0.9075603404354723
K: 11
Score: 0.90980393796474
K: 12
Score: 0.9115029404426209
K: 13
Score: 0.9129114707284112
K: 14
Score: 0.9130968206381762
K: 15
Score: 0.9138915474916379
K: 16
Score: 0.9145265133795771
K: 17
Score: 0.9155487769814211
K: 18
Score: 0.9165251436336405
K: 19
Score: 0.9177439876029473
K: 20
Score: 0.9176366666196897
K: 21
Score: 0.9189590426811419
K: 23
Score: 0.9200648583773166
K: 25
Score: 0.9209266883211178
K: 27
Score: 0.9211609265330336
K: 29
Score: 0.9226109504295337
K: 31
Score: 0.922155452471749
K: 33
Score: 0.921405512809099
K: 35
Score: 0.922553594583