In [43]:
import numpy as np
import pandas as pd

In [44]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.features_train = np.array(X)
        self.labels_train = np.array(y)

    def predict(self, X):
        #batch processing for system memory reasons
        batch_size = 1000
        n_samples = X.shape[0]
        predictions = []
        for start in range(0, n_samples, batch_size):
            end = min(start + batch_size, n_samples)
            batch_X = X[start:end]
            distances = self.compute_distance(batch_X)  # Compute distances in batches

            knearest = np.argpartition(distances, self.k, axis=1)[:, :self.k]  # Get the k nearest neighbors (indices)
            nearestLabels = self.labels_train[knearest].astype(int)  # Get the labels of the k-nearest neighbors

            # Majority vote to determine the prediction for each sample
            batch_predictions = np.array([np.bincount(labels).argmax() for labels in nearestLabels])
            predictions.extend(batch_predictions)

        return np.array(predictions)
    
    #predict method but with the probaility estimates not just the classification
    def predict_proba(self, X):
        X = np.array(X)
        distances = self.compute_distance(X)
        knearest = np.argsort(distances, axis=1)[:, :self.k]
        nearestLabels = self.labels_train[knearest]
            
        #probability
        prob_class_1 = np.sum(nearestLabels == 1, axis=1) / self.k
        probabilities = np.column_stack((1 - prob_class_1, prob_class_1))        
        return probabilities  # Return probabilities as an array
            
    def compute_distance(self, X):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        if self.distance_metric == 'euclidean':
            squared_diff = ((X[:, np.newaxis, :] - self.features_train[np.newaxis, :, :]) ** 2)
            return np.sqrt(squared_diff.sum(axis=2))

In [55]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    print("preprocess begins")
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    print("csv read")
    
    
    # Drop columns which don't contribute any meaningful info
    columns_to_drop = ['id', 'CustomerId', 'Surname']  # Add any other columns that are not useful for modeling
    train_data = train_data.drop(columns=columns_to_drop, axis=1)
    test_data = test_data.drop(columns=columns_to_drop, axis=1)
    
    
    X_train = train_data.drop('Exited', axis=1)  # drop label column
    y_train = train_data['Exited']
    X_test = test_data
    
    #turn categorical values into numerical ones
    X_train = pd.get_dummies(X_train, columns=['Geography', 'Gender'], drop_first=True)
    X_test = pd.get_dummies(X_test, columns=['Geography', 'Gender'], drop_first=True)
    
    print("data processed")
    
    X_train_np = X_train.to_numpy()
    X_test_np = X_test.to_numpy()
        
    means = X_train_np.mean(axis=0)
    stds = X_train_np.std(axis=0)
    
    #normalize
    X_train_scaled = (X_train_np - means) /stds
    X_test_scaled = (X_test_np - means) /stds
    print("data normalized")
    return X_train_scaled, y_train , X_test_scaled 

preprocess begins
csv read
id 0.0032511003831268144
CustomerId -0.0038975954595392368


TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [46]:
# Define cross-validation function

#help function for the ROC AUC
def compute_roc_auc(y_true, y_probs):
    
    y_true = np.array(y_true)
    y_probs = np.array(y_probs)
    sorted_indices = np.argsort(y_probs)[::-1]
    y_true = y_true[sorted_indices]
    y_probs = y_probs[sorted_indices]
    tpr_list = []
    fpr_list = []
    n_pos = np.sum(y_true)  
    n_neg = len(y_true) - n_pos  
    tp = 0  
    fp = 0  
    for i in range(len(y_true)):
        if y_true[i] == 1: 
            tp += 1
        else:  
            fp += 1
        tpr = tp / n_pos 
        fpr = fp / n_neg  
        tpr_list.append(tpr)
        fpr_list.append(fpr)
    auc = np.trapz(tpr_list, fpr_list)
    return auc

#cross validation
def cross_validate(X, y, knn, n_splits=5):
    #set up fold size and shuffle data
    n_samples = len(y)
    indices = np.arange(n_samples)
    np.random.shuffle(indices)
    fold_size = n_samples // n_splits
    auc_scores = []
    for i in range(n_splits):
        # Split data 
        start = i * fold_size
        end = start + fold_size
        test_indices = indices[start:end]
        train_indices = np.concatenate([indices[:start], indices[end:]])
        
        X_train, y_train = X[train_indices], y[train_indices]
        X_test, y_test = X[test_indices], y[test_indices]
        
        # pass the data to the knn fit method
        knn.fit(X_train, y_train)
        
        # pass the data to the prediction method
        y_probs = knn.predict_proba(X_test)[:, 1]
        
        # get ROC AUC
        auc_score = compute_roc_auc(y_test, y_probs)
        auc_scores.append(auc_score)
    
    # Return the average AUC score across all folds
    return np.mean(auc_scores)

In [48]:
# Load and preprocess data
print("starting preprocess")
X, y, X_test = preprocess_data('train.csv', 'test.csv')

print("done with preprocess")

k_values = list(range(1, 26))

best_k = None
best_score = -1
print("Starting cross validation")
for k in k_values:
    # Create and evaluate model
    knn = KNN(k=k, distance_metric='euclidean')

    # Perform cross-validation
    cv_scores = cross_validate(X, y, knn)
    print("K:", k)
    print("Score:", cv_scores)
    #keep track of the best performing k val
    if cv_scores > best_score:
        best_score = cv_scores
        best_k = k

print(f"Optimal k value: {best_k} with cross-validation score: {best_score}")

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

starting preprocess
preprocess begins
csv read
data processed
data normalized
done with preprocess
Starting cross validation
K: 1
Score: 0.7556296645881573
K: 2
Score: 0.8157363497891282
K: 3
Score: 0.8467659881858364
K: 4
Score: 0.864544922060723
K: 5
Score: 0.866499201458623
K: 6
Score: 0.8779729060648493
K: 7
Score: 0.8834357798225849
K: 8
Score: 0.882680774307268
K: 9
Score: 0.8906144040113977
K: 10
Score: 0.8914701156901866
K: 11
Score: 0.8937834584341365
K: 12
Score: 0.8942002197792875
K: 13
Score: 0.895127179072151
K: 14
Score: 0.8975353410372131
K: 15
Score: 0.8955310201197942
K: 16
Score: 0.8977914445934132
K: 17
Score: 0.8970219261917904
K: 18
Score: 0.8998553569829646
K: 19
Score: 0.9015842467246473
K: 20
Score: 0.8990513873098644
K: 21
Score: 0.901711649794174
K: 22
Score: 0.9022981914144316
K: 23
Score: 0.9015310488330919
K: 24
Score: 0.9017119716859714
K: 25
Score: 0.9026482059635386
Optimal k value: 25 with cross-validation score: 0.9026482059635386
