In [13]:
import numpy as np
import pandas as pd
from collections import Counter

In [14]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Stores the training data."""
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def predict_proba(self, X):
        """Predicts the probability of the positive class."""
        probs = []
        for x in X:
            distances = self._compute_distance(self.X_train, x)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            proba = sum(k_nearest_labels) / self.k  # Proportion of 1s in the nearest neighbors
            probs.append(proba)
        #[np.mean(self.y_train[np.argsort(self._compute_distance(self.X_train, x))[:self.k]]) for x in X]
        return probs

    def _compute_distance(self, X1, X2):
        """Computes the distance between input and training points."""
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")

In [40]:
def standard_scaler(data, features):
    """Scales numeric features to have mean 0 and variance 1."""
    data = data[features].to_numpy()
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    return (data - mean) / std

def one_hot_encode(data, features):
    """Performs one-hot encoding on categorical features."""
    encoded_arrays = []
    for feature in features:
        unique_vals = data[feature].unique()
        mapping = {val: idx for idx, val in enumerate(unique_vals)}
        encoded = np.array([mapping[val] for val in data[feature]])
        one_hot = np.zeros((encoded.size, len(unique_vals)))
        one_hot[np.arange(encoded.size), encoded] = 1
        encoded_arrays.append(one_hot)
    return np.hstack(encoded_arrays)

def ordinal_encode(data, features):
    encoded_arrays = []
    for feature in features:
        unique_vals = [np.unique(feature[:, i]) for i in range(feature.shape[1])]
        encoded = np.zeros_like(feature, dtype=int)

        for i, categories in enumerate(unique_vals):
            cat_val = {cat: idx for idx, cat in enumerate(categories)}
            # Map unknown categories to the specified unknown_value
            encoded[:, i] = np.array([
                cat_val.get(i/x) for x in feature[:, i]
            ])
            encoded_arrays.append(encoded)
    return np.hstack(encoded_arrays)

def preprocess_data(train_path, test_path):
    # Load datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Define feature groups
    categorical_features = ['Geography', 'Gender']
    numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']
    ordinal_features = ['NumOfProducts', 'HasCrCard', 'IsActiveMember']

    # Scale numeric features
    train_numeric = standard_scaler(train_data, numeric_features)
    test_numeric = standard_scaler(test_data, numeric_features)

    # One-hot encode categorical features
    train_categorical = one_hot_encode(train_data, categorical_features)
    test_categorical = one_hot_encode(test_data, categorical_features)
    # Scale ordinal features
    train_ordinal = standard_scaler(train_data, ordinal_features)
    test_ordinal = standard_scaler(test_data, ordinal_features)

    # Combine numeric and categorical features
    train_processed = np.hstack([train_numeric, train_categorical, train_ordinal])
    test_processed = np.hstack([test_numeric, test_categorical, test_ordinal])

    # Extract labels from train data
    train_labels = train_data['Exited'].values

    return train_processed, train_labels, test_processed

# Example usage
train_features, train_labels, test_features = preprocess_data(
    'C:/Users/cicil/Desktop/New folder/train.csv',
    'C:/Users/cicil/Desktop/New folder/test.csv'
)


In [26]:
def kfold_split(X, y, n_splits=5, random_state=42):
    """Manually perform K-Fold cross-validation split."""
    np.random.seed(random_state)
    indices = np.random.permutation(len(X))
    fold_size = len(X) // n_splits

    for i in range(n_splits):
        test_indices = indices[i * fold_size:(i + 1) * fold_size]
        train_indices = np.setdiff1d(indices, test_indices)
        yield X[train_indices], X[test_indices], y[train_indices], y[test_indices]

def roc_auc_score(y_true, y_score):
    #scikit-learn binary_clf_curve + roc_auc
    # sort predicted scores in descending order
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)
    y_sort = np.argsort(y_score)[::-1]
    y_true = y_true[y_sort.astype(int)]
    y_score = y_score[y_sort.astype(int)]

    graph_intervals = np.where(np.diff(y_score))[0]
    last_bar = np.array([y_true.size - 1])
    threshold= np.hstack((graph_intervals, last_bar))

    tps = np.cumsum(y_true)[threshold]
    fps = (1 + threshold) - tps

    # convert count to rate
    tpr = tps / tps[-1]
    fpr = fps / fps[-1]

    # compute AUC using the trapezoidal rule;
    # appending an extra 0 is just to ensure the length matches
    zero = np.array([0])
    tpr_diff = np.hstack((np.diff(tpr), zero))
    fpr_diff = np.hstack((np.diff(fpr), zero))
    auc = np.dot(tpr, fpr_diff) + np.dot(tpr_diff, fpr_diff) / 2
    return auc

def cross_validate(X, y, knn, n_splits=5):
    """Perform K-Fold cross-validation using the KNN model."""
    auc_scores = []

    # Use the manual K-Fold splitter
    for X_train, X_test, y_train, y_test in kfold_split(X, y, n_splits):
        knn.fit(X_train, y_train)  # Fit the model on training data

        # Predict probabilities for the test data
        y_proba = []
        for x in X_test:
            distances = knn._compute_distance(knn.X_train, x)
            k_indices = np.argsort(distances)[:knn.k]
            k_nearest_labels = knn.y_train[k_indices]
            proba = sum(k_nearest_labels) / knn.k  # Proportion of 1s in the nearest neighbors
            y_proba.append(proba)
            
        # Calculate ROC AUC score manually
        #score = roc_auc_score_numpy(y_test, y_proba)
        score = roc_auc_score(y_test, y_proba)
        auc_scores.append(score)

    return np.mean(auc_scores)

In [42]:
import time
# Load and preprocess data
X, y, X_test = preprocess_data(
    'C:/Users/cicil/Desktop/New folder/train.csv', 
    'C:/Users/cicil/Desktop/New folder/test.csv'
)

# Initialize variables to track the best hyperparameters
best_score = 0
best_k = 0
best_metric = ''

# Hyperparameter tuning loop
start_time = time.time()  # Track total time

for k in range(51, 102, 10):  # Test k values from 100 to 140
    for metric in ['euclidean']: #, 'manhattan']: #  # Test both distance metrics
        print(f"\nEvaluating: k={k}, metric={metric}")

        # Create KNN instance and cross-validate
        knn = KNN(k=k, distance_metric=metric)  
        cv_score = cross_validate(X, y, knn, n_splits=5)  

        print(f"CV Score for k={k}, metric={metric}: {cv_score:.4f}")

        # Update the best hyperparameters if the score improves
        if cv_score > best_score:
            best_score = cv_score
            best_k = k
            best_metric = metric

# Total time taken for hyperparameter tuning
elapsed_time = time.time() - start_time

# Output the best hyperparameters and score
print(f"\nBest KNN Parameters: k={best_k}, Metric={best_metric} with ROC AUC Score: {best_score:.4f}")
print(f"Hyperparameter tuning completed in {elapsed_time:.2f} seconds.")


Evaluating: k=51, metric=euclidean
CV Score for k=51, metric=euclidean: 0.9106

Evaluating: k=61, metric=euclidean
CV Score for k=61, metric=euclidean: 0.9109

Evaluating: k=71, metric=euclidean
CV Score for k=71, metric=euclidean: 0.9099

Evaluating: k=81, metric=euclidean
CV Score for k=81, metric=euclidean: 0.9095

Evaluating: k=91, metric=euclidean
CV Score for k=91, metric=euclidean: 0.9090

Evaluating: k=101, metric=euclidean
CV Score for k=101, metric=euclidean: 0.9083

Best KNN Parameters: k=61, Metric=euclidean with ROC AUC Score: 0.9109
Hyperparameter tuning completed in 134.65 seconds.


In [36]:
optimal_knn = KNN(k=best_k, distance_metric=best_metric)
optimal_knn.fit(X, y)

In [38]:
# Make predictions on the test set
test_predictions = optimal_knn.predict_proba(X_test)
print(test_predictions)
test_01 = [1 if tp > 0.5 else 0 for tp in test_predictions]
print(sum(test_01))
# Save test predictions
submission_df = pd.DataFrame({'id': pd.read_csv('C:/Users/cicil/Desktop/New folder/test.csv')['id'], 'Exited': test_01})
submission_df.to_csv('submissions_k51_01.csv', index=False)



[0.0, 0.0196078431372549, 0.11764705882352941, 0.19607843137254902, 0.3137254901960784, 0.0, 0.0784313725490196, 0.0196078431372549, 0.0, 0.0392156862745098, 0.21568627450980393, 0.13725490196078433, 0.0196078431372549, 0.23529411764705882, 0.058823529411764705, 0.0196078431372549, 0.0, 0.2549019607843137, 0.7843137254901961, 0.0, 0.0, 0.0, 0.0, 0.0392156862745098, 0.23529411764705882, 0.7058823529411765, 0.0196078431372549, 0.0784313725490196, 0.2549019607843137, 0.47058823529411764, 0.0196078431372549, 0.17647058823529413, 0.0196078431372549, 0.6666666666666666, 0.11764705882352941, 0.0392156862745098, 0.0196078431372549, 0.0392156862745098, 0.17647058823529413, 0.19607843137254902, 0.6666666666666666, 0.21568627450980393, 0.09803921568627451, 0.21568627450980393, 0.058823529411764705, 0.0, 0.45098039215686275, 0.0392156862745098, 0.0, 0.47058823529411764, 0.0, 0.0196078431372549, 0.9019607843137255, 0.058823529411764705, 0.21568627450980393, 0.43137254901960786, 0.0196078431372549, 