In [23]:
import numpy as np
import pandas as pd

In [24]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # TODO: Implement the predict method
        # Implement the predict method: Predict for multiple data points
        predictions = []
        for x in X:
            # Compute distances between x and all examples in the training set
            distances = [self.compute_distance(x, x_train) for x_train in self.X_train]
            # Sort by distance and return the indices of the first k neighbors
            k_indices = np.argsort(distances)[:self.k]
            # Extract the labels of the k nearest neighbor training samples
            k_nearest_labels = [self.y_train[i] for i in k_indices]
            # Find the most common class label
            most_common_label = np.bincount(k_nearest_labels).argmax()
            predictions.append(most_common_label)
        return predictions

    def compute_distance(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2)**2))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2))
        

In [25]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.
    # Handle missing values only for numeric columns
    train_data.fillna(train_data.mean(numeric_only=True), inplace=True)
    test_data.fillna(test_data.mean(numeric_only=True), inplace=True)

    # Handle categorical variables by creating dummies
    train_data = pd.get_dummies(train_data, drop_first=True)
    test_data = pd.get_dummies(test_data, drop_first=True)

    # Get missing columns that are in train_data but not in test_data
    missing_cols = set(train_data.columns) - set(test_data.columns)
    # Add missing columns to test_data and fill with 0
    for col in missing_cols:
        test_data[col] = 0

    # Ensure test_data has the same column order as train_data
    test_data = test_data[train_data.columns]

    # Scale the features using StandardScaler
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()

    X_train = scaler.fit_transform(train_data.drop('Exited', axis=1))
    y_train = train_data['Exited'].values
    X_test = scaler.transform(test_data.drop('Exited', axis=1, errors='ignore'))
    return X_train, y_train, X_test


In [26]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    from sklearn.model_selection import KFold
    from sklearn.metrics import roc_auc_score

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    auc_scores = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        knn.fit(X_train, y_train)
        y_val_pred = knn.predict(X_val)

        auc = roc_auc_score(y_val, y_val_pred)
        auc_scores.append(auc)

    return auc_scores

In [28]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
best_k = 5

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(X, y)
test_predictions = knn.predict(X_test)


# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0


Cross-validation scores: [0.6122342939388405, 0.5985904039034969, 0.6028073790466892, 0.6068004459308807, 0.600260272662075]
