In [5]:
import numpy as np
import pandas as pd
from collections import Counter
from tqdm import tqdm



In [6]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = []
        for test_point in X:
            # Compute distances from the test point to all training points
            distances = [self.compute_distance(test_point, x_train) for x_train in self.X_train]
            
            # Get the indices of the k closest neighbors
            k_indices = np.argsort(distances)[:self.k]
            
            # Get the labels of the k closest neighbors
            k_neighbor_labels = [self.y_train[i] for i in k_indices]
            
            # Perform majority voting to get the most common label
            most_common_label = Counter(k_neighbor_labels).most_common(1)[0][0]
            
            predictions.append(most_common_label)
        
        return np.array(predictions)

    def compute_distance(self, X1, X2):

        X1 = np.array(X1)
        X2 = np.array(X2)
    
        assert X1.shape == X2.shape, f"Shape mismatch: {X1.shape} vs {X2.shape}"
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2))
        else:
            return np.sum(np.abs(X1 - X2))


In [46]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)



    majority_class = train_data[train_data['Exited'] == 0]  # Customers who stayed
    minority_class = train_data[train_data['Exited'] == 1]  # Customers who exited

    # Oversample the minority class (sample with replacement)
    minority_oversampled = minority_class.sample(len(majority_class), replace=True)

    # Combine the oversampled minority  with the original majority class
    train_data_balanced = pd.concat([majority_class, minority_oversampled])

    train_data_balanced = train_data_balanced.sample(frac=1).reset_index(drop=True)

    
    
    # Handle categorical variables (e.g., 'Gender', 'Geography')
    train_data['Gender'] = train_data['Gender'].map({'Male': 1, 'Female': 0})
    test_data['Gender'] = test_data['Gender'].map({'Male': 1, 'Female': 0})

    # One-hot encoding for 'Geography' in both train and test datasets
    train_data = pd.get_dummies(train_data, columns=['Geography'], drop_first=True)
    test_data = pd.get_dummies(test_data, columns=['Geography'], drop_first=True)

    train_data = train_data.astype({col: 'int64' for col in train_data.select_dtypes('bool').columns})
    test_data = test_data.astype({col: 'int64' for col in test_data.select_dtypes('bool').columns})


    train_data = train_data.drop(['id','CustomerId', 'Surname' , 'HasCrCard' , 'Tenure' , 'CreditScore' , 'EstimatedSalary' , 'Geography_Spain'], axis=1)
    test_data = test_data.drop([ 'id', 'CustomerId', 'Surname', 'HasCrCard' , 'Tenure' , 'CreditScore' , 'EstimatedSalary' , 'Geography_Spain'], axis=1)
    
    # Split features and target in training data
    X_train = train_data.drop('Exited', axis=1)  # All columns except target
    y_train = train_data['Exited']  # Target column

    # Scale numerical features using Min-Max scaling
    numerical_columns = [ 'Age', 'Balance', 'NumOfProducts']
    
    # Apply Min-Max scaling on both train and test datasets
    X_train[numerical_columns] = (X_train[numerical_columns] - X_train[numerical_columns].min()) / (X_train[numerical_columns].max() - X_train[numerical_columns].min())
    test_data[numerical_columns] = (test_data[numerical_columns] - test_data[numerical_columns].min()) / (test_data[numerical_columns].max() - test_data[numerical_columns].min())
    
    return X_train, y_train, test_data

In [11]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits = 3):
    fold_size = len(X) // n_splits
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    cv_scores = []

    # Use tqdm to wrap the loop and display progress
    for i in tqdm(range(n_splits), desc="Cross-Validation Progress"):
        val_indices = indices[i * fold_size:(i + 1) * fold_size]
        train_indices = np.concatenate([indices[:i * fold_size], indices[(i + 1) * fold_size:]])

        # Use .iloc for row indexing in pandas DataFrames
        X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
        y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]

        knn.fit(X_train.values, y_train.values)  # Convert pandas DataFrame to numpy array
        predictions = knn.predict(X_val.values)  # Convert pandas DataFrame to numpy array

        accuracy = np.mean(predictions == y_val.values)  # Compare predictions with actual values
        cv_scores.append(accuracy)

    return cv_scores

In [45]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
#knn = KNN(k=19, distance_metric='euclidean')

# Perform cross-validation
#cv_scores = cross_validate(X, y, knn)

#print("Cross-validation scores:", cv_scores)


# TODO: hyperparamters tuning
#k_values = [49]
#best_k = None
#best_score = 0

#for k in k_values:
        #knn = KNN(k=k)
        #cv_scores = cross_validate(X, y, knn, n_splits=3)
        #mean_cv_score = np.mean(cv_scores)

        #print(f"K = {k}, Mean CV Accuracy: {mean_cv_score:.4f}")
        #if mean_cv_score > best_score:
            #best_score = mean_cv_score
            #best_k = k


#print(best_k , best_score)
print(X.dtypes)
print(X_test.dtypes)

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=19, distance_metric='euclidean')
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('/path/of/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Gender                 int64
Age                  float64
Balance              float64
NumOfProducts        float64
IsActiveMember       float64
Geography_Germany      int64
dtype: object
Gender                 int64
Age                  float64
Balance              float64
NumOfProducts        float64
IsActiveMember       float64
Geography_Germany      int64
dtype: object


TypeError: unsupported operand type(s) for -: 'str' and 'str'