## ML Project

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the bank customer dataset that the exploratory cells below work on.
df = pd.read_csv("Bank.csv")

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
df.info()
print(df.describe())
print(df.head())

In [None]:
# For every column: True when none of its values is missing.
non_null_check = df.notna().all()
print("Non-null check for each column:", non_null_check, sep="\n")

In [None]:
# Stacked histogram (with KDE overlay) of 'credit_score', coloured by 'churn'.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='credit_score', hue='churn', multiple="stack", kde=True, ax=ax)
ax.set_title('Distribution of Credit Score by Churn')
plt.show()

In [None]:
# Bar counts of each 'gender' value, split by 'churn'.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='gender', hue='churn', ax=ax)
ax.set_title('Gender Distribution by Churn')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Stacked histogram (with KDE overlay) of 'age', coloured by 'churn'.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='age', hue='churn', multiple="stack", kde=True, ax=ax)
ax.set_title('Distribution of Age by Churn')
plt.show()

In [None]:
# Stacked histogram (with KDE overlay) of 'balance', coloured by 'churn'.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='balance', hue='churn', multiple="stack", kde=True, ax=ax)
ax.set_title('Distribution of Balance by Churn')
plt.show()

In [None]:
# Bar counts of each 'products_number' value, split by 'churn'.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='products_number', hue='churn', ax=ax)
ax.set_title('Distribution of Products Number by Churn')
plt.show()

In [None]:
# Bar counts of each 'credit_card' value, split by 'churn'.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='credit_card', hue='churn', ax=ax)
ax.set_title('Distribution of Credit Card Holders by Churn')
plt.show()

In [None]:
# Bar counts of each 'active_member' value, split by 'churn'.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='active_member', hue='churn', ax=ax)
ax.set_title('Distribution of Active Members by Churn')
plt.show()

In [None]:
# Stacked histogram (with KDE overlay) of 'estimated_salary', coloured by 'churn'.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='estimated_salary', hue='churn', multiple="stack", kde=True, ax=ax)
ax.set_title('Estimated Salary Distribution by Churn')
ax.set_xlabel('Estimated Salary')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Columns that take part in the correlation analysis (the target 'churn' included).
numeric_cols = ['credit_score', 'age', 'balance', 'products_number', 'credit_card', 'active_member','estimated_salary', 'churn']
numeric_df = df[numeric_cols]

# Annotated heatmap of the pairwise correlations between those columns.
correlation_matrix = numeric_df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix with Churn')
plt.show()


In [None]:
# Correlation of every selected column with 'churn', largest value first.
churn_column_of_correlations = df[numeric_cols].corr()['churn']
correlation_with_churn = churn_column_of_correlations.sort_values(ascending=False)

# Bar chart of those correlations; the target's own entry is dropped first.
fig, ax = plt.subplots(figsize=(10, 6))
correlation_with_churn.drop('churn').plot(kind='bar', ax=ax)
ax.set_title('Correlation of Numeric Features with Churn')
plt.show()


In [None]:
from scipy.stats import ttest_ind, chi2_contingency

# Test each feature against 'churn': a two-sample t-test for int64/float64
# columns, a chi-squared test of independence for object (string) columns.
# Columns of any other dtype produce no row in the result table.
features_to_analyze = ['credit_score', 'age', 'balance', 'products_number', 'estimated_salary', 'gender', 'credit_card', 'active_member']
results = []
churned_rows = df['churn'] == 1
retained_rows = df['churn'] == 0
for feature in features_to_analyze:
    column = df[feature]
    column_dtype = column.dtype
    if column_dtype == 'float64' or column_dtype == 'int64':
        t_stat, p_value = ttest_ind(column[churned_rows], column[retained_rows])
        results.append({'Feature': feature, 'Test': 't-test', 'Stat': t_stat, 'P-value': p_value})
    elif column_dtype == 'object':
        contingency_table = pd.crosstab(column, df['churn'])
        chi2_stat, p_value, _, _ = chi2_contingency(contingency_table)
        results.append({'Feature': feature, 'Test': 'Chi-squared test', 'Stat': chi2_stat, 'P-value': p_value})
results_df = pd.DataFrame(results)
print(results_df)


## Binary logistic regression

In [None]:
# Path of the CSV file read by the NumPy-based loaders in the cells below.
bank_dataset = "Bank.csv"
# Read CSV column index 4 (header row skipped) as the float-valued age feature.
age=np.genfromtxt(bank_dataset,delimiter=",",skip_header=1,usecols=[4]).astype(float)

In [None]:
# Alias of `age` (same array object, no copy) used by the single-feature models.
age_vector=age
print(age_vector)

In [None]:
# Read the target variable from CSV column index 11 as integers.
churn=np.genfromtxt(bank_dataset,delimiter=",",skip_header=1,usecols=[11]).astype(int)
# Alias of `churn` (same array object) used as the label vector by the models below.
churn_vector=churn
print(churn_vector)

In [None]:
# Read CSV column index 9 (header row skipped) as a float feature named activeMem.
activeMem=np.genfromtxt(bank_dataset,delimiter=",",skip_header=1,usecols=[9]).astype(float)

In [None]:
def augment_data_matrix(data_input):
    """Return *data_input* with a column of ones prepended.

    A 1-D input of length n becomes an (n, 2) matrix; an (n, d) input becomes
    (n, d + 1). The ones column lets the first weight act as an intercept.
    """
    intercept_column = np.ones(data_input.shape[0])
    return np.column_stack((intercept_column, data_input))

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), applied element-wise."""
    decayed = np.exp(-z)
    return 1 / (decayed + 1)

def model_function(data_matrix, weights):
    """Return the linear scores of *data_matrix* under *weights*.

    Length-one axes of *weights* are squeezed away first, so a column vector
    of shape (d, 1) yields a flat score vector rather than an (n, 1) matrix.
    """
    flat_weights = weights.squeeze()
    return np.matmul(data_matrix, flat_weights)

def binary_logistic_regression_cost_function(data_matrix, data_labels, weights):
    """Return the mean binary cross-entropy of the logistic model.

    Parameters:
        data_matrix: inputs passed to ``model_function`` (one row per sample).
        data_labels: labels, expected to be 0 or 1, one per sample.
        weights: model weights passed to ``model_function``.

    Returns:
        The average of -y*log(p) - (1-y)*log(1-p) with p = sigmoid(score).
    """
    scores = model_function(data_matrix, weights)
    # With p = sigmoid(z) the per-sample loss equals log(1 + exp(z)) - y*z.
    # Evaluating it through logaddexp avoids log(0) (inf/NaN) once the
    # sigmoid saturates to exactly 0 or 1 for large |z|.
    return np.mean(np.logaddexp(0, scores) - data_labels * scores)

def binary_logistic_regression_gradient(data_matrix, data_labels, weights):
    """Return the gradient of the mean cross-entropy with respect to *weights*.

    Computed as X^T (sigmoid(X w) - y) divided by the number of labels.
    """
    predicted_probabilities = sigmoid(model_function(data_matrix, weights))
    residuals = predicted_probabilities - data_labels
    return (data_matrix.T @ residuals) / len(data_labels)

def gradient_descent(objective, gradient, initial_weights, step_size, no_of_iterations, print_output=None):
    """Minimise *objective* with fixed-step gradient descent.

    Parameters:
        objective: callable mapping weights to the value being minimised; only
            evaluated on logging iterations.
        gradient: callable mapping weights to the gradient of the objective.
        initial_weights: starting point (array-like); never modified.
        step_size: factor applied to the gradient in every update.
        no_of_iterations: number of update steps.
        print_output: log the objective every this many iterations; ``None``
            or ``0`` disables logging.

    Returns:
        ``(weights, objective_values)``: the final weights and the objective
        values recorded on logging iterations (empty when logging is off).
    """
    objective_values = []
    # Work on a float copy: an integer start vector would make the in-place
    # update below raise a casting error.
    weights = np.array(initial_weights, dtype=float)

    for counter in range(no_of_iterations):
        weights -= step_size * gradient(weights)

        # A falsy interval (None or 0) turns logging off instead of dividing by zero.
        if print_output and counter % print_output == 0:
            current_value = objective(weights)
            objective_values.append(current_value)
            print(f"Iteration {counter}, Objective Value: {current_value}")

    return weights, objective_values

def train_logistic_regression(data_matrix, data_labels, initial_weights, step_size, epochs=1000):
    """Fit logistic-regression weights by gradient descent and return them.

    The cross-entropy cost and its gradient are bound to the given data and
    handed to ``gradient_descent``, which logs the cost every 100 iterations.
    """
    def cost(weights):
        return binary_logistic_regression_cost_function(data_matrix, data_labels, weights)

    def cost_gradient(weights):
        return binary_logistic_regression_gradient(data_matrix, data_labels, weights)

    fitted_weights, _logged_costs = gradient_descent(
        cost, cost_gradient, initial_weights, step_size, epochs, print_output=100
    )
    return fitted_weights

def KFold_split(data_size, K):
    """Split a random permutation of ``range(data_size)`` into K index folds.

    Fold sizes differ by at most one: the first ``data_size % K`` folds get
    one extra index. Uses the global NumPy random state.
    """
    shuffled = np.random.permutation(data_size)
    base_size, extra = divmod(data_size, K)

    folds = []
    start = 0
    for fold_id in range(K):
        stop = start + base_size + (1 if fold_id < extra else 0)
        folds.append(shuffled[start:stop])
        start = stop
    return folds

def calculate_accuracy(y_true, y_pred):
    """Return the fraction of labels matched by the rounded predictions."""
    hard_predictions = np.round(y_pred)
    matches = y_true == hard_predictions
    return np.mean(matches)

def KFold_cross_validation(data_matrix, data_outputs, K, model_evaluation, error_evaluation):
    """Run K-fold cross-validation for a logistic model.

    For every fold the model is trained on the other folds via
    ``model_evaluation(inputs, outputs)`` and scored on the held-out fold with
    ``error_evaluation(inputs, outputs, weights)`` and the rounded-sigmoid accuracy.

    Returns:
        ``(optimal_weights, mean_error, mean_accuracy)`` where
        ``optimal_weights`` comes from a final fit on the complete data set.
    """
    folds = KFold_split(len(data_matrix), K)
    fold_errors = []
    fold_accuracies = []

    for held_out, validation_idx in enumerate(folds):
        training_idx = np.concatenate([fold for j, fold in enumerate(folds) if j != held_out])
        fold_weights = model_evaluation(data_matrix[training_idx], data_outputs[training_idx])

        validation_inputs = data_matrix[validation_idx]
        validation_outputs = data_outputs[validation_idx]
        probabilities = sigmoid(model_function(validation_inputs, fold_weights))

        fold_errors.append(error_evaluation(validation_inputs, validation_outputs, fold_weights))
        fold_accuracies.append(calculate_accuracy(validation_outputs, probabilities))

    # Final model: refit on every sample once the folds have been scored.
    optimal_weights = model_evaluation(data_matrix, data_outputs)

    return optimal_weights, sum(fold_errors) / K, sum(fold_accuracies) / K


# Logistic regression on the single feature `age_vector`:
# prepend the column of ones so the first weight acts as the intercept.
data_matrix = augment_data_matrix(age_vector)


# Start gradient descent from the all-zero weight vector.
initial_weights = np.zeros(data_matrix.shape[1])


# Step size scaled by n / ||x||^2 of the raw (un-augmented) inputs.
# NOTE(review): the origin of the 3.9 factor is not shown here — confirm.
step_size = 3.9 * len(age_vector) / (np.linalg.norm(age_vector))**2


# 5-fold cross-validation; the lambda captures initial_weights and step_size
# from this cell, and the cross-entropy cost scores each held-out fold.
optimal_weights, average_validation_error, average_accuracy = KFold_cross_validation(data_matrix, churn_vector, K=5,
                                                           model_evaluation=lambda x, y: train_logistic_regression(x, y, initial_weights, step_size),
                                                           error_evaluation=binary_logistic_regression_cost_function)

print(f"Optimal Weights: {optimal_weights}")
print(f"Average Validation Error: {average_validation_error}")
print(f"Average Accuracy: {average_accuracy}")


In [None]:

# Logistic regression on the single feature `activeMem`, with the ones column prepended.
data_matrix = augment_data_matrix(activeMem)

# Start gradient descent from the all-zero weight vector.
initial_weights = np.zeros(data_matrix.shape[1])


# Step size scaled by n / ||x||^2 of the raw inputs.
# NOTE(review): the origin of the 3.9 factor is not shown here — confirm.
step_size = 3.9 * len(activeMem) / (np.linalg.norm(activeMem))**2

# 5-fold cross-validation with the cross-entropy cost as the validation error.
optimal_weights, average_validation_error, average_accuracy = KFold_cross_validation(data_matrix, churn_vector, K=5,
                                                           model_evaluation=lambda x, y: train_logistic_regression(x, y, initial_weights, step_size),
                                                           error_evaluation=binary_logistic_regression_cost_function)

print(f"Optimal Weights: {optimal_weights}")
print(f"Average Validation Error: {average_validation_error}")
print(f"Average Accuracy: {average_accuracy}")

In [None]:
# Two-column feature matrix: age in column 0, activeMem in column 1.
data_inputs_2Variables=np.c_[age,activeMem]

In [None]:
def standardise(data_matrix):
    """Standardise every column of *data_matrix* to zero mean and unit std.

    Returns:
        ``(standardised_matrix, row_of_means, row_of_stds)``: the transformed
        data plus the per-column mean and (population) standard deviation used.
    """
    column_means = data_matrix.mean(axis=0)
    column_stds = data_matrix.std(axis=0)
    centred = data_matrix - column_means
    return centred / column_stds, column_means, column_stds

In [None]:
# Replace the two-feature matrix by its column-standardised version and keep
# the per-column means and standard deviations that were used.
data_inputs_2Variables,data_row_of_means, data_row_of_stds=standardise(data_inputs_2Variables)

In [None]:

# Logistic regression on the standardised two-feature matrix, with the ones column prepended.
data_matrix = augment_data_matrix(data_inputs_2Variables)

# Start gradient descent from the all-zero weight vector.
initial_weights = np.zeros(data_matrix.shape[1])


# Step size scaled by n over the squared norm of the whole input matrix
# (np.linalg.norm of a 2-D array is the Frobenius norm).
# NOTE(review): the origin of the 3.9 factor is not shown here — confirm.
step_size = 3.9 * len(data_inputs_2Variables) / (np.linalg.norm(data_inputs_2Variables))**2


# 5-fold cross-validation with the cross-entropy cost as the validation error.
optimal_weights, average_validation_error, average_accuracy = KFold_cross_validation(data_matrix, churn_vector, K=5,
                                                           model_evaluation=lambda x, y: train_logistic_regression(x, y, initial_weights, step_size),
                                                           error_evaluation=binary_logistic_regression_cost_function)

print(f"Optimal Weights: {optimal_weights}")
print(f"Average Validation Error: {average_validation_error}")
print(f"Average Accuracy: {average_accuracy}")

### Test again for other variables

In [None]:
# Read CSV column index 6 as the balance feature and index 7 as the products count.
balance = np.genfromtxt(bank_dataset, delimiter=",", skip_header=1, usecols=[6]).astype(float)
productsNum = np.genfromtxt(bank_dataset, delimiter=",", skip_header=1, usecols=[7]).astype(float)
# Encode CSV column index 3 as 0 for 'Male' and 1 for any other value.
# Depending on the NumPy version / encoding setting, genfromtxt hands the
# converter either bytes or str, so both spellings are accepted; matching
# only b'Male' would encode every row as 1 when str values are passed.
gender = np.genfromtxt(bank_dataset, delimiter=",", skip_header=1, usecols=[3],
                       converters={3: lambda x: 0 if x in (b'Male', 'Male') else 1}).astype(int)

In [None]:
# Five-column feature matrix, in this column order: age, balance, productsNum, activeMem, gender.
data_inputs=np.c_[age,balance,productsNum,activeMem,gender]

In [None]:
def standardize_matrix(matrix):
    """Standardise each column (feature) of *matrix*, ignoring NaN entries.

    The statistics are taken down the rows (axis 0), i.e. one mean and one
    standard deviation per feature. Taking them across each row instead would
    mix features measured in different units and yield per-sample statistics
    that ``matrix * std + mean`` cannot undo.

    Returns:
        ``(standardised_matrix, row_of_means, row_of_stds)``. Columns with
        zero spread are divided by 1 so they map to 0 instead of NaN.
    """
    values = np.asarray(matrix, dtype=float)
    row_of_means = np.nanmean(values, axis=0)
    row_of_stds = np.nanstd(values, axis=0)
    # A constant column has std 0; dividing by 1 keeps it at 0 after centring.
    row_of_stds = np.where(row_of_stds == 0, 1.0, row_of_stds)

    standardised_matrix = (values - row_of_means) / row_of_stds
    return standardised_matrix, row_of_means, row_of_stds


In [None]:
# Column-standardise the five-feature matrix with `standardise` (not
# `standardize_matrix`); this overwrites data_row_of_means / data_row_of_stds.
data_inputs,data_row_of_means, data_row_of_stds=standardise(data_inputs)

In [None]:

# Logistic regression on the standardised five-feature matrix, with the ones column prepended.
data_matrix = augment_data_matrix(data_inputs)

# Start gradient descent from the all-zero weight vector.
initial_weights = np.zeros(data_matrix.shape[1])

# Step size scaled by n over the squared Frobenius norm of the inputs.
# NOTE(review): the origin of the 3.9 factor is not shown here — confirm.
step_size = 3.9 * len(data_inputs) / (np.linalg.norm(data_inputs))**2

# 5-fold cross-validation with the cross-entropy cost as the validation error.
optimal_weights, average_validation_error, average_accuracy = KFold_cross_validation(data_matrix, churn_vector, K=5,
                                                           model_evaluation=lambda x, y: train_logistic_regression(x, y, initial_weights, step_size),
                                                           error_evaluation=binary_logistic_regression_cost_function)

print(f"Optimal Weights: {optimal_weights}")
print(f"Average Validation Error: {average_validation_error}")
print(f"Average Accuracy: {average_accuracy}")

### Do for all variables

In [None]:
# Read CSV column index 1 as the integer credit score.
creditScore = np.genfromtxt(bank_dataset, delimiter=",", skip_header=1, usecols=[1]).astype(int)
# Read CSV column index 10 as the estimated salary. It is kept as float:
# casting to int would silently drop any fractional part of the amounts.
estimatedSalary = np.genfromtxt(bank_dataset, delimiter=",", skip_header=1, usecols=[10]).astype(float)




In [None]:
# Seven-column feature matrix, in this column order:
# age, balance, productsNum, activeMem, creditScore, estimatedSalary, gender.
data_inputs_final=np.c_[age,balance,productsNum,activeMem,creditScore,estimatedSalary,gender]

In [None]:
def standardize_matrix(matrix):
    """Standardise each column (feature) of *matrix*, ignoring NaN entries.

    One mean and one standard deviation are computed per feature (axis 0).
    Computing them across each row instead would blend features with
    different units and return per-sample statistics that cannot be used to
    undo the transformation with ``matrix * std + mean``.

    Returns:
        ``(standardised_matrix, row_of_means, row_of_stds)``. A column without
        spread is divided by 1, so it becomes all zeros rather than NaN.
    """
    values = np.asarray(matrix, dtype=float)
    row_of_means = np.nanmean(values, axis=0)
    spreads = np.nanstd(values, axis=0)
    # Guard against division by zero for constant columns.
    row_of_stds = np.where(spreads == 0, 1.0, spreads)
    standardised_matrix = (values - row_of_means) / row_of_stds
    return standardised_matrix, row_of_means, row_of_stds


In [None]:
# Column-standardise the seven-feature matrix with `standardise`; the stored
# means/stds now describe this matrix and are reused later to undo the scaling.
data_inputs_final,data_row_of_means, data_row_of_stds=standardise(data_inputs_final)

In [None]:

# Logistic regression on the standardised seven-feature matrix, with the ones column prepended.
data_matrix = augment_data_matrix(data_inputs_final)


# Start gradient descent from the all-zero weight vector.
initial_weights = np.zeros(data_matrix.shape[1])


# Step size scaled by n over the squared Frobenius norm of the inputs.
# NOTE(review): the origin of the 3.9 factor is not shown here — confirm.
step_size = 3.9 * len(data_inputs_final) / (np.linalg.norm(data_inputs_final))**2

# 5-fold cross-validation with the cross-entropy cost as the validation error.
optimal_weights, average_validation_error, average_accuracy = KFold_cross_validation(data_matrix, churn_vector, K=5,
                                                           model_evaluation=lambda x, y: train_logistic_regression(x, y, initial_weights, step_size),
                                                           error_evaluation=binary_logistic_regression_cost_function)

print(f"Optimal Weights: {optimal_weights}")
print(f"Average Validation Error: {average_validation_error}")
print(f"Average Accuracy: {average_accuracy}")

### Now do nearest neighbour

In [None]:
def pairwise_distances(from_data, to_data):
    """Return the matrix of Euclidean distances between two point sets.

    Parameters:
        from_data: (n,) or (n, d) array of query points.
        to_data: (m,) or (m, d) array of reference points.

    Returns:
        (n, m) array whose entry [i, j] is the distance from query i to
        reference j.

    1-D inputs are treated as n (or m) one-dimensional points. Both inputs are
    brought to 2-D and the squares are summed over the feature axis; summing
    over axis 1 of the broadcast difference only worked for the special case
    of an (n, 1) query array combined with a 1-D reference array.
    """
    from_points = np.asarray(from_data, dtype=float)
    to_points = np.asarray(to_data, dtype=float)
    if from_points.ndim == 1:
        from_points = from_points.reshape(-1, 1)
    if to_points.ndim == 1:
        to_points = to_points.reshape(-1, 1)

    # (n, 1, d) - (1, m, d) -> (n, m, d); the feature axis is the last one.
    differences = from_points[:, np.newaxis, :] - to_points[np.newaxis, :, :]
    return np.sqrt(np.sum(differences ** 2, axis=2))

def nearest_neighbour_classification(testing_inputs, training_inputs, training_outputs, no_of_neighbours):
    """Classify each test point by majority vote among its nearest training points.

    The number of classes is taken as ``1 + max(training_outputs)``, and the
    labels are used as array indices. Ties go to the smallest class index
    because ``np.argmax`` returns the first maximum.

    Returns:
        Integer array with one predicted label per test point.
    """
    # 1-D queries become a column so that every row is one query point.
    if testing_inputs.ndim == 1:
        testing_inputs = testing_inputs.reshape(-1, 1)

    neighbour_order = np.argsort(pairwise_distances(testing_inputs, training_inputs), axis=1)
    no_of_classes = 1 + np.max(training_outputs)
    no_of_inputs, no_of_points = len(testing_inputs), len(training_inputs)

    # Repeat the training labels once per query, then order each row by distance.
    label_grid = np.broadcast_to(training_outputs, (no_of_inputs, no_of_points))
    labels_by_distance = np.take_along_axis(label_grid, neighbour_order, 1)

    predicted_labels = np.zeros(no_of_inputs, int)
    for query_id, ranked_labels in enumerate(labels_by_distance):
        votes = np.zeros(no_of_classes, float)
        for rank in range(no_of_neighbours):
            votes[ranked_labels[rank]] += 1. / no_of_neighbours
        predicted_labels[query_id] = np.argmax(votes)

    return predicted_labels

def classification_accuracy(estimated_labels, true_labels):
    """Return the fraction of positions where the two label arrays agree."""
    return np.mean(estimated_labels == true_labels)

def KFold_split(data_size, K):
    """Split a fixed pseudo-random permutation of ``range(data_size)`` into K folds.

    The permutation is drawn from a private generator seeded with 123456789,
    so every call with the same arguments returns the same folds. A private
    generator yields the same permutation as seeding the global one, but it
    does not reset the global NumPy random state as a side effect of every call.

    Returns:
        List of K index arrays whose sizes differ by at most one (the first
        ``data_size % K`` folds hold one extra index).
    """
    generator = np.random.RandomState(123456789)
    indexes = generator.permutation(data_size)
    m, r = divmod(data_size, K)
    indexes_split = [
        indexes[i * m + min(i, r):(i + 1) * m + min(i + 1, r)]
        for i in range(K)
    ]
    return indexes_split

def KFold_cross_validation_knn(data_inputs, data_outputs, K, labels_evaluation,
                               missclassification_evaluation, knn):
    """Cross-validate a nearest-neighbour classifier over K folds.

    ``labels_evaluation(test_inputs, train_inputs, train_outputs, knn)`` must
    return predicted labels and ``missclassification_evaluation(predicted,
    true)`` a per-fold accuracy.

    Returns:
        ``(error, accuracy)`` where error is one minus the mean fold accuracy
        and accuracy is one minus that error.
    """
    folds = KFold_split(len(data_inputs), K)

    fold_contributions = []
    for held_out in range(K):
        validation_idx = folds[held_out]
        training_idx = np.concatenate([folds[j] for j in range(K) if j != held_out])

        predictions = labels_evaluation(data_inputs[validation_idx],
                                        data_inputs[training_idx],
                                        data_outputs[training_idx], knn)
        fold_accuracy = missclassification_evaluation(predictions, data_outputs[validation_idx])
        fold_contributions.append(fold_accuracy / K)

    error = 1. - sum(fold_contributions)
    return error, 1. - error

def grid_search(objective, grid):
    """Return the element of *grid* with the smallest objective value.

    Every grid point is evaluated once, in order; on ties the first minimum wins.
    """
    values = np.array([objective(point) for point in grid])
    return grid[np.argmin(values)]

def objective_fun(Knn, data_size, age_vector, churn_vector, K=5):
    """Return the K-fold misclassification rate of a Knn-nearest-neighbour classifier.

    ``age_vector`` holds the inputs and ``churn_vector`` the labels;
    ``data_size`` is the number of samples to split into folds.
    """
    folds = KFold_split(data_size, K)

    mean_accuracy = 0
    for held_out in range(K):
        validation_idx = folds[held_out]
        training_idx = np.concatenate([folds[j] for j in range(K) if j != held_out])

        predictions = nearest_neighbour_classification(
            age_vector[validation_idx],
            age_vector[training_idx],
            churn_vector[training_idx],
            Knn,
        )
        mean_accuracy += classification_accuracy(predictions, churn_vector[validation_idx]) / K

    return 1. - mean_accuracy

def run_knn_with_cross_validation(data_size, age_vector, churn_vector, K=5):
    """Select the neighbour count by cross-validation and report its score.

    Parameters:
        data_size: number of samples.
        age_vector: inputs of the classifier.
        churn_vector: integer class labels.
        K: number of cross-validation folds, used for both the search and the
            final evaluation.

    Returns:
        ``(best_k, error, accuracy)`` for the best neighbour count in 1..9.
    """
    k_values = np.arange(1, 10)
    # Honour the caller's fold count; the search previously always used 5 folds.
    best_k = grid_search(lambda knn: objective_fun(knn, data_size, age_vector, churn_vector, K=K), k_values)

    # KFold_split draws a fixed-seed permutation, so repeating this evaluation
    # would reproduce exactly the same numbers; a single run is enough.
    error, accuracy = KFold_cross_validation_knn(age_vector, churn_vector, K, nearest_neighbour_classification,
                                                 classification_accuracy, best_k)

    return best_k, error, accuracy


# k-nearest-neighbour classification on the single feature `age_vector`, 5 folds.
data_size = len(age_vector)
best_k, average_error, average_accuracy = run_knn_with_cross_validation(data_size, age_vector, churn_vector, K=5)

print(f'The best value for K is: {best_k}')
print(f'Average Cross Validation Error: {average_error}, Average Accuracy: {average_accuracy}')


In [None]:
# k-nearest-neighbour classification on the single feature `activeMem`, 5 folds.
data_size = len(activeMem)

best_k, average_error, average_accuracy = run_knn_with_cross_validation(data_size, activeMem, churn_vector, K=5)

print(f'The best value for K is: {best_k}')
print(f'Average Cross Validation Error: {average_error}, Average Accuracy: {average_accuracy}')


In [None]:
def pairwise_distances(from_data, to_data):
    """Return the (n, m) Euclidean distances between two 2-D point sets.

    ``from_data`` is indexed as (n, d) and ``to_data`` as (m, d); the squared
    differences are summed over the third axis of the broadcast difference.
    """
    differences = from_data[:, np.newaxis] - to_data[np.newaxis, :]
    squared = np.square(differences)
    return np.sqrt(squared.sum(axis=2))

def nearest_neighbour_classification(testing_inputs, training_inputs, training_outputs, no_of_neighbours):
    """Classify each test point by majority vote among its nearest training points.

    The number of classes is taken as ``1 + max(training_outputs)``, and the
    labels are used as array indices. Ties go to the smallest class index
    because ``np.argmax`` returns the first maximum.

    Returns:
        Integer array with one predicted label per test point.
    """
    neighbour_order = np.argsort(pairwise_distances(testing_inputs, training_inputs), axis=1)
    no_of_classes = 1 + np.max(training_outputs)
    no_of_inputs, no_of_points = len(testing_inputs), len(training_inputs)

    # Repeat the training labels once per query, then order each row by distance.
    label_grid = np.broadcast_to(training_outputs, (no_of_inputs, no_of_points))
    labels_by_distance = np.take_along_axis(label_grid, neighbour_order, 1)

    predicted_labels = np.zeros(no_of_inputs, int)
    for query_id, ranked_labels in enumerate(labels_by_distance):
        votes = np.zeros(no_of_classes, float)
        for rank in range(no_of_neighbours):
            votes[ranked_labels[rank]] += 1. / no_of_neighbours
        predicted_labels[query_id] = np.argmax(votes)

    return predicted_labels

def classification_accuracy(estimated_labels, true_labels):
    """Return the share of predictions that equal the true labels."""
    return np.mean(estimated_labels == true_labels)

def KFold_split(data_size, K):
    """Split a fixed pseudo-random permutation of ``range(data_size)`` into K folds.

    A private generator seeded with 123456789 produces the permutation, so
    the folds are reproducible between calls. It yields the same permutation
    as seeding the global generator, without overwriting the global NumPy
    random state on every call.

    Returns:
        List of K index arrays; the first ``data_size % K`` folds contain one
        index more than the remaining ones.
    """
    generator = np.random.RandomState(123456789)
    indexes = generator.permutation(data_size)
    m, r = divmod(data_size, K)
    indexes_split = [
        indexes[i * m + min(i, r):(i + 1) * m + min(i + 1, r)]
        for i in range(K)
    ]
    return indexes_split

def KFold_cross_validation_knn(data_inputs, data_outputs, K, labels_evaluation, missclassification_evaluation, knn):
    """Cross-validate a nearest-neighbour classifier over K folds.

    ``labels_evaluation(test_inputs, train_inputs, train_outputs, knn)`` must
    return predicted labels and ``missclassification_evaluation(predicted,
    true)`` a per-fold accuracy.

    Returns:
        ``(error, accuracy)`` where error is one minus the mean fold accuracy
        and accuracy is one minus that error.
    """
    folds = KFold_split(len(data_inputs), K)

    fold_contributions = []
    for held_out in range(K):
        validation_idx = folds[held_out]
        training_idx = np.concatenate([folds[j] for j in range(K) if j != held_out])

        predictions = labels_evaluation(data_inputs[validation_idx],
                                        data_inputs[training_idx],
                                        data_outputs[training_idx], knn)
        fold_accuracy = missclassification_evaluation(predictions, data_outputs[validation_idx])
        fold_contributions.append(fold_accuracy / K)

    error = 1. - sum(fold_contributions)
    return error, 1. - error


def grid_search(objective, grid):
    """Evaluate *objective* on every grid point and return the minimising point.

    On ties the first minimum in grid order is returned.
    """
    scores = [objective(candidate) for candidate in grid]
    best_position = np.argmin(np.array(scores))
    return grid[best_position]

def objective_fun(Knn, data_inputs, data_outputs, K=5):
    """Return the K-fold misclassification rate of a Knn-nearest-neighbour classifier."""
    folds = KFold_split(len(data_inputs), K)

    mean_accuracy = 0
    for held_out in range(K):
        validation_idx = folds[held_out]
        training_idx = np.concatenate([folds[j] for j in range(K) if j != held_out])

        predictions = nearest_neighbour_classification(
            data_inputs[validation_idx],
            data_inputs[training_idx],
            data_outputs[training_idx],
            Knn,
        )
        mean_accuracy += classification_accuracy(predictions, data_outputs[validation_idx]) / K

    return 1. - mean_accuracy

def run_knn_with_cross_validation(data_size, data_inputs, data_outputs, K=5):
    """Select the neighbour count by cross-validation and report its score.

    Parameters:
        data_size: number of samples (kept for call compatibility; the length
            of ``data_inputs`` is what the helpers actually use).
        data_inputs: feature matrix, one row per sample.
        data_outputs: integer class labels.
        K: number of cross-validation folds, used for both the search and the
            final evaluation.

    Returns:
        ``(best_k, error, accuracy)`` where error and accuracy are the
        cross-validated scores of the classifier using ``best_k`` neighbours.
    """
    k_values = np.arange(1, 10)
    # Honour the caller's fold count; the search previously always used 5 folds.
    best_k = grid_search(lambda knn: objective_fun(knn, data_inputs, data_outputs, K=K), k_values)

    # Report the score of the selected model. Averaging the scores of every
    # candidate k in 1..9 described none of them, least of all the best one.
    error, accuracy = KFold_cross_validation_knn(data_inputs, data_outputs, K, nearest_neighbour_classification,
                                                 classification_accuracy, best_k)

    return best_k, error, accuracy


# k-nearest-neighbour classification on whatever `data_inputs` currently holds.
# NOTE(review): this self-assignment is a no-op; the cell depends on the cell
# that last set `data_inputs` — confirm it is the intended feature matrix.
data_inputs = data_inputs
data_outputs = churn_vector

best_k, average_error, average_accuracy = run_knn_with_cross_validation(len(data_inputs), data_inputs, data_outputs, K=5)

print(f'The best value for K is: {best_k}')
print(f'Average Cross Validation Error: {average_error}, Average Accuracy: {average_accuracy}')


In [None]:

# k-nearest-neighbour classification on `data_inputs_final`.
# Note that this rebinds `data_inputs`, which later cells read again.
data_inputs = data_inputs_final
data_outputs = churn_vector

best_k, average_error, average_accuracy = run_knn_with_cross_validation(len(data_inputs), data_inputs, data_outputs, K=5)

print(f'The best value for K is: {best_k}')
print(f'Average Cross Validation Error: {average_error}, Average Accuracy: {average_accuracy}')



### SVM

In [None]:
def augment_data_matrix(data_input):
    """Return *data_input* with a leading column of ones.

    A 1-D input is first turned into a single column; any other input is
    concatenated as it is.
    """
    features = data_input.reshape(-1, 1) if data_input.ndim == 1 else data_input
    ones_column = np.ones((data_input.shape[0], 1))
    return np.concatenate((ones_column, features), axis=1)

def svm_train(X, y, learning_rate, iterations, C):
    """Train a linear classifier by sub-gradient descent on the hinge loss.

    Parameters:
        X: (m, n) data matrix. The first column is excluded from the L2
            penalty (callers in this file pass a leading column of ones).
        y: labels, 1-D or column-shaped. Values > 0 are the positive class
            (+1), everything else the negative class (-1), so both {0, 1} and
            {-1, +1} encodings are accepted.
        learning_rate: step size of every update.
        iterations: number of update steps.
        C: weight of the L2 penalty on ``weights[1:]``.

    Returns:
        ``(weights, bias)`` with weights of shape (n, 1).

    The objective printed every 100 iterations is
    ``mean(hinge) + 0.5 * C * ||weights[1:]||^2``, which is the function whose
    sub-gradient the update follows.
    """
    m, n = X.shape
    # The hinge loss needs signed targets; with raw 0/1 labels the class-0
    # samples would contribute nothing to the gradient.
    targets = np.where(np.asarray(y).reshape(-1, 1) > 0, 1.0, -1.0)
    weights = np.zeros((n, 1))
    bias = 0.0

    for iteration in range(iterations):
        decision_function = np.dot(X, weights) + bias
        margins = 1 - targets * decision_function
        # Signed indicator of the samples that violate the margin.
        violated = targets * (margins > 0)

        # L2 penalty gradient, leaving the first (intercept) weight unpenalised.
        penalty_gradient = C * weights
        penalty_gradient[:1] = 0

        if iteration % 100 == 0:
            total_loss = np.mean(np.maximum(0, margins)) + 0.5 * C * np.sum(weights[1:] ** 2)
            print(f"Iteration {iteration}, Loss: {total_loss}")

        weights -= learning_rate * (-np.dot(X.T, violated) / m + penalty_gradient)
        # d(hinge)/d(bias) = -mean(violated), so descending means adding it.
        bias += learning_rate * np.sum(violated) / m

    return weights, bias

def svm_predict(X, weights, bias):
    """Return the sign of the linear decision value for every row of *X*.

    The result contains -1, 0 or +1; 0 occurs when a decision value is exactly zero.
    """
    decision_values = np.dot(X, weights) + bias
    return np.sign(decision_values)

def calculate_accuracy(y_true, y_pred):
    """Return the fraction of entries where *y_pred* equals *y_true*."""
    hits = y_true == y_pred
    return np.mean(hits)

def kfold_split(data_size, K):
    """Split a random permutation of ``range(data_size)`` into K index folds.

    The first ``data_size % K`` folds receive one extra index. Uses the global
    NumPy random state.
    """
    shuffled = np.random.permutation(data_size)
    base_size, extra = divmod(data_size, K)

    folds = []
    start = 0
    for fold_id in range(K):
        stop = start + base_size + (1 if fold_id < extra else 0)
        folds.append(shuffled[start:stop])
        start = stop
    return folds

def kfold_cross_validation(X, y, k, svm_train, svm_predict, learning_rate, iterations, C):
    """Cross-validate a sign-predicting classifier over k contiguous folds.

    Parameters:
        X: data matrix, one row per sample.
        y: labels, 1-D or column-shaped; values > 0 count as the positive class.
        k: number of folds.
        svm_train: callable ``(X, y, learning_rate, iterations, C)`` returning
            ``(weights, bias)``.
        svm_predict: callable ``(X, weights, bias)`` returning signed predictions.
        learning_rate, iterations, C: forwarded to ``svm_train``.

    Returns:
        ``(average_accuracy, average_validation_error)`` over the folds.
    """
    # Score in the {-1, +1} encoding that a sign-based predictor emits;
    # comparing its output with raw 0/1 labels can never match class 0.
    targets = np.where(np.asarray(y).reshape(-1, 1) > 0, 1, -1)
    n = len(targets)
    # array_split keeps the folds contiguous but also assigns the trailing
    # n % k samples, which plain n // k slicing left out of every test fold.
    fold_indexes = np.array_split(np.arange(n), k)

    accuracies = []
    validation_errors = []

    for i, test_indexes in enumerate(fold_indexes):
        train_mask = np.ones(n, dtype=bool)
        train_mask[test_indexes] = False

        X_test, y_test = X[test_indexes], targets[test_indexes]
        X_train, y_train = X[train_mask], targets[train_mask]

        # `iterations` is the parameter of this function; the name `epochs`
        # used here before is not defined anywhere in this scope.
        weights, bias = svm_train(X_train, y_train, learning_rate, iterations, C)
        predictions = np.asarray(svm_predict(X_test, weights, bias)).reshape(-1, 1)

        accuracy = np.mean(y_test == predictions)
        validation_error = 1 - accuracy

        accuracies.append(accuracy)
        validation_errors.append(validation_error)

        print(f"Fold {i+1} Validation Error: {validation_error:.2%}")

    average_accuracy = np.mean(accuracies)
    average_validation_error = np.mean(validation_errors)

    print(f"\nAverage Accuracy: {average_accuracy:.2%}")
    print(f"Average Validation Error: {average_validation_error:.2%}")

    return average_accuracy, average_validation_error

def perform_grid_search(data_input, churn, learning_rates, C_values, k_folds, iterations):
    """Cross-validate every (learning rate, C) pair and print the best one.

    The pair with the lowest average validation error wins; the first such
    pair is kept on ties. Nothing is returned, results are only printed.
    """
    # (accuracy, validation error, learning rate, C) of the best pair so far.
    best = (0.0, float('inf'), 0.0, 0.0)

    for lr in learning_rates:
        for c_value in C_values:
            print(f"\nTraining SVM with learning rate={lr} and C={c_value}")

            data_matrix = augment_data_matrix(data_input)
            accuracy, validation_error = kfold_cross_validation(
                data_matrix, churn, k_folds, svm_train, svm_predict, lr, iterations, c_value
            )

            if validation_error < best[1]:
                best = (accuracy, validation_error, lr, c_value)

    best_accuracy, best_validation_error, best_learning_rate, best_C = best
    print(f"\nBest hyperparameters: Learning Rate={best_learning_rate}, C={best_C}")
    print(f"Best Accuracy: {best_accuracy:.2%}, Best Validation Error: {best_validation_error:.2%}")


# SVM grid search on the single feature `age_vector`.
data_inputs = age_vector
# Reload the labels (CSV column index 11) as integers and shape them as a column.
churn = np.genfromtxt(bank_dataset, delimiter=",", skip_header=1, usecols=[11]).astype(int)
churn = churn.reshape(-1, 1)
# Candidate hyperparameters; according to the author these are common starting values.
learning_rates = [0.001, 0.01, 0.1]
C_values = [0.1, 1.0, 10.0]
k_folds = 5
iterations = 1000

perform_grid_search(data_inputs, churn, learning_rates, C_values, k_folds, iterations)


In [None]:
# SVM grid search on the single feature `activeMem`.
data_inputs = activeMem  
# Reload the labels (CSV column index 11) as integers and shape them as a column.
churn = np.genfromtxt(bank_dataset, delimiter=",", skip_header=1, usecols=[11]).astype(int)
churn = churn.reshape(-1, 1)

# Candidate learning rates and penalty weights, evaluated with 5 folds.
learning_rates = [0.001, 0.01, 0.1]
C_values = [0.1, 1.0, 10.0]
k_folds = 5
iterations = 1000

perform_grid_search(data_inputs, churn, learning_rates, C_values, k_folds, iterations)

In [None]:
# SVM grid search on the standardised five-feature matrix.
# The former `data_inputs = data_inputs` was a no-op, so this cell silently
# reused whatever the previously executed cell had left in `data_inputs`.
# The matrix is rebuilt explicitly instead; only the standardised values are
# kept, so the stored data_row_of_means / data_row_of_stds stay untouched.
# NOTE(review): assumed to be the five-feature set (age, balance,
# productsNum, activeMem, gender) used by the other model sections — confirm.
data_inputs = standardise(np.c_[age, balance, productsNum, activeMem, gender])[0]
# Reload the labels (CSV column index 11) as integers and shape them as a column.
churn = np.genfromtxt(bank_dataset, delimiter=",", skip_header=1, usecols=[11]).astype(int)
churn = churn.reshape(-1, 1)

# Candidate learning rates and penalty weights, evaluated with 5 folds.
learning_rates = [0.001, 0.01, 0.1]
C_values = [0.1, 1.0, 10.0]
k_folds = 5
iterations = 1000

perform_grid_search(data_inputs, churn, learning_rates, C_values, k_folds, iterations)

In [None]:

def destandardize_matrix(matrix, mean, std):
    """Undo a standardisation by scaling with *std* and shifting by *mean*."""
    return mean + std * matrix


In [None]:
# SVM grid search on the seven-feature matrix mapped back to its original
# scale with the stored per-column means and standard deviations.
# NOTE(review): the features are deliberately un-standardised here, so their
# scales differ between columns — confirm this is intended for the SVM.
X_destandardized = destandardize_matrix(data_inputs_final, data_row_of_means, data_row_of_stds)
data_inputs = X_destandardized
# Reload the labels (CSV column index 11) as integers and shape them as a column.
churn = np.genfromtxt(bank_dataset, delimiter=",", skip_header=1, usecols=[11]).astype(int)
churn = churn.reshape(-1, 1)

# Candidate learning rates and penalty weights, evaluated with 5 folds.
learning_rates = [0.001, 0.01, 0.1]
C_values = [0.1, 1.0, 10.0]
k_folds = 5
iterations = 1000

perform_grid_search(data_inputs, churn, learning_rates, C_values, k_folds, iterations)

### Naive Bayes classifier

In [None]:
# No column of ones is needed for this model.
def gaussian_probability(x, mean, std):
    """Return the normal probability density N(mean, std^2) evaluated at *x*."""
    squared_distance = (x - mean)**2
    density_shape = np.exp(-squared_distance / (2 * std**2))
    normaliser = 1 / (np.sqrt(2 * np.pi) * std)
    return normaliser * density_shape

def train_naive_bayes(X, y, min_std=1e-9):
    """Estimate the parameters of a Gaussian naive Bayes classifier.

    Parameters:
        X: (n,) or (n, d) array of samples.
        y: (n,) array of class labels.
        min_std: lower bound applied to every estimated standard deviation.

    Returns:
        ``(class_priors, means, stds)``: three dicts keyed by class label with
        the class frequency, the per-feature mean and the per-feature
        (population) standard deviation.
    """
    unique_classes, counts = np.unique(y, return_counts=True)
    total_samples = len(y)
    class_priors = {c: count / total_samples for c, count in zip(unique_classes, counts)}

    means = {}
    stds = {}
    for c in unique_classes:
        class_samples = X[y == c]
        means[c] = np.mean(class_samples, axis=0)
        # A feature that is constant within a class has std 0, which makes the
        # Gaussian density divide by zero; the floor keeps it finite and does
        # not change any standard deviation larger than min_std.
        stds[c] = np.maximum(np.std(class_samples, axis=0), min_std)

    return class_priors, means, stds

def predict_naive_bayes(X, class_priors, means, stds):
    """Predict the most probable class for every sample in *X*.

    Parameters:
        X: (n,) or (n, d) array of samples.
        class_priors, means, stds: dicts keyed by class label holding the
            prior, the per-feature mean and the per-feature standard deviation.

    Returns:
        Array with one predicted class label per sample. On ties the class
        that comes first in ``class_priors`` wins.
    """
    samples = np.asarray(X, dtype=float)
    if samples.ndim == 1:
        # One feature per sample: make the feature axis explicit.
        samples = samples[:, np.newaxis]

    classes = list(class_priors)
    scores = np.empty((samples.shape[0], len(classes)))
    for column, c in enumerate(classes):
        mean = np.asarray(means[c], dtype=float)
        std = np.asarray(stds[c], dtype=float)
        # Evaluate the Gaussian log-density directly. Taking log() of the
        # density underflows to log(0) = -inf for samples far from every class
        # mean, after which all classes tie and the first one always wins.
        log_density = (-0.5 * np.log(2 * np.pi) - np.log(std)
                       - (samples - mean) ** 2 / (2 * std ** 2))
        scores[:, column] = log_density.sum(axis=1) + np.log(class_priors[c])

    return np.asarray(classes)[np.argmax(scores, axis=1)]

def kfold_split(data_size, K):
    """Split a random permutation of ``range(data_size)`` into K index folds.

    Fold sizes differ by at most one, with the larger folds first. Uses the
    global NumPy random state.
    """
    shuffled = np.random.permutation(data_size)
    base_size, extra = divmod(data_size, K)

    folds = []
    start = 0
    for fold_id in range(K):
        stop = start + base_size + (1 if fold_id < extra else 0)
        folds.append(shuffled[start:stop])
        start = stop
    return folds

def kfold_cross_validation(X, y, k, train_func, predict_func):
    """Cross-validate a classifier over k random folds and print the scores.

    ``train_func(X_train, y_train)`` must return ``(class_priors, means,
    stds)`` and ``predict_func(X_test, class_priors, means, stds)`` the
    predicted labels.

    Returns:
        ``(average_accuracy, average_error)`` over the k folds.
    """
    folds = kfold_split(len(y), k)
    accuracies = []
    errors = []

    for fold_number, test_indexes in enumerate(folds, start=1):
        train_indexes = np.concatenate(
            [fold for j, fold in enumerate(folds) if j != fold_number - 1]
        )

        class_priors, means, stds = train_func(X[train_indexes], y[train_indexes])
        predictions = predict_func(X[test_indexes], class_priors, means, stds)

        accuracy = np.mean(predictions == y[test_indexes])
        error = 1 - accuracy
        accuracies.append(accuracy)
        errors.append(error)
        print(f"Fold {fold_number} Accuracy: {accuracy:.2%}, Validation Error: {error:.2%}")

    average_accuracy = np.mean(accuracies)
    average_error = np.mean(errors)

    print(f"\nAverage Accuracy (K-Fold): {average_accuracy:.2%}")
    print(f"Average Validation Error (K-Fold): {average_error:.2%}")

    return average_accuracy, average_error

# Naive Bayes on the age feature, re-read from CSV column index 4 as floats.
age1 = np.genfromtxt(bank_dataset, delimiter=",", skip_header=1, usecols=[4]).astype(float)
data_inputs = age1
# Labels: the 1-D churn vector loaded earlier in the notebook.
y = churn_vector
K = 5

# The call prints per-fold and average scores itself; the averages are printed once more below.
average_accuracy, average_error = kfold_cross_validation(data_inputs, y, K, train_naive_bayes, predict_naive_bayes)
print(f'Overall Average Accuracy (K-Fold): {average_accuracy:.2%}')
print(f'Overall Average Validation Error (K-Fold): {average_error:.2%}')


In [None]:
# Naive Bayes on the activeMem feature, re-read from CSV column index 9 as floats.
activeMem1=np.genfromtxt(bank_dataset,delimiter=",",skip_header=1,usecols=[9]).astype(float)
data_inputs=activeMem1
# Labels: the 1-D churn vector loaded earlier in the notebook.
y=churn_vector
K = 5

# The call prints per-fold and average scores itself; the averages are printed once more below.
average_accuracy, average_error = kfold_cross_validation(data_inputs, y, K, train_naive_bayes, predict_naive_bayes)
print(f'Overall Average Accuracy (K-Fold): {average_accuracy:.2%}')
print(f'Overall Average Validation Error (K-Fold): {average_error:.2%}')

In [None]:
# Naive Bayes on two features: age1 in column 0, activeMem in column 1 (not standardised).
data_inputs = np.c_[age1,activeMem]
# Labels: the 1-D churn vector loaded earlier in the notebook.
y=churn_vector
K = 5

# The call prints per-fold and average scores itself; the averages are printed once more below.
average_accuracy, average_error = kfold_cross_validation(data_inputs, y, K, train_naive_bayes, predict_naive_bayes)
print(f'Overall Average Accuracy (K-Fold): {average_accuracy:.2%}')
print(f'Overall Average Validation Error (K-Fold): {average_error:.2%}')

In [None]:
# Naive Bayes on five un-standardised features, in this column order:
# age, balance, productsNum, activeMem, gender.
data_inputs=np.c_[age,balance,productsNum,activeMem,gender]
# Labels: the 1-D churn vector loaded earlier in the notebook.
y=churn_vector

K = 5

# The call prints per-fold and average scores itself; the averages are printed once more below.
average_accuracy, average_error = kfold_cross_validation(data_inputs, y, K, train_naive_bayes, predict_naive_bayes)
print(f'Overall Average Accuracy (K-Fold): {average_accuracy:.2%}')
print(f'Overall Average Validation Error (K-Fold): {average_error:.2%}')

In [None]:
# Naive Bayes on all seven un-standardised features, in this column order:
# age, balance, productsNum, activeMem, creditScore, estimatedSalary, gender.
data_inputs_final=np.c_[age,balance,productsNum,activeMem,creditScore,estimatedSalary,gender]
# Set the labels here so the cell does not rely on `y` left over from an earlier cell.
y = churn_vector

K = 5

# Evaluate the seven-feature matrix built above. Passing `data_inputs` here
# re-ran the model on the matrix left over from the previous cell instead.
average_accuracy, average_error = kfold_cross_validation(data_inputs_final, y, K, train_naive_bayes, predict_naive_bayes)
print(f'Overall Average Accuracy (K-Fold): {average_accuracy:.2%}')
print(f'Overall Average Validation Error (K-Fold): {average_error:.2%}')