# Problem 2.1


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# Load the data
data = pd.read_csv('UniversalBank.csv')

# Drop the ID and ZIP Code columns as they are not useful for prediction
data.drop(['ID', 'ZIP Code'], axis=1, inplace=True)

# Convert categorical variables into dummy variables
data = pd.get_dummies(data, columns=['Education'], drop_first=True)

# Split the data into features and target variable
X = data.drop('Personal Loan', axis=1)
y = data['Personal Loan']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training (60%) and validation (40%) sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.4, random_state=42)

# Part a:

In [5]:
# Define the specific customer's data
customer_data = pd.DataFrame({
    'Age': [40],
    'Experience': [10],
    'Income': [84],
    'Family': [2],
    'CCAvg': [2],
    'Mortgage': [0],
    'Securities Account': [0],
    'CD Account': [0],
    'Online': [1],
    'CreditCard': [1],
    'Education_2': [1],
    'Education_3': [0]
})

# Standardize the customer's data
customer_data_scaled = scaler.transform(customer_data)

# Perform k-NN classification with k=1
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
customer_prediction = knn.predict(customer_data_scaled)

print(f"Customer's loan acceptance prediction (k=1): {customer_prediction[0]}")

Customer's loan acceptance prediction (k=1): 0


# Part b:

In [6]:
from sklearn.metrics import accuracy_score

# Placeholder for best k and its accuracy
best_k = 1
best_accuracy = 0

# Try different values of k
for k in range(1, 21):  # Trying k from 1 to 20
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    val_predictions = knn.predict(X_val)
    accuracy = accuracy_score(y_val, val_predictions)
    print(f"k={k}, Validation Accuracy={accuracy}")
    
    if accuracy > best_accuracy:
        best_k = k
        best_accuracy = accuracy

print(f"Best k: {best_k} with Accuracy: {best_accuracy}")

k=1, Validation Accuracy=0.96
k=2, Validation Accuracy=0.947
k=3, Validation Accuracy=0.956
k=4, Validation Accuracy=0.9455
k=5, Validation Accuracy=0.953
k=6, Validation Accuracy=0.9425
k=7, Validation Accuracy=0.95
k=8, Validation Accuracy=0.9445
k=9, Validation Accuracy=0.9475
k=10, Validation Accuracy=0.9385
k=11, Validation Accuracy=0.9435
k=12, Validation Accuracy=0.938
k=13, Validation Accuracy=0.94
k=14, Validation Accuracy=0.9365
k=15, Validation Accuracy=0.9415
k=16, Validation Accuracy=0.937
k=17, Validation Accuracy=0.94
k=18, Validation Accuracy=0.9345
k=19, Validation Accuracy=0.937
k=20, Validation Accuracy=0.9315
Best k: 1 with Accuracy: 0.96


# Part c:

In [7]:
# Perform k-NN classification with the best k
knn_best = KNeighborsClassifier(n_neighbors=best_k)
knn_best.fit(X_train, y_train)
customer_prediction_best_k = knn_best.predict(customer_data_scaled)

print(f"Customer's loan acceptance prediction (best k={best_k}): {customer_prediction_best_k[0]}")

Customer's loan acceptance prediction (best k=1): 0


# Problem 2.2