### 1. Import required libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

### 2. Loading the dataset

In [3]:
# The CSV lives alongside this notebook, so a relative filename is sufficient.
# First attempt: pandas' default comma delimiter. The recorded preview comes back
# as one merged column full of "\t" characters, i.e. the file is tab-delimited.
data = pd.read_csv(filepath_or_buffer="credit_card_data_headers.csv")

# Preview the first five rows (head() defaults to n=5)
print(data.head(n=5))

  A1\tA2\tA3\tA8\tA9\tA10\tA11\tA12\tA14\tA15\tR1
0        1\t30.83\t0\t1.25\t1\t0\t1\t1\t202\t0\t1
1    0\t58.67\t4.46\t3.04\t1\t0\t6\t1\t43\t560\t1
2      0\t24.5\t0.5\t1.5\t1\t1\t0\t1\t280\t824\t1
3     1\t27.83\t1.54\t3.75\t1\t0\t5\t0\t100\t3\t1
4    1\t20.17\t5.625\t1.71\t1\t1\t0\t1\t120\t0\t1


In [25]:
# Re-read the file with a tab delimiter so every field lands in its own column
data = pd.read_csv("credit_card_data_headers.csv", delimiter='\t')
# Preview the first five rows
print(data.head(n=5))

   A1     A2     A3    A8  A9  A10  A11  A12  A14  A15  R1
0   1  30.83  0.000  1.25   1    0    1    1  202    0   1
1   0  58.67  4.460  3.04   1    0    6    1   43  560   1
2   0  24.50  0.500  1.50   1    1    0    1  280  824   1
3   1  27.83  1.540  3.75   1    0    5    0  100    3   1
4   1  20.17  5.625  1.71   1    1    0    1  120    0   1


In [5]:
# List the column labels of the loaded frame
data.columns

Index(['A1', 'A2', 'A3', 'A8', 'A9', 'A10', 'A11', 'A12', 'A14', 'A15', 'R1'], dtype='object')

In [6]:
# Report the number of distinct values per column; this is what separates the
# two-valued (binary) columns from the many-valued numeric ones.
for column_name, n_distinct in data.nunique().items():
    print(f"Column: {column_name} - Unique values: {n_distinct}")

Column: A1 - Unique values: 2
Column: A2 - Unique values: 341
Column: A3 - Unique values: 213
Column: A8 - Unique values: 131
Column: A9 - Unique values: 2
Column: A10 - Unique values: 2
Column: A11 - Unique values: 23
Column: A12 - Unique values: 2
Column: A14 - Unique values: 164
Column: A15 - Unique values: 230
Column: R1 - Unique values: 2


* From the unique-value counts above, we can see that four predictor features are binary (A1, A9, A10, A12), six are numeric features that we treat as continuous (A2, A3, A8, A11, A14, A15), and the response variable (R1) is binary.

In [26]:
# Show each column's dtype and non-null count, plus the frame's memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 654 entries, 0 to 653
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      654 non-null    int64  
 1   A2      654 non-null    float64
 2   A3      654 non-null    float64
 3   A8      654 non-null    float64
 4   A9      654 non-null    int64  
 5   A10     654 non-null    int64  
 6   A11     654 non-null    int64  
 7   A12     654 non-null    int64  
 8   A14     654 non-null    int64  
 9   A15     654 non-null    int64  
 10  R1      654 non-null    int64  
dtypes: float64(3), int64(8)
memory usage: 56.3 KB


In [27]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column
data.describe()

Unnamed: 0,A1,A2,A3,A8,A9,A10,A11,A12,A14,A15,R1
count,654.0,654.0,654.0,654.0,654.0,654.0,654.0,654.0,654.0,654.0,654.0
mean,0.689602,31.578349,4.830558,2.24169,0.535168,0.561162,2.498471,0.538226,180.084098,1012.730887,0.452599
std,0.46301,11.981789,5.023295,3.369197,0.499143,0.496625,4.965655,0.498918,168.315719,5249.32066,0.498129
min,0.0,13.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,22.58,1.04,0.165,0.0,0.0,0.0,0.0,70.75,0.0,0.0
50%,1.0,28.46,2.855,1.0,1.0,1.0,0.0,1.0,160.0,5.0,0.0
75%,1.0,38.25,7.4375,2.615,1.0,1.0,3.0,1.0,271.0,399.0,1.0
max,1.0,80.25,28.0,28.5,1.0,1.0,67.0,1.0,2000.0,100000.0,1.0


In [29]:
# Count missing entries in each column (isna is the canonical spelling of isnull)
data.isna().sum()

A1     0
A2     0
A3     0
A8     0
A9     0
A10    0
A11    0
A12    0
A14    0
A15    0
R1     0
dtype: int64

### 3. Separate features and target

In [30]:
# Positional split: every column except the last is a predictor,
# and the last column is the binary target.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

### 4. Split data into train/validation/test

In [None]:
# Stage 1: hold out 40% of the rows; the remaining 60% form the training set.
# Stratifying on y keeps the class proportions the same in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)

# Stage 2: halve the hold-out into validation and test (20% of all rows each)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

print("Train size:", len(X_train))
print("Validation size:", len(X_val))
print("Test size:", len(X_test))


Train size: 392
Validation size: 131
Test size: 131


### 5. Standardize Continuous Features (important for KNN)

In [36]:
# Continuous features (the binary flags A1, A9, A10 and A12 are left unscaled)
cont_features = ['A2','A3','A8','A11','A14','A15']

# train_test_split hands back row subsets of X. Take explicit copies before
# writing into them so the column assignments below own their data; without
# copy-on-write enabled, pandas can otherwise emit SettingWithCopyWarning.
# The names are kept so every later cell keeps working unchanged.
X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

scaler = StandardScaler()
# Fit on the training split only, then reuse those statistics for the
# validation and test splits so nothing from them influences the scaling.
X_train[cont_features] = scaler.fit_transform(X_train[cont_features])
X_val[cont_features] = scaler.transform(X_val[cont_features])
X_test[cont_features] = scaler.transform(X_test[cont_features])

### 6. K-Nearest Neighbors (KNN) with Cross-Validation

In [37]:
# Candidate neighbourhood sizes: k = 1 .. 20
k_values = range(1, 21)

# Mean 5-fold cross-validated accuracy on the training split for each k
cv_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=5, scoring='accuracy'
    ).mean()
    for k in k_values
]

# Best k (np.argmax returns the first maximum, so ties go to the smallest k)
best_k = k_values[np.argmax(cv_scores)]
print("Best k:", best_k, "with CV accuracy:", max(cv_scores))


Best k: 7 with CV accuracy: 0.8316455696202532


### 7. Decision Tree (DT) with Cross-Validation

In [38]:
# Candidate tree depths: max_depth = 1 .. 10
depth_values = range(1, 11)

# Mean 5-fold cross-validated accuracy on the training split for each depth;
# random_state is fixed so repeated runs build the same trees.
dt_scores = [
    cross_val_score(
        DecisionTreeClassifier(max_depth=depth, random_state=42),
        X_train,
        y_train,
        cv=5,
        scoring='accuracy',
    ).mean()
    for depth in depth_values
]

# Best depth (np.argmax returns the first maximum, so ties go to the shallowest tree)
best_depth = depth_values[np.argmax(dt_scores)]
print("Best max_depth:", best_depth, "with CV accuracy:", max(dt_scores))


Best max_depth: 1 with CV accuracy: 0.8596559558584875


### 8. Evaluate on Validation Set

In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Refit KNN with the cross-validated k on the full training split
# (fit() returns the estimator itself), then predict the validation split
best_knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)
y_val_pred_knn = best_knn.predict(X_val)

# Same procedure for the decision tree with the cross-validated depth
best_dt = DecisionTreeClassifier(max_depth=best_depth, random_state=42).fit(X_train, y_train)
y_val_pred_dt = best_dt.predict(X_val)

# Validation metrics
def print_metrics(y_true, y_pred, model_name):
    """Print accuracy, precision, recall, F1 and the confusion matrix for one model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned element-wise with ``y_true``.
    model_name : str
        Label shown in the ``--- name ---`` header line.

    Returns
    -------
    None
        Everything is written to stdout.

    Notes
    -----
    Precision, recall and F1 are computed with scikit-learn's default
    averaging settings. ``zero_division=0`` makes an undefined score (for
    example precision when no positive is predicted) print as 0 without
    raising ``UndefinedMetricWarning``; the numeric result is the same as
    scikit-learn's default behaviour.
    """
    print(f"--- {model_name} ---")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, zero_division=0))
    print("Recall:", recall_score(y_true, y_pred, zero_division=0))
    print("F1-score:", f1_score(y_true, y_pred, zero_division=0))
    # Rows are true classes, columns are predicted classes
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    
# Report both tuned models against the same validation labels, KNN first
for model_label, val_predictions in (
    ("KNN", y_val_pred_knn),
    ("Decision Tree", y_val_pred_dt),
):
    print_metrics(y_val, val_predictions, model_label)


--- KNN ---
Accuracy: 0.8396946564885496
Precision: 0.8421052631578947
Recall: 0.8
F1-score: 0.8205128205128205
Confusion Matrix:
 [[62  9]
 [12 48]]
--- Decision Tree ---
Accuracy: 0.8854961832061069
Precision: 0.8
Recall: 1.0
F1-score: 0.8888888888888888
Confusion Matrix:
 [[56 15]
 [ 0 60]]


#### **Observations**

1. **KNN (k=7)**

   * Good balance between precision and recall.
   * Misses some positive applications (false negatives = 12).

2. **Decision Tree**

   * Perfect recall (1.0) → all positive applications are detected.
   * Precision slightly lower → 15 false positives.
   * Best choice if **catching all positives is most important**.

3. **Accuracy**

   * The Decision Tree has higher accuracy (0.885 vs 0.840), so it performs slightly better overall on the validation set.

#### **Recommendation for Final Test Evaluation**

* Use the **Decision Tree** as the final model because it maximizes recall while maintaining good accuracy.
* Evaluate metrics on the **test set** (20% of the data) to report final performance.


### 9. Final Test Set Evaluation

In [42]:
# Final check with the Decision Tree fitted on the training split:
# predict the held-out test split and report the same metrics as on validation.
y_test_pred = best_dt.predict(X_test)
print_metrics(y_test, y_test_pred, "Decision Tree (Test Set)")

--- Decision Tree (Test Set) ---
Accuracy: 0.8473282442748091
Precision: 0.7746478873239436
Recall: 0.9322033898305084
F1-score: 0.8461538461538461
Confusion Matrix:
 [[56 16]
 [ 4 55]]
