In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Set up

In [15]:
# Import necessary packages
import os 
import numpy as np 
from pathlib import Path
import itertools
import pandas as pd

### Set up parameters

In [16]:
# Hyperparameter search space for the KNN fine-tuning runs.
param_grid = {
    'n_neighbors': list(range(1, 11)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
    'metric': ['minkowski'],
    'init_log': [False],
}

# Build every combination of the options above. Each tuple is ordered
# (n_neighbors, weights, p, metric, init_log), which is the order the
# training loops unpack it in.
param_combinations = list(
    itertools.product(
        *(
            param_grid[name]
            for name in ('n_neighbors', 'weights', 'p', 'metric', 'init_log')
        )
    )
)

### Set up data

In [17]:
# Paths of the train / test / validation CSV splits inside the data folder.
data_folder = '../data'
train_dir, test_dir, val_dir = (
    os.path.join(data_folder, csv_name)
    for csv_name in ('train.csv', 'test.csv', 'val.csv')
)

In [18]:
# Load data
def load_data(data_dir, target_col='TARGET'):
    """Read a CSV file and split it into features and target.

    Parameters
    ----------
    data_dir : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.
    target_col : str, default 'TARGET'
        Name of the label column to separate from the features.

    Returns
    -------
    X : pandas.DataFrame
        Every column except ``target_col``.
    y : pandas.Series
        The ``target_col`` column.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of the loaded file.
    """
    df = pd.read_csv(data_dir)
    # Fail early with a message that names the offending column and source.
    if target_col not in df.columns:
        raise KeyError(f"Target column {target_col!r} not found in {data_dir!r}")
    X = df.drop(columns=[target_col])
    y = df[target_col]
    return X, y

# Load the three splits, reading the files in the order train, test, validation.
(X_train, y_train), (X_test, y_test), (X_val, y_val) = [
    load_data(csv_path) for csv_path in (train_dir, test_dir, val_dir)
]

### Set up folder for logging

In [19]:
# Folder that receives every log / result file of the KNN experiments;
# it is created (with any missing parents) if it does not exist yet.
experiment_dir = Path('./output/KNN')
os.makedirs(experiment_dir, exist_ok=True)

# KNN from scratch

In [20]:
# Make the project's src/ directory importable from this notebook.
import sys
project_root = Path('.').resolve().parent  # Step one level up, out of notebook/
src_path = project_root / 'src'
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

# Import the project modules, then reload them so that the names imported
# below come from the freshly reloaded modules. The reload order matters.
# NOTE: `model` and `train` are bound to packages here; later cells rebind
# `model` to a fitted estimator.
import importlib
import train.knn_trainer
import model.knn
importlib.reload(model.knn)            # reload the dependency module first
importlib.reload(train.knn_trainer)   # then reload the main module
from model.knn import KNeighbors
from train.knn_trainer import KNeighborsTrainer

In [21]:
# Collect one record per hyperparameter combination, then build the
# DataFrame once at the end.
results = []

for i, (n_neighbors, weights, p, metric, init_log) in enumerate(param_combinations):
    params = dict(
        n_neighbors=n_neighbors,
        weights=weights,
        p=p,
        metric=metric,
        init_log=init_log,
    )
    # Snapshot the hyperparameters for the results row before training.
    record = dict(params)

    config = dict(params=params)

    # Train the from-scratch KNN and evaluate it on the validation split.
    trainer = KNeighborsTrainer(config)
    model, train_metrics, val_metrics = trainer.train(
        X_train, y_train, X_val, y_val, experiment_dir
    )

    # Append the validation metrics after the hyperparameter columns.
    for metric_name in ('accuracy', 'precision', 'recall', 'f1'):
        record[metric_name] = val_metrics[metric_name]
    results.append(record)

# Convert the collected records into a DataFrame.
scratch_results_df = pd.DataFrame(results)

# Persist the full grid-search table as CSV.
scratch_results_csv = experiment_dir / 'scratch_finetune_results.csv'
scratch_results_df.to_csv(scratch_results_csv, index=False)

# (Optional) Show the best configurations ranked by F1-score.
top_by_f1 = scratch_results_df.sort_values(by='f1', ascending=False).head()
print(top_by_f1)

2025-06-03 13:46:07,209 - INFO - K Nearest Neighbors with parameters: {'n_neighbors': 1, 'weights': 'uniform', 'p': 1, 'metric': 'minkowski', 'init_log': False}
Predicting (batched): 100%|██████████| 1327/1327 [00:02<00:00, 588.64it/s]
Predicting (batched): 100%|██████████| 443/443 [00:01<00:00, 223.58it/s]
2025-06-03 13:46:11,752 - INFO - Training metrics:
Accuracy: 0.7838
Precision: 0.7849
Recall: 0.7838
F1 Score: 0.7843
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.83      0.84     89271
           1       0.67      0.68      0.67     43398

    accuracy                           0.78    132669
   macro avg       0.75      0.76      0.76    132669
weighted avg       0.78      0.78      0.78    132669



2025-06-03 13:46:11,752 - INFO - Validation results:
Accuracy: 0.6829
Precision: 0.6829
Recall: 0.6829
F1 Score: 0.6829
Classification Report:
              precision    recall  f1-score   support

           0       0.76

    n_neighbors  weights  p     metric  init_log  accuracy  precision  \
33            9  uniform  2  minkowski     False  0.746241   0.740669   
32            9  uniform  1  minkowski     False  0.745924   0.740077   
37           10  uniform  2  minkowski     False  0.743753   0.738221   
36           10  uniform  1  minkowski     False  0.743753   0.738002   
29            8  uniform  2  minkowski     False  0.739819   0.734483   

      recall        f1  
33  0.746241  0.742513  
32  0.745924  0.741942  
37  0.743753  0.740099  
36  0.743753  0.739900  
29  0.739819  0.736386  


In [22]:
# Select the from-scratch configuration with the highest validation F1-score.
best_result = scratch_results_df.loc[scratch_results_df['f1'].idxmax()]

# A row pulled out of the DataFrame holds numpy scalars (np.int64, np.bool_).
# Unwrap them into plain Python values so the dict prints cleanly and can be
# serialised or passed on as a config without numpy types leaking through.
best_params = {
    name: value.item() if isinstance(value, np.generic) else value
    for name, value in best_result[
        ['n_neighbors', 'weights', 'p', 'metric', 'init_log']
    ].items()
}

print("Best parameters based on F1-score:")
print(best_params)
print("\nValidation metrics:")
print(f"Accuracy: {best_result['accuracy']:.4f}")
print(f"Precision: {best_result['precision']:.4f}")
print(f"Recall: {best_result['recall']:.4f}")
print(f"F1-score: {best_result['f1']:.4f}")


Best parameters based on F1-score:
{'n_neighbors': np.int64(9), 'weights': 'uniform', 'p': np.int64(2), 'metric': 'minkowski', 'init_log': np.False_}

Validation metrics:
Accuracy: 0.7462
Precision: 0.7407
Recall: 0.7462
F1-score: 0.7425


In [23]:
# plot into chart 

# KNN Sklearn

In [24]:
# import lib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import logging

In [30]:
def _score_predictions(y_true, y_pred):
    """Return accuracy plus macro-averaged precision, recall and F1 as a dict."""
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='macro'),
        'recall': recall_score(y_true, y_pred, average='macro'),
        'f1': f1_score(y_true, y_pred, average='macro'),
    }


def _log_metrics(header, metrics, y_true, y_pred):
    """Log a header, the four summary metrics and the classification report."""
    logging.info(header)
    logging.info(f"Accuracy: {metrics['accuracy']:.4f}")
    logging.info(f"Precision: {metrics['precision']:.4f}")
    logging.info(f"Recall: {metrics['recall']:.4f}")
    logging.info(f"F1 Score: {metrics['f1']:.4f}")
    logging.info("Classification Report:\n" + classification_report(y_true, y_pred))


# NOTE(review): metrics here are macro-averaged. The from-scratch trainer's
# logged recall equals its accuracy, which suggests a different averaging mode
# there - confirm before comparing the two result tables directly.
results = []

# init_log is not a scikit-learn parameter; it is only carried into the results
# row so this table has the same columns as the from-scratch one.
for n_neighbors, weights, p, metric, init_log in param_combinations:
    params = {
        'n_neighbors': n_neighbors,
        'weights': weights,
        'p': p,
        'metric': metric,
        'algorithm': 'kd_tree',
    }
    # Log the model configuration.
    logging.info(f"Training KNN using sklearn with parameters: {params}")

    # Build and fit the scikit-learn model.
    model = KNeighborsClassifier(**params)
    model.fit(X_train, y_train)

    # Predict on both splits.
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)

    # Compute the metrics for both splits.
    train_metrics = _score_predictions(y_train, y_pred_train)
    val_metrics = _score_predictions(y_val, y_pred_val)

    # Record the hyperparameters followed by the validation metrics.
    results.append({
        'n_neighbors': n_neighbors,
        'weights': weights,
        'p': p,
        'metric': metric,
        'init_log': init_log,
        **val_metrics,
    })

    _log_metrics("Training metrics:", train_metrics, y_train, y_pred_train)
    _log_metrics("Validation results:", val_metrics, y_val, y_pred_val)

# Convert the collected records into a DataFrame.
sklearn_results_df = pd.DataFrame(results)

# Persist the full grid-search table as CSV.
sklearn_results_df.to_csv(experiment_dir / 'sklearn_finetune_results.csv', index=False)

# Show the top 5 configurations ranked by F1-score.
print(sklearn_results_df.sort_values(by='f1', ascending=False).head())

2025-06-03 14:12:24,681 - INFO - Training KNN using sklearn with parameters: {'n_neighbors': 1, 'weights': 'uniform', 'p': 1, 'metric': 'minkowski', 'algorithm': 'kd_tree'}
2025-06-03 14:12:41,842 - INFO - Training metrics:
2025-06-03 14:12:41,843 - INFO - Accuracy: 0.7861
2025-06-03 14:12:41,844 - INFO - Precision: 0.7592
2025-06-03 14:12:41,845 - INFO - Recall: 0.7610
2025-06-03 14:12:41,845 - INFO - F1 Score: 0.7601
2025-06-03 14:12:41,881 - INFO - Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.84      0.84     88462
           1       0.68      0.69      0.68     44207

    accuracy                           0.79    132669
   macro avg       0.76      0.76      0.76    132669
weighted avg       0.79      0.79      0.79    132669

2025-06-03 14:12:41,882 - INFO - Validation results:
2025-06-03 14:12:41,883 - INFO - Accuracy: 0.6841
2025-06-03 14:12:41,883 - INFO - Precision: 0.6464
2025-06-03 14:12:41,884 - INFO - Recall:

    n_neighbors  weights  p     metric  init_log  accuracy  precision  \
33            9  uniform  2  minkowski     False  0.741017   0.707861   
32            9  uniform  1  minkowski     False  0.740927   0.707756   
24            7  uniform  1  minkowski     False  0.736404   0.702710   
25            7  uniform  2  minkowski     False  0.736359   0.702659   
37           10  uniform  2  minkowski     False  0.745381   0.713843   

      recall        f1  
33  0.703019  0.705234  
32  0.702900  0.705121  
24  0.699101  0.700785  
25  0.699051  0.700734  
37  0.690217  0.698138  


In [None]:
# Select the scikit-learn configuration with the highest validation F1-score.
best_result = sklearn_results_df.loc[sklearn_results_df['f1'].idxmax()]

# A row pulled out of the DataFrame holds numpy scalars (np.int64, np.bool_).
# Unwrap them into plain Python values so the dict prints cleanly and can be
# serialised or passed on as a config without numpy types leaking through.
best_params = {
    name: value.item() if isinstance(value, np.generic) else value
    for name, value in best_result[
        ['n_neighbors', 'weights', 'p', 'metric', 'init_log']
    ].items()
}

print("Best parameters based on F1-score:")
print(best_params)
print("\nValidation metrics:")
print(f"Accuracy: {best_result['accuracy']:.4f}")
print(f"Precision: {best_result['precision']:.4f}")
print(f"Recall: {best_result['recall']:.4f}")
print(f"F1-score: {best_result['f1']:.4f}")


In [None]:
# plot into charts using finetune_results.csv