# Assignment 1-1

## Import thư viện

In [22]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

## Load dữ liệu Digits

In [24]:
digits = load_digits()
X, y = digits.data, digits.target
print(X)
print(y)

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]
[0 1 2 ... 8 9 8]


## Chia dữ liệu 60% - 20% - 20%

In [26]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print("Train:", len(X_train))
print("Val:", len(X_val))
print("Test:", len(X_test))

Train: 1078
Val: 359
Test: 360


## Các mô hình và danh sách siêu tham số

In [28]:
models = {
    "Logistic Regression": (
        LogisticRegression(max_iter=2000),
        {"C": [0.1, 1.0, 10]}
    ),
    "Decision Tree": (
        DecisionTreeClassifier(),
        {"max_depth": [5, 10, 15, None]}
    ),
    "KNN": (
        KNeighborsClassifier(),
        {"n_neighbors": [3, 5, 7, 9]}
    ),
    "Neural Network": (
        MLPClassifier(max_iter=300),
        {"hidden_layer_sizes": [(50,), (100,), (50,50)]}
    )
}

## Huấn luyện + chọn siêu tham số theo tập VALIDATION

In [30]:
best_models = {}
results = {}

for name, (model, params) in models.items():
    best_acc = 0
    best_param = None
    best_model = None

    print(f"\n---- {name} ----")

    for param_name, param_values in params.items():
        for v in param_values:
            model.set_params(**{param_name: v})
            model.fit(X_train, y_train)

            pred_val = model.predict(X_val)
            acc = accuracy_score(y_val, pred_val)

            print(f"{param_name} = {v} → Val Acc = {acc:.4f}")

            if acc > best_acc:
                best_acc = acc
                best_param = (param_name, v)
                best_model = model

    best_models[name] = best_model
    results[name] = (best_param, best_acc)


---- Logistic Regression ----
C = 0.1 → Val Acc = 0.9694
C = 1.0 → Val Acc = 0.9638
C = 10 → Val Acc = 0.9694

---- Decision Tree ----
max_depth = 5 → Val Acc = 0.7159
max_depth = 10 → Val Acc = 0.8440
max_depth = 15 → Val Acc = 0.8357
max_depth = None → Val Acc = 0.8301

---- KNN ----
n_neighbors = 3 → Val Acc = 0.9889
n_neighbors = 5 → Val Acc = 0.9861
n_neighbors = 7 → Val Acc = 0.9833
n_neighbors = 9 → Val Acc = 0.9833

---- Neural Network ----
hidden_layer_sizes = (50,) → Val Acc = 0.9721
hidden_layer_sizes = (100,) → Val Acc = 0.9777
hidden_layer_sizes = (50, 50) → Val Acc = 0.9749


## Đánh giá mô hình tốt nhất trên tập TEST

In [32]:
test_scores = {}

for name, model in best_models.items():
    pred_test = model.predict(X_test)
    acc = accuracy_score(y_test, pred_test)
    test_scores[name] = acc
    print(f"{name}: Test Accuracy = {acc:.4f}")

Logistic Regression: Test Accuracy = 0.9639
Decision Tree: Test Accuracy = 0.8528
KNN: Test Accuracy = 0.9778
Neural Network: Test Accuracy = 0.9667


## So sánh các mô hình

In [34]:
for name in test_scores:
    print(f"{name}: best param = {results[name][0]}, "
          f"val_acc = {results[name][1]:.4f}, "
          f"test_acc = {test_scores[name]:.4f}")

Logistic Regression: best param = ('C', 0.1), val_acc = 0.9694, test_acc = 0.9639
Decision Tree: best param = ('max_depth', 10), val_acc = 0.8440, test_acc = 0.8528
KNN: best param = ('n_neighbors', 3), val_acc = 0.9889, test_acc = 0.9778
Neural Network: best param = ('hidden_layer_sizes', (100,)), val_acc = 0.9777, test_acc = 0.9667


In [None]:
## Bảng tổng hợp kêt

In [None]:
rows = []

for name, model in best_models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average=None)   # recall cho từng lớp

    row = [name, acc] + list(recall)
    rows.append(row)

cols = ["Model", "Accuracy"] + [f"Recall_{i}" for i in range(10)]
df_results = pd.DataFrame(rows, columns=cols)

df_results