<a href="https://colab.research.google.com/github/HiGiangcoder/jupyter_notebook/blob/master/Assignment0_MCS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Xây dựng Entropy và Information Gain

In [2]:
def entropy(y):
    counts = Counter(y)
    total = len(y)
    return -sum((count/total) * np.log2(count/total) for count in counts.values() if count > 0)

def information_gain(X_column, y):
    parent_entropy = entropy(y)
    values = np.unique(X_column)
    n = len(y)

    weighted_entropy = 0
    for v in values:
        y_subset = y[X_column == v]
        weighted_entropy += (len(y_subset) / n) * entropy(y_subset)

    return parent_entropy - weighted_entropy


## Node định nghĩa cây

In [3]:
class DecisionNode:
    def __init__(self, feature=None, children=None, prediction=None):
        self.feature = feature
        self.children = children or {}
        self.prediction = prediction

## Hàm ID3 đệ quy

In [4]:
def id3(X, y, features):
    # Nếu tất cả nhãn giống nhau
    if len(set(y)) == 1:
        return DecisionNode(prediction=y[0])

    # Nếu hết thuộc tính
    if len(features) == 0:
        most_common = Counter(y).most_common(1)[0][0]
        return DecisionNode(prediction=most_common)

    # Chọn feature tốt nhất
    gains = [information_gain(X[:, i], y) for i in range(len(features))]
    best_idx = np.argmax(gains)
    best_feature = features[best_idx]

    if gains[best_idx] == 0:
        most_common = Counter(y).most_common(1)[0][0]
        return DecisionNode(prediction=most_common)

    node = DecisionNode(feature=best_feature)
    feature_values = np.unique(X[:, best_idx])

    for value in feature_values:
        mask = X[:, best_idx] == value
        child_X = X[mask]
        child_y = y[mask]

        child_features = features[:best_idx] + features[best_idx+1:]
        child_X = np.delete(child_X, best_idx, axis=1)

        node.children[value] = id3(child_X, child_y, child_features)

    return node

## Hàm dự đoán

In [5]:
def predict_one(node, x, features):
    if node.prediction is not None:
        return node.prediction
    value = x[features.index(node.feature)]
    if value in node.children:
        child = node.children[value]
        new_x = list(x[:features.index(node.feature)] + x[features.index(node.feature)+1:])
        new_features = features[:features.index(node.feature)] + features[features.index(node.feature)+1:]
        return predict_one(child, new_x, new_features)
    else:
        # Nếu giá trị chưa xuất hiện trong training thì trả về nhãn phổ biến nhất
        predictions = [child.prediction for child in node.children.values() if child.prediction is not None]
        if predictions:
            return Counter(predictions).most_common(1)[0][0]
        return None

def predict(node, X, features):
    return [predict_one(node, x, features) for x in X]

## Demo với Wine dataset

In [6]:
data = load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Rời rạc hóa: chia mỗi cột thành 3 bins (low, medium, high)
df_discrete = df.copy()
for col in data.feature_names:
    df_discrete[col] = pd.qcut(df[col], q=3, labels=[0,1,2])

X = df_discrete[data.feature_names].values
y = df_discrete['target'].values
features = list(data.feature_names)

# Chia train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Huấn luyện ID3
tree = id3(X_train, y_train, features)

# Dự đoán
y_pred = predict(tree, X_test, features)

# Đánh giá
acc = accuracy_score(y_test, y_pred)
print("Độ chính xác (accuracy):", acc)

IndexError: list index out of range