In [9]:
from sklearn.datasets import load_iris
import pandas as pd

# Fetch the Iris data; the returned object supplies the feature matrix,
# the class labels, the feature names and the class names used below.
iris = load_iris()

# Assemble a single DataFrame: one column per feature, plus the class label
# in a trailing 'target' column.
df_iris = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

# Preview the first rows of the table.
print("Bộ dữ liệu Iris (5 hàng đầu):")
print(df_iris.head())

# List the class names that come with the dataset.
print("\nTên lớp mục tiêu:")
print(iris.target_names)

Bộ dữ liệu Iris (5 hàng đầu):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  

Tên lớp mục tiêu:
['setosa' 'versicolor' 'virginica']


In [10]:
from sklearn.model_selection import train_test_split

# The label column is the target (y); every other column is a feature (X).
y_iris = df_iris['target']
X_iris = df_iris.drop(columns='target')

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris,
    y_iris,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each of the four pieces.
for label, part in (
    ("Kích thước tập huấn luyện (features):", X_train_iris),
    ("Kích thước tập kiểm tra (features):", X_test_iris),
    ("Kích thước tập huấn luyện (target):", y_train_iris),
    ("Kích thước tập kiểm tra (target):", y_test_iris),
):
    print(label, part.shape)

Kích thước tập huấn luyện (features): (120, 4)
Kích thước tập kiểm tra (features): (30, 4)
Kích thước tập huấn luyện (target): (120,)
Kích thước tập kiểm tra (target): (30,)


## From Scratch - ID3

### 1. Tính entropy

In [11]:
import numpy as np

def calculate_entropy(target):
  """
  Calculates the Shannon entropy (in bits) of a target variable.

  The value is computed exactly: no epsilon is added inside the logarithm,
  so a single-class Series yields exactly 0.0 and k equally frequent classes
  yield exactly log2(k).

  Args:
    target: A pandas Series representing the target variable.

  Returns:
    The entropy as a float. An empty Series (or one whose value counts are
    empty) has entropy 0.0.
  """
  total = len(target)
  if total == 0:
    # No samples: define the entropy as 0 and avoid dividing by zero below.
    return 0.0

  probabilities = target.value_counts() / total
  # Zero-probability classes (e.g. unused categories of a categorical Series)
  # contribute nothing to the sum; dropping them keeps log2 away from 0
  # without distorting the other terms.
  probabilities = probabilities[probabilities > 0]

  entropy = -np.sum(probabilities * np.log2(probabilities))
  # Adding 0.0 normalises the negative zero produced for a pure node to 0.0.
  return float(entropy) + 0.0

# Entropy of the Iris training labels (y_train_iris).
# NOTE(review): the name `entropy_play` looks like a leftover from a different
# example; it is kept unchanged so any cell that reads it keeps working.
entropy_play = calculate_entropy(y_train_iris)
print(f"Entropy of the Iris training target: {entropy_play:.4f}")

Entropy of 'play' variable: 1.5847


### 2. Tính information gain

In [12]:
def calculate_information_gain(df, attribute_col, target_col):
  """
  Information gain obtained by partitioning `df` on a single attribute.

  Each distinct value of the attribute forms its own partition (rows are
  matched by exact equality), i.e. the column is treated as categorical.
  No threshold search is done for numeric columns: every distinct number
  becomes a separate partition.

  Args:
    df: A pandas DataFrame containing both the attribute and target columns.
    attribute_col: The name of the attribute column to partition on.
    target_col: The name of the target variable column.

  Returns:
    The entropy of the target over all of `df` minus the size-weighted
    entropy of the target inside each partition.
  """
  column = df[attribute_col]
  n_rows = len(df)
  parent_entropy = calculate_entropy(df[target_col])

  # One sub-frame per distinct attribute value, in order of first appearance.
  partitions = (df[column == level] for level in column.unique())

  # Weighted average of the partition entropies, weights = share of rows.
  children_entropy = sum(
      (len(part) / n_rows) * calculate_entropy(part[target_col])
      for part in partitions
  )

  return parent_entropy - children_entropy

# Information gain of every feature, measured on the training split only.
# calculate_information_gain reads the attribute and the target from one
# DataFrame, so the training features and labels are joined first.
# Caveat: the gain is computed per distinct value (no threshold search), so
# for these numeric columns it is only a rough indication of usefulness.
train_df_iris = X_train_iris.assign(target=y_train_iris.values)

print("Information Gain for Iris Dataset Features (on training data):")
for feature in X_train_iris.columns:
    ig = calculate_information_gain(train_df_iris, feature, 'target')
    print(f"Information Gain for '{feature}': {ig:.4f}")

Information Gain for Iris Dataset Features (on training data):
Information Gain for 'sepal length (cm)': 0.8779
Information Gain for 'sepal width (cm)': 0.5557
Information Gain for 'petal length (cm)': 1.4387
Information Gain for 'petal width (cm)': 1.4113


### 3. Xây dựng cây quyết định bằng ID3

In [13]:
def build_id3_tree(df, target_col, attributes):
    """
    Recursively builds an ID3 decision tree.

    Splits are made on exact attribute values: every distinct value of the
    chosen attribute becomes its own branch. Numeric columns are therefore
    treated as categorical; no split thresholds are searched.

    Args:
        df: The current DataFrame subset.
        target_col: The name of the target variable column.
        attributes: A list of attribute columns to consider for splitting.

    Returns:
        - None when `df` has no rows (there is nothing to learn from);
        - a class label (leaf) when all rows share one class, or when no
          attributes are left (then the most frequent class is returned);
        - otherwise a dict of the form {attribute: {value: subtree, ...}}.
    """
    # Base Case 0: no rows at all. This must be checked first, because the
    # majority-class lookup below raises on an empty target column.
    if df.empty:
        return None

    labels = df[target_col]

    # Base Case 1: every instance in the subset belongs to the same class.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # Base Case 2: nothing left to split on -> majority class.
    if not attributes:
        return labels.mode()[0]

    # Recursive step: pick the attribute with the highest information gain
    # (the first one wins on ties, following the order of `attributes`).
    information_gains = {
        attribute: calculate_information_gain(df, attribute, target_col)
        for attribute in attributes
    }
    best_attribute = max(information_gains, key=information_gains.get)

    # An attribute is used at most once along any root-to-leaf path.
    remaining_attributes = [attr for attr in attributes if attr != best_attribute]

    # One branch per distinct value of the chosen attribute. The subsets are
    # only read, never modified, so no defensive copy is needed.
    branches = {}
    for value in df[best_attribute].unique():
        subset = df[df[best_attribute] == value]
        branches[value] = build_id3_tree(subset, target_col, remaining_attributes)

    return {best_attribute: branches}

# Candidate split attributes: every feature column of the training set.
iris_attributes = list(X_train_iris.columns)

# build_id3_tree reads features and target from a single DataFrame, so the
# training features and labels are joined here.
train_df_iris_scratch = X_train_iris.assign(target=y_train_iris.values)

# Grow the tree on the training split only. Each distinct feature value
# becomes its own branch, so the resulting nested dict can be large.
id3_tree_iris_scratch = build_id3_tree(train_df_iris_scratch, 'target', iris_attributes)

# Not used by the statements below; retained in case another cell relies on
# the name `json` being imported here.
import json

# The tree itself is deliberately not printed (it would be very verbose);
# only a completion message is shown.
print("Built ID3 Tree for Iris Dataset:")
print("ID3 tree building complete. The structure can be complex due to continuous features.")

Built ID3 Tree for Iris Dataset:
ID3 tree building complete. The structure can be complex due to continuous features.


In [14]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score

# Define the predict_id3 function (assuming it's not defined elsewhere)
def predict_id3(tree, data_point, default_class=None):
    """
    Predicts the target value for a single data point using the ID3 tree.

    Args:
        tree: The ID3 decision tree. Either a leaf (any non-dict value, which
            is returned as the prediction) or a dict of the form
            {attribute: {value: subtree, ...}}.
        data_point: A dictionary representing a single data point
            (attribute name -> value). Only its .get() method is used.
        default_class: Label to return when the data point's value for the
            current attribute has no branch in the tree (this happens
            whenever a value was not seen for that attribute during
            training). When left as None, the function falls back to the
            majority class of the module-level `y_train_iris`, exactly as
            before; pass an explicit label to use the function with a tree
            trained on other data.

    Returns:
        The predicted target value.
    """
    node = tree
    # Walk down until a leaf (non-dict) is reached.
    while isinstance(node, dict):
        # Each internal node holds exactly one splitting attribute.
        attribute, branches = next(iter(node.items()))
        # .get() yields None for a missing attribute, which then takes the
        # "no matching branch" path below instead of raising KeyError.
        attribute_value = data_point.get(attribute)

        if attribute_value not in branches:
            print(f"Warning: Attribute value '{attribute_value}' not found for attribute '{attribute}' in the tree.")
            if default_class is not None:
                return default_class
            # Legacy fallback: majority class of the global training labels.
            return y_train_iris.mode()[0]

        node = branches[attribute_value]

    return node


# Steps 1-2 (splitting the data, building the ID3 tree) were done in earlier
# cells: X_test_iris / y_test_iris and id3_tree_iris_scratch are reused here.

# Step 3: predict a label for every test row. predict_id3 takes one
# {feature name: value} dict at a time, so each row is converted first.
predictions_scratch = [
    predict_id3(id3_tree_iris_scratch, row.to_dict())
    for _, row in X_test_iris.iterrows()
]

# Step 4: wrap the predictions in a Series that shares the test set's index.
y_pred_scratch = pd.Series(predictions_scratch, index=X_test_iris.index)

# Steps 5-6: compute the accuracy against the true test labels and print it.
accuracy_scratch = accuracy_score(y_test_iris, y_pred_scratch)
print(f"Độ chính xác của mô hình ID3 từ scratch: {accuracy_scratch:.2f}")

Độ chính xác của mô hình ID3 từ scratch: 0.83


## Dùng scikit-learn


In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# No new split is made here: the existing training and test sets are reused.
# Create the decision-tree classifier with a fixed seed and fit it on the
# training data (fit returns the fitted estimator itself).
model = DecisionTreeClassifier(random_state=42).fit(X_train_iris, y_train_iris)

# Predict the labels of the held-out test rows.
y_pred = model.predict(X_test_iris)

# Score the predictions: overall accuracy plus the per-class report.
report = classification_report(y_test_iris, y_pred)
accuracy = accuracy_score(y_test_iris, y_pred)

print(f"Độ chính xác của mô hình: {accuracy:.2f}")
print("\nBáo cáo phân loại:")
print(report)

Độ chính xác của mô hình: 1.00

Báo cáo phân loại:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

