In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Red-wine quality table from the UCI repository (the file is ';'-delimited).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
wine_data = pd.read_csv(url, sep=';')

# Quick sanity checks: first rows, column dtypes, and per-column missing counts.
print(wine_data.head())
print(wine_data.dtypes)
print(wine_data.isnull().sum())

# Discard any row that has a missing value before building the arrays.
wine_data = wine_data.dropna()

# Every column except the last one is used as a feature.
feature_columns = wine_data.columns[:-1]
X = wine_data[feature_columns].to_numpy(dtype=float)

# Binary target: 1 when quality >= 6, otherwise 0.
y = (wine_data['quality'] >= 6).astype(int).to_numpy()

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Rows for the from-scratch tree: the feature columns followed by the label column.
train_data = np.column_stack((X_train, y_train))


   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

CART

In [None]:
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Args:
        groups: Sequence of row groups (e.g. the two halves produced by a
            split). Each row is a sequence whose last element is its class
            label. Empty groups are skipped.
        classes: Class labels to score. A label that does not occur in a
            group simply contributes a proportion of 0 for that group.

    Returns:
        float: The sum over non-empty groups of
        ``(1 - sum(p_k ** 2)) * (group_size / total_rows)``, where ``p_k`` is
        the share of rows in the group labelled with class ``k``. Returns 0.0
        when all groups are empty.
    """
    n_instances = sum(len(group) for group in groups)
    gini = 0.0
    for group in groups:
        size = len(group)
        if size == 0:
            # An empty side adds nothing and would otherwise divide by zero.
            continue
        # Extract the labels once per group instead of once per class value.
        labels = [row[-1] for row in group]
        score = 0.0
        for class_val in classes:
            proportion = labels.count(class_val) / float(size)
            score += proportion * proportion
        # Weight the group's impurity by its share of all rows.
        gini += (1.0 - score) * (size / n_instances)
    return gini

In [None]:
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right
def to_terminal(group):
    """Return the most frequent class label (last element of each row) in ``group``."""
    votes = [sample[-1] for sample in group]
    distinct_labels = set(votes)
    # Pick the label with the highest occurrence count among the rows.
    return max(distinct_labels, key=votes.count)

In [None]:
def split(node, max_depth, min_size, depth):
    """Recursively grow the subtree below ``node``, modifying it in place.

    ``node`` must carry a ``'groups'`` entry holding its ``(left, right)`` row
    lists. That entry is removed and replaced by ``'left'`` and ``'right'``
    entries, each of which is either a class label (leaf) or a nested node
    dict produced by ``get_split``.

    Args:
        node: Node dict as returned by ``get_split``.
        max_depth: Depth at which both children are forced to be leaves.
        min_size: A child with this many rows or fewer becomes a leaf.
        depth: Depth of ``node`` itself.
    """
    left, right = node['groups']
    del node['groups']

    # The split put every row on one side: both children share one leaf label.
    if not (left and right):
        node['left'] = node['right'] = to_terminal(left + right)
        return

    # Depth budget exhausted: turn both sides into leaves.
    if depth >= max_depth:
        node['left'], node['right'] = map(to_terminal, (left, right))
        return

    # Otherwise handle the left child first, then the right one.
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = to_terminal(rows)
        else:
            node[side] = get_split(rows, max_depth, min_size, depth + 1)
            split(node[side], max_depth, min_size, depth + 1)

In [None]:
# Set to True to print one line per candidate threshold while searching.
# Off by default: the trace is so long that the notebook front end truncates it.
TRACE_CANDIDATE_SPLITS = False


def get_split(dataset, max_depth, min_size, depth=1):
    """Find the best binary split of ``dataset`` by exhaustive search.

    Each distinct value of each feature column is tried as a threshold (rows
    with ``row[index] < value`` go left, all others go right) and the
    candidate with the lowest weighted Gini index is kept. When several
    candidates tie, the first one encountered wins.

    Args:
        dataset: Non-empty sequence of rows; the last element of every row is
            the class label and all preceding elements are features.
        max_depth: Not used by the search; accepted for call compatibility.
        min_size: Not used by the search; accepted for call compatibility.
        depth: Not used by the search; accepted for call compatibility.

    Returns:
        dict: ``{'index': feature_index, 'value': threshold,
        'groups': (left_rows, right_rows)}`` describing the chosen split.
    """
    class_values = list(set(row[-1] for row in dataset))
    # Sentinels; any real Gini value is below 999, so the first candidate replaces them.
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(len(dataset[0]) - 1):
        tried = set()
        for row in dataset:
            value = row[index]
            # A repeated threshold yields the same groups and the same Gini,
            # so it can never beat the first occurrence; skip it.
            if value in tried:
                continue
            tried.add(value)
            groups = test_split(index, value, dataset)
            gini = gini_index(groups, class_values)
            if TRACE_CANDIDATE_SPLITS:
                print(f"Testing split: Feature index {index}, Feature value {value}, Gini = {gini:.4f}")
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, value, gini, groups
    print(f"\nBest split: Feature index {b_index}, Feature value {b_value}, Gini = {b_score:.4f}\n")
    return {'index': b_index, 'value': b_value, 'groups': b_groups}

In [None]:
def build_tree(train, max_depth, min_size):
    """Build a decision tree from ``train`` and return its root node dict.

    The root split is chosen with ``get_split`` and the tree is then grown in
    place by ``split``, starting at depth 1.
    """
    tree = get_split(train, max_depth, min_size)
    split(tree, max_depth, min_size, depth=1)
    return tree

In [None]:
def predict(node, row):
    """Return the leaf label reached by routing ``row`` down the tree at ``node``.

    At every node the row goes to ``'left'`` when ``row[node['index']]`` is
    strictly less than ``node['value']`` and to ``'right'`` otherwise. A child
    that is a dict is another decision node; anything else is the prediction.
    """
    while True:
        side = 'left' if row[node['index']] < node['value'] else 'right'
        node = node[side]
        if not isinstance(node, dict):
            return node

In [None]:
# Grow the from-scratch CART tree on the training rows, classify every held-out
# row with it, and score the predictions against the true test labels.
cart_tree = build_tree(train=train_data, max_depth=5, min_size=10)
cart_predictions = [predict(cart_tree, sample) for sample in X_test]
cart_accuracy = accuracy_score(y_true=y_test, y_pred=cart_predictions)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Testing split: Feature index 7, Feature value 0.9951, Gini = 0.2096
Testing split: Feature index 7, Feature value 0.99528, Gini = 0.2099
Testing split: Feature index 7, Feature value 0.99397, Gini = 0.2112
Testing split: Feature index 7, Feature value 0.99552, Gini = 0.2105
Testing split: Feature index 7, Feature value 0.9976, Gini = 0.2101
Testing split: Feature index 7, Feature value 0.9947, Gini = 0.2110
Testing split: Feature index 7, Feature value 0.999, Gini = 0.2106
Testing split: Feature index 7, Feature value 0.99524, Gini = 0.2102
Testing split: Feature index 7, Feature value 0.99674, Gini = 0.2094
Testing split: Feature index 7, Feature value 0.99508, Gini = 0.2098
Testing split: Feature index 7, Feature value 0.9956, Gini = 0.2106
Testing split: Feature index 7, Feature value 0.9934, Gini = 0.2099
Testing split: Feature index 7, Feature value 0.99652, Gini = 0.2076
Testing split: Feature index 7, Feature value

In [None]:
# Overall accuracy of the from-scratch CART tree on the test split.
print("CART Accuracy:", cart_accuracy)

# Per-class precision / recall / F1 for the same predictions.
print(
    "CART Classification Report:\n",
    classification_report(y_true=y_test, y_pred=cart_predictions),
)

CART Accuracy: 0.725
CART Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.76      0.72       223
           1       0.77      0.70      0.73       257

    accuracy                           0.72       480
   macro avg       0.73      0.73      0.72       480
weighted avg       0.73      0.72      0.73       480



ID3 & C4.5 (approximated with scikit-learn's entropy-criterion decision tree)

In [None]:
# scikit-learn's DecisionTreeClassifier is an optimised CART implementation, so this
# is an entropy-criterion (information-gain) binary tree standing in for ID3, not a
# literal ID3 tree.
# random_state is pinned because scikit-learn permutes the features at every split,
# so ties between equally good splits would otherwise be broken differently per run.
id3_model = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)
id3_model.fit(X_train, y_train)
id3_predictions = id3_model.predict(X_test)
id3_accuracy = accuracy_score(y_test, id3_predictions)
print("ID3 Accuracy:", id3_accuracy)
print("ID3 Classification Report:\n", classification_report(y_test, id3_predictions))

ID3 Accuracy: 0.7229166666666667
ID3 Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.78      0.72       223
           1       0.78      0.68      0.72       257

    accuracy                           0.72       480
   macro avg       0.73      0.73      0.72       480
weighted avg       0.73      0.72      0.72       480



In [None]:
# C4.5 selects splits by gain ratio, which scikit-learn does not offer; the closest
# available option is the entropy criterion, so this is an entropy-based CART tree
# standing in for C4.5 rather than a literal C4.5 tree.
# random_state is pinned because scikit-learn permutes the features at every split,
# so ties between equally good splits would otherwise be broken differently per run.
c45_model = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)
c45_model.fit(X_train, y_train)
c45_predictions = c45_model.predict(X_test)
c45_accuracy = accuracy_score(y_test, c45_predictions)
print("C4.5 Accuracy:", c45_accuracy)
print("C4.5 Classification Report:\n", classification_report(y_test, c45_predictions))

C4.5 Accuracy: 0.725
C4.5 Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.78      0.72       223
           1       0.78      0.68      0.73       257

    accuracy                           0.72       480
   macro avg       0.73      0.73      0.72       480
weighted avg       0.73      0.72      0.73       480



In [None]:
# Side-by-side accuracy of the three classifiers, each scored against y_test.
print(f"\nComparison of Accuracy:\nCART: {cart_accuracy}\nID3: {id3_accuracy}\nC4.5: {c45_accuracy}")


Comparison of Accuracy:
CART: 0.725
ID3: 0.7229166666666667
C4.5: 0.725
