In [13]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Append the default Windows Graphviz install directory to this process's PATH.
# Presumably this lets the graphviz package find the `dot` executable when
# rendering; NOTE(review): the path is machine-specific — confirm or adjust on
# systems where Graphviz is installed elsewhere.
os.environ["PATH"] += os.pathsep + r'C:\Program Files\Graphviz\bin'  
from graphviz import Source

In [14]:
# Load the red-wine quality CSV (semicolon-separated) and derive a 3-class target
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(url, sep=";")


def _quality_band(score):
    """Map a numeric quality score to "Low" (<= 5), "Medium" (== 6) or "High" (otherwise)."""
    if score <= 5:
        return "Low"
    if score == 6:
        return "Medium"
    return "High"


# Replace the numeric score with its band; the banded column is the target,
# every remaining column is a feature.
data["quality"] = data["quality"].apply(_quality_band)
y = data["quality"]
X = data.drop(columns="quality")

In [18]:
# 2.1 Prepare datasets
# One stratified, shuffled split per (train, test) proportion pair. Each half is
# stored in `subsets` under "train_<pct>" / "test_<pct>" as an (X, y) tuple.
proportions = [(0.4, 0.6), (0.6, 0.4), (0.8, 0.2), (0.9, 0.1)]
subsets = {}
for train_size, test_size in proportions:
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        train_size=train_size,
        test_size=test_size,
        stratify=y,       # keep the class mix the same in both halves
        shuffle=True,
        random_state=42,  # fixed seed so every run yields the same split
    )
    subsets.update(
        {
            f"train_{int(train_size*100)}": (X_train, y_train),
            f"test_{int(test_size*100)}": (X_test, y_test),
        }
    )
# NOTE: the class-distribution visualization (save as .png) is not produced in this cell.


In [19]:
# 3. Build decision trees (Section 2.2)
# Fit one entropy-criterion tree per training proportion and render it to PDF.
os.makedirs("../outputs/wine_quality_analysis/trees", exist_ok=True)
for train_size, _ in proportions:
    train_key = f"train_{int(train_size*100)}"
    X_train, y_train = subsets[train_key]
    clf = DecisionTreeClassifier(criterion="entropy", random_state=42)
    clf.fit(X_train, y_train)
    dot_data = tree.export_graphviz(
        clf,
        out_file=None,
        # Plain lists are accepted by every scikit-learn release's validation.
        feature_names=list(X_train.columns),
        # export_graphviz pairs class_names positionally with clf.classes_,
        # which is sorted (High, Low, Medium). Hard-coding
        # ["Low", "Medium", "High"] mislabels every node, so take the names
        # from the fitted model instead.
        class_names=[str(label) for label in clf.classes_],
        filled=True,
        rounded=True,
    )
    graph = Source(dot_data)
    out_base = f"../outputs/wine_quality_analysis/trees/wine_quality_tree_train_{int(train_size*100)}"
    # render() writes <out_base>.pdf; cleanup=True removes the intermediate DOT file.
    graph.render(out_base, format="pdf", cleanup=True)
    print(f"Successfully created {out_base}.pdf")

Successfully created ../outputs/wine_quality_analysis/trees/wine_quality_tree_train_40.pdf
Successfully created ../outputs/wine_quality_analysis/trees/wine_quality_tree_train_60.pdf
Successfully created ../outputs/wine_quality_analysis/trees/wine_quality_tree_train_80.pdf
Successfully created ../outputs/wine_quality_analysis/trees/wine_quality_tree_train_90.pdf


In [20]:
# 4. Evaluate decision trees (Section 2.3)
# For every train/test proportion: refit the tree, write the classification
# report to a text file and save the confusion matrix as a PNG.
os.makedirs("../outputs/wine_quality_analysis/matrices", exist_ok=True)
os.makedirs("../outputs/wine_quality_analysis/reports", exist_ok=True)
# Row/column order for the confusion matrix. It must be passed to BOTH
# confusion_matrix and the display: without `labels=`, confusion_matrix uses
# sorted label order (High, Low, Medium), so the axis tick labels would not
# match the counts they annotate.
class_order = ["Low", "Medium", "High"]
for train_size, test_size in proportions:
    train_key = f"train_{int(train_size*100)}"
    test_key = f"test_{int(test_size*100)}"
    X_train, y_train = subsets[train_key]
    X_test, y_test = subsets[test_key]
    clf = DecisionTreeClassifier(criterion="entropy", random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    with open(f"../outputs/wine_quality_analysis/reports/classification_report_{int(train_size*100)}_{int(test_size*100)}.txt", "w") as f:
        f.write(classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred, labels=class_order)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_order)
    disp.plot()
    # disp.plot() creates the current figure; save it, then close it so figures
    # do not accumulate across loop iterations.
    plt.savefig(f"../outputs/wine_quality_analysis/matrices/confusion_matrix_{int(train_size*100)}_{int(test_size*100)}.png")
    plt.close()

In [24]:
# 5. Depth and accuracy (Section 2.4)
# On the 80/20 split, fit one tree per max_depth setting, record its test
# accuracy in `accuracies` (same order as `depths`) and render each tree to PDF.
os.makedirs("../outputs/wine_quality_analysis/acc", exist_ok=True)
X_train, y_train = subsets["train_80"]
X_test, y_test = subsets["test_20"]
depths = [None, 2, 3, 4, 5, 6, 7]  # None = no depth limit
accuracies = []
for depth in depths:
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=depth, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)
    dot_data = tree.export_graphviz(
        clf,
        out_file=None,
        # Plain lists are accepted by every scikit-learn release's validation.
        feature_names=list(X_train.columns),
        # class_names are matched positionally with clf.classes_ (sorted:
        # High, Low, Medium). A hard-coded ["Low", "Medium", "High"] mislabels
        # the nodes, so read the names from the fitted model.
        class_names=[str(label) for label in clf.classes_],
        filled=True,
        rounded=True,
    )
    graph = Source(dot_data)
    out_base = f"../outputs/wine_quality_analysis/trees/tree_depth_{depth if depth else 'None'}"
    # render() writes <out_base>.pdf; cleanup=True removes the intermediate DOT file.
    graph.render(out_base, format="pdf", cleanup=True)
    print(f"Successfully created {out_base}.pdf")

# Plot accuracy vs depth
# Make sure the charts directory exists before saving into it.
os.makedirs("../outputs/wine_quality_analysis/charts", exist_ok=True)
depth_labels = ["None" if not d else str(d) for d in depths]  # categorical x-axis labels
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(depth_labels, accuracies, marker="o")
ax.set_xlabel("max_depth")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy vs max_depth for Wine Quality (80/20)")
ax.grid(True)
fig.savefig("../outputs/wine_quality_analysis/charts/accuracy_vs_depth.png")
plt.close(fig)

Successfully created ../outputs/wine_quality_analysis/trees/tree_depth_None.pdf
Successfully created ../outputs/wine_quality_analysis/trees/tree_depth_2.pdf
Successfully created ../outputs/wine_quality_analysis/trees/tree_depth_3.pdf
Successfully created ../outputs/wine_quality_analysis/trees/tree_depth_4.pdf
Successfully created ../outputs/wine_quality_analysis/trees/tree_depth_5.pdf
Successfully created ../outputs/wine_quality_analysis/trees/tree_depth_6.pdf
Successfully created ../outputs/wine_quality_analysis/trees/tree_depth_7.pdf
