#PREPARING DATASET

In [None]:
%pip install pandas numpy matplotlib seaborn scikit-learn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the red and white wine tables; both files are read with ';' as the
# field separator and '"' as the quote character.
df_red, df_white = (
    pd.read_csv(csv_path, sep=';', quotechar='"')
    for csv_path in ("winequality-red.csv", "winequality-white.csv")
)

def categorize_quality(value):
    """Translate a numeric quality score into a category label.

    Scores less than or equal to 4 become "Low", scores less than or equal
    to 6 become "Standard", and any other value becomes "High".
    """
    # Inclusive upper bounds, checked in ascending order.
    for upper_bound, label in ((4, "Low"), (6, "Standard")):
        if value <= upper_bound:
            return label
    return "High"

# Step 1: Derive the text category from the raw numeric score and keep it in
# its own column so the label survives the numeric re-encoding below.
df_red["quality_label"] = df_red["quality"].apply(categorize_quality)
df_white["quality_label"] = df_white["quality"].apply(categorize_quality)

# Step 2: Overwrite "quality" with the numeric code of each category.
# This mapping must run exactly once: "quality" holds 0/1/2 afterwards, so
# mapping it through the string-keyed label_map again (or re-applying
# categorize_quality to it) would turn the column into NaN / all zeros.
label_map = {"Low": 0, "Standard": 1, "High": 2}
df_red["quality"] = df_red["quality_label"].map(label_map)
df_white["quality"] = df_white["quality_label"].map(label_map)

# Tally how many red wines fall into each quality category.
label_counts = df_red['quality_label'].value_counts()

# Show the category shares as a pie chart.
plt.figure(figsize=(6, 6))
plt.pie(
    label_counts,
    labels=label_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=['#f94144', '#f3722c', '#90be6d'],
)
plt.title('Red Wine Quality Distribution')
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()



#SPLIT AND VISUALIZE DATA

In [None]:
# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
from collections import Counter

def split_data(df, test_size):
    """Split *df* into stratified train/test features and labels.

    The features are every column except 'quality' and 'quality_label';
    the target is the 'quality_label' column. The split is stratified on
    the target and uses a fixed random_state of 42.

    Returns whatever train_test_split returns for (features, target), which
    callers unpack as X_train, X_test, y_train, y_test.
    """
    features = df.drop(columns=['quality', 'quality_label'])
    target = df['quality_label']
    return train_test_split(
        features,
        target,
        test_size=test_size,
        stratify=target,
        random_state=42,
    )

# Test-set fractions to try, each paired by position with a display title
# (e.g. a 0.6 test fraction is the "Train 40% / Test 60%" split).
splits = [0.6, 0.4, 0.2, 0.1]
titles = ["Train 40% / Test 60%", "Train 60% / Test 40%", "Train 80% / Test 20%", "Train 90% / Test 10%"]

# For every split, draw the train and test label distributions side by side.
for i, test_size in enumerate(splits):
    X_train, X_test, y_train, y_test = split_data(df_red, test_size=test_size)

    # Per-category row counts in each partition.
    train_counts = y_train.value_counts()
    test_counts = y_test.value_counts()

    fig, axes = plt.subplots(1, 2, figsize=(10, 5))

    # Left panel: training labels; right panel: test labels.
    panels = zip(
        axes,
        (train_counts, test_counts),
        (f"Train Set - {titles[i]}", f"Test Set - {titles[i]}"),
    )
    for ax, counts, heading in panels:
        ax.pie(counts, labels=counts.index, autopct='%1.1f%%',
               startangle=140, colors=['#f94144', '#f3722c', '#90be6d'])
        ax.set_title(heading)
        ax.axis('equal')

    plt.tight_layout()
    plt.show()



#BUILD AND TRAIN DECISION TREE

In [None]:
%pip install pydotplus
%pip install graphviz

from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
import graphviz
import pydotplus


def fit_decision_tree(X_train, y_train):
    """Train and return a decision tree classifier on the given data.

    The tree uses the 'entropy' split criterion (information gain) and a
    fixed random_state of 42.
    """
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return DecisionTreeClassifier(criterion='entropy', random_state=42).fit(X_train, y_train)

from sklearn.tree import export_graphviz
import graphviz

def visualize_tree(model, feature_names, title):
    """Render a fitted decision tree with Graphviz, show it, and save a PNG.

    Args:
        model: Fitted decision tree classifier; its ``classes_`` attribute
            supplies the class names shown in the nodes.
        feature_names: Iterable of feature names (a list, a pandas Index,
            ...), or None to let scikit-learn use generic names.
        title: Identifier appended to "tree_" to form the output file name.
            It is used verbatim, so it should not contain characters that
            are invalid in file names.
    """
    # Hand scikit-learn plain lists: some releases validate these arguments
    # strictly and reject a pandas Index / NumPy array. Class labels are
    # also stringified so that numeric targets can be rendered.
    feature_list = None if feature_names is None else list(feature_names)
    class_list = [str(label) for label in model.classes_]

    # Export to DOT format (out_file=None returns the DOT source as a string)
    dot_data = export_graphviz(
        model,
        out_file=None,
        feature_names=feature_list,
        class_names=class_list,
        filled=True,
        rounded=True,
        special_characters=True
    )

    # Create Graphviz object
    graph = graphviz.Source(dot_data)

    # Render and display in the notebook.
    # NOTE(review): `display` is not imported in this file; it is presumably
    # supplied by the Jupyter/IPython environment.
    display(graph)

    # Save to file as PNG; cleanup=True asks graphviz to remove the
    # intermediate source file after rendering.
    graph.render(filename=f"tree_{title}", format='png', cleanup=True)

# Train one tree per train/test split and render each of them.
for i, test_size in enumerate(splits):
    X_train, X_test, y_train, y_test = split_data(df_red, test_size=test_size)
    dt_model = fit_decision_tree(X_train, y_train)
    print(f"Decision Tree for {titles[i]}:")
    # The output file is named after the split index rather than the display
    # title, which contains characters that are unsuitable for file names.
    visualize_tree(dt_model, X_train.columns, f"split_{i}")



#EVALUATE 

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# Fixed class order shared by the confusion matrix and the heatmap ticks.
# Without an explicit `labels=` argument, confusion_matrix orders the classes
# by sorted label value (High, Low, Standard for these strings), which would
# not match the Low/Standard/High tick labels drawn on the heatmap.
class_order = ['Low', 'Standard', 'High']

for i, test_size in enumerate(splits):
    # Split data according to current test_size
    X_train, X_test, y_train, y_test = split_data(df_red, test_size=test_size)

    # Train the decision tree model on the training data
    dt_model = fit_decision_tree(X_train, y_train)

    # Predict on test data
    y_pred = dt_model.predict(X_test)

    # Classification report
    report = classification_report(y_test, y_pred)
    print(report)

    # Generate the confusion matrix with rows/columns in class_order
    cm = confusion_matrix(y_test, y_pred, labels=class_order)

    # Visualize the confusion matrix using Seaborn; the tick labels reuse
    # class_order so every cell is labeled with the class it really counts.
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_order,
                yticklabels=class_order)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(f"Confusion Matrix - {titles[i]}")
    plt.show()


#DEPTH AND ACCURACY 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
import re

from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def split_data(df, test_size):
    """Return a stratified train/test split of *df*.

    Features are all columns other than 'quality' and 'quality_label'; the
    target is 'quality_label'. Stratification is done on the target and the
    random_state is fixed at 42. The result is the train_test_split output,
    unpacked by callers as X_train, X_test, y_train, y_test.
    """
    non_feature_columns = ['quality', 'quality_label']
    features = df.drop(columns=non_feature_columns)
    labels = df['quality_label']
    split_options = {'test_size': test_size, 'stratify': labels, 'random_state': 42}
    return train_test_split(features, labels, **split_options)

def fit_decision_tree(X_train, y_train, max_depth=None):
    """Fit and return an entropy-based decision tree classifier.

    Args:
        X_train: Training features.
        y_train: Training labels.
        max_depth: Forwarded to DecisionTreeClassifier as its max_depth;
            defaults to None.
    """
    tree_options = {
        'criterion': 'entropy',  # information gain
        'max_depth': max_depth,
        'random_state': 42,
    }
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return DecisionTreeClassifier(**tree_options).fit(X_train, y_train)

def make_safe_filename(title):
    """Return *title* with every space, slash and percent sign turned into '_'.

    All other characters are left unchanged.
    """
    unsafe_chars = re.compile(r'[ /%]')
    return unsafe_chars.sub('_', title)

def visualize_tree(model, feature_names, title):
    """Render a fitted decision tree with Graphviz, show it, and save a PNG.

    Args:
        model: Fitted decision tree classifier; its ``classes_`` attribute
            supplies the class names shown in the nodes.
        feature_names: Iterable of feature names (a list, a pandas Index,
            ...), or None to let scikit-learn use generic names.
        title: Human-readable title; it is passed through
            make_safe_filename and appended to "tree_" to form the output
            file name.
    """
    # Hand scikit-learn plain lists: some releases validate these arguments
    # strictly and reject a pandas Index / NumPy array. Class labels are
    # also stringified so that numeric targets can be rendered.
    feature_list = None if feature_names is None else list(feature_names)
    class_list = [str(label) for label in model.classes_]

    # Export the decision tree in DOT format (out_file=None returns a string)
    dot_data = export_graphviz(
        model,
        out_file=None,
        feature_names=feature_list,
        class_names=class_list,
        filled=True,
        rounded=True,
        special_characters=True
    )

    # Create a Graphviz source object and show it in the notebook.
    # NOTE(review): `display` is not imported in this file; it is presumably
    # supplied by the Jupyter/IPython environment.
    graph = graphviz.Source(dot_data)
    display(graph)

    # Save the tree as a PNG file with a safe filename; cleanup=True asks
    # graphviz to remove the intermediate source file after rendering.
    safe_title = make_safe_filename(title)
    graph.render(filename=f"tree_{safe_title}", format='png', cleanup=True)

# Hold out 20% of the red-wine rows for testing (80/20 train/test split).
X_train, X_test, y_train, y_test = split_data(df_red, test_size=0.2)

# Depth limits to compare; None is passed through as "no explicit limit".
max_depth_values = [None, 2, 3, 4, 5, 6, 7]
accuracy_scores = {}

# Fit, draw and score one tree per depth limit.
for depth in max_depth_values:
    # The f-string renders None as "None", so one template covers every depth.
    title_str = f"max_depth = {depth}"

    dt_model = fit_decision_tree(X_train, y_train, max_depth=depth)

    print(f"Decision Tree Visualization for {title_str}:")
    visualize_tree(dt_model, X_train.columns, title_str)

    # Test-set accuracy, keyed by the same title used for the drawing.
    y_pred = dt_model.predict(X_test)
    accuracy_scores[title_str] = accuracy_score(y_test, y_pred)

# Print the accuracy scores as a tab-separated table, one row per depth.
table_rows = [f"{title}\t\t{acc:.4f}" for title, acc in accuracy_scores.items()]
print("\n".join(["max_depth\t\tAccuracy", *table_rows]))

# Chart accuracy against max_depth; the x labels are the dictionary keys
# exactly as stored in accuracy_scores, in insertion order.
x_labels = list(accuracy_scores)
acc_values = [accuracy_scores[label] for label in x_labels]

plt.figure(figsize=(8, 6))
plt.plot(x_labels, acc_values, marker='o', linestyle='--', color='b')
plt.title("Accuracy Score vs. max_depth")
plt.xlabel("max_depth")
plt.ylabel("Accuracy Score")
plt.grid(True)
plt.show()
