# C4.5 Decision Tree Visualization

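This notebook trains and visualizes a C4.5-style decision tree model for tech career recommendations. "C4.5-style" here means scikit-learn's `DecisionTreeClassifier` configured with the entropy (information gain) splitting criterion.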

In [None]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from joblib import load, dump
import os

# Set up matplotlib for inline display
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 10)
plt.rcParams['figure.dpi'] = 100

print("‚úÖ Libraries imported successfully!")

## 1. Load Training Data

In [None]:
# Load the training dataset
DATA_PATH = "data/training_data.csv"

# Fail fast when the CSV is missing: every later cell reads `df`, so merely
# printing a warning would let "Run All" continue and die with an unrelated
# NameError further down the notebook.
if not os.path.exists(DATA_PATH):
    print("‚ùå Training data not found! Run build_training_dataset.py first.")
    raise FileNotFoundError(DATA_PATH)

df = pd.read_csv(DATA_PATH)
print(f"‚úÖ Loaded {len(df)} training records")
print(f"\nDataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()[:10]}...")  # Show first 10 columns
print("\nFirst few rows:")
display(df.head())

## 2. Check Class Distribution

In [None]:
# Count how many training rows belong to each tech field and chart the result
if 'tech_field_id' not in df.columns:
    print("‚ùå 'tech_field_id' column not found!")
else:
    class_counts = df['tech_field_id'].value_counts()
    print("Tech Field Distribution:")
    print(class_counts)

    # Bar chart of the per-class counts, drawn on an explicit Axes
    fig, ax = plt.subplots(figsize=(10, 6))
    class_counts.plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')
    ax.set_title('Distribution of Tech Field Recommendations', fontsize=16, fontweight='bold')
    ax.set_xlabel('Tech Field ID', fontsize=12)
    ax.set_ylabel('Number of Students', fontsize=12)
    plt.xticks(rotation=0)  # keep class labels horizontal
    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()
    plt.show()

## 3. Train C4.5-Style Decision Tree

In [None]:
# Split the frame into the feature matrix and the label column
y = df['tech_field_id']
X = df.drop(columns='tech_field_id')

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Hyperparameters for the entropy-based ("C4.5-style") tree
tree_params = {
    'criterion': 'entropy',    # split on information gain
    'max_depth': 15,           # cap depth to limit overfitting
    'min_samples_split': 5,    # a node needs at least 5 samples to be split
    'min_samples_leaf': 2,     # every leaf keeps at least 2 samples
    'random_state': 42,        # reproducible tie-breaking
}
clf = DecisionTreeClassifier(**tree_params)

print("\nüå≥ Training C4.5-style Decision Tree...")
clf.fit(X, y)
print("‚úÖ Training complete!")

# Report the size of the fitted tree
fitted_tree = clf.tree_
print(f"\nüìä Tree Statistics:")
print(f"  - Number of nodes: {fitted_tree.node_count}")
print(f"  - Number of leaves: {fitted_tree.n_leaves}")
print(f"  - Max depth: {fitted_tree.max_depth}")
print(f"  - Number of features: {clf.n_features_in_}")
print(f"  - Number of classes: {len(clf.classes_)}")

## 4. Visualize the Decision Tree (Full View)

In [None]:
# Create a large visualization of the full tree
plt.figure(figsize=(30, 15))

plot_tree(
    clf,
    filled=True,                                  # Color nodes by majority class
    # Pass a plain list rather than the pandas Index: some scikit-learn
    # releases reject an Index in plot_tree's parameter validation, and this
    # matches how export_text is called elsewhere in this notebook.
    feature_names=list(X.columns),                # Show feature names
    class_names=[str(c) for c in clf.classes_],   # Show class names
    rounded=True,                                 # Rounded boxes
    fontsize=8,                                   # Font size
    proportion=True                               # Show proportions instead of counts
)

plt.title('C4.5-Style Decision Tree - Full View', fontsize=20, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

print("\nüí° Tip: Zoom in to see individual nodes clearly!")

## 5. Visualize Top Levels Only (Simplified View)

In [None]:
# Train a shallow tree for better visualization.
# NOTE: clf_shallow is a separately fitted model (max_depth=4 and no
# min_samples_leaf setting), not a truncated rendering of `clf`, so its
# splits are not guaranteed to match the top levels of the full tree.
clf_shallow = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,  # Only 4 levels deep
    min_samples_split=5,
    random_state=42
)

clf_shallow.fit(X, y)

# Visualize
plt.figure(figsize=(25, 12))

plot_tree(
    clf_shallow,
    filled=True,
    # Pass a plain list rather than the pandas Index: some scikit-learn
    # releases reject an Index in plot_tree's parameter validation.
    feature_names=list(X.columns),
    class_names=[str(c) for c in clf_shallow.classes_],
    rounded=True,
    fontsize=10,
    proportion=True
)

plt.title('C4.5-Style Decision Tree - Top 4 Levels (Simplified)', fontsize=20, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

print("\nüìù This simplified view shows only the top decision-making rules.")

## 6. Export Tree Rules as Text

In [None]:
# Render the fitted tree's rules as indented text, truncated at depth 5
feature_labels = list(X.columns)
tree_rules = export_text(clf, feature_names=feature_labels, max_depth=5)

divider = "=" * 80
print("üå≥ Decision Tree Rules (Top 5 Levels):")
print(divider)
print(tree_rules)
print(divider)

## 7. Feature Importance Analysis

In [None]:
# Pair every feature with its importance in the fitted tree, highest first
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': clf.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

top_20 = feature_importance.head(20)
print("\nüìä Top 20 Most Important Questions:")
print(top_20)

# Horizontal bar chart of the 20 highest-ranked features
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(top_20['feature'], top_20['importance'], color='coral', edgecolor='black')
ax.set_xlabel('Importance Score (Information Gain)', fontsize=12)
ax.set_ylabel('Question (Feature)', fontsize=12)
ax.set_title('Top 20 Most Important Questions in Decision Making', fontsize=16, fontweight='bold')
ax.invert_yaxis()  # Highest importance at top
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

## 8. Test Prediction with Sample Data

In [None]:
# One hard-coded row of student responses (70 values, one per feature column)
sample_features = [
    3, 2, 2, 3, 3, 4, 1, 0, 5, 2,
    3, 3, 5, 0, 5, 1, 3, 0, 5, 1,
    1, 0, 3, 2, 2, 0, 2, 0, 3, 5,
    0, 1, 5, 3, 3, 4, 1, 2, 4, 0,
    0, 4, 0, 0, 5, 1, 1, 1, 0, 0,
    3, 1, 0, 2, 1, 3, 3, 1, 2, 2,
    0, 2, 2, 3, 1, 1, 1, 3, 2, 3
]

# Wrap the row in a single-row frame that carries the training column names,
# then ask the model for the predicted class and the per-class probabilities
sample_df = pd.DataFrame([sample_features], columns=X.columns)
prediction = clf.predict(sample_df)[0]
probabilities = clf.predict_proba(sample_df)[0]

print("\nüéØ Sample Student Prediction:")
print(f"  Predicted Tech Field ID: {prediction}")
print(f"\n  Probability Distribution:")

for idx, class_id in enumerate(clf.classes_):
    prob = probabilities[idx]
    print(f"    Tech Field {class_id}: {prob*100:.2f}%")

# Bar chart of the per-class probabilities
fig, ax = plt.subplots(figsize=(10, 6))
field_labels = [f"Field {c}" for c in clf.classes_]
ax.bar(field_labels, probabilities, color='lightgreen', edgecolor='black')
ax.set_xlabel('Tech Field', fontsize=12)
ax.set_ylabel('Probability', fontsize=12)
ax.set_title('Prediction Probabilities for Sample Student', fontsize=16, fontweight='bold')
ax.set_ylim(0, 1)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

## 9. Save the Model

In [None]:
# Save the trained model
MODEL_PATH = "model.pkl"
dump(clf, MODEL_PATH)
print(f"\n‚úÖ Model saved to {MODEL_PATH}")

# Save tree visualization as PNG.
# Keep a handle to the figure so saving and closing act on it explicitly
# instead of on whatever pyplot considers the "current" figure.
fig = plt.figure(figsize=(30, 15))
plot_tree(
    clf,
    filled=True,
    # Pass a plain list rather than the pandas Index: some scikit-learn
    # releases reject an Index in plot_tree's parameter validation.
    feature_names=list(X.columns),
    class_names=[str(c) for c in clf.classes_],
    rounded=True,
    fontsize=8,
    proportion=True
)
plt.title('C4.5-Style Decision Tree', fontsize=20, fontweight='bold', pad=20)
fig.savefig('decision_tree.png', dpi=300, bbox_inches='tight')
plt.close(fig)  # release the large figure without displaying it inline
print(f"‚úÖ Tree visualization saved to decision_tree.png")