# Machine Learning Algorithms: Decision Trees & SVM

## 1. Installing Required Packages

First, let's install and import the necessary Python libraries.

In [None]:
# Install required packages
!pip install pandas matplotlib seaborn scikit-learn numpy

In [None]:
# Import all necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree

print("All packages installed and imported successfully!")

## 2. Exploring the Iris Dataset

We'll use the famous Iris dataset - it contains measurements of different iris flowers and their species. This is a classic dataset for classification problems.

In [None]:
# Fetch the bundled Iris data and unpack the pieces used throughout the notebook
iris = datasets.load_iris()
X, y = iris.data, iris.target  # feature matrix and integer-encoded species
feature_names, target_names = iris.feature_names, iris.target_names

# Tabular view of the features plus a human-readable species column
species_labels = [target_names[label] for label in y]
df = pd.DataFrame(X, columns=feature_names)
df['species'] = species_labels

# Quick overview of what was loaded
for caption, value in (
    ("Dataset shape:", X.shape),
    ("Features:", feature_names),
    ("Target classes:", target_names),
):
    print(caption, value)

In [None]:
# Preview the top of the DataFrame. The bare `df.head()` expression is the
# last statement of the cell, so the notebook displays its result.
print("First 5 rows of our dataset:")
df.head()

In [None]:
# Summary statistics for the DataFrame columns. The bare `df.describe()`
# expression is the last statement of the cell, so the notebook displays it.
print("Statistical summary:")
df.describe()

In [None]:
# Scatter plots of feature pairs, one colour per species

NUM_OF_PLOTS = 3  # HINT: Change if needed
fig_width, row_height = 8, 4
plt.figure(figsize=(fig_width, NUM_OF_PLOTS * row_height))

# First row of the grid: sepal length vs sepal width
plt.subplot(NUM_OF_PLOTS, 1, 1)
for i, species in enumerate(target_names):
    species_rows = df["species"] == species  # boolean mask for this species
    plt.scatter(
        df.loc[species_rows, "sepal length (cm)"],
        df.loc[species_rows, "sepal width (cm)"],
        label=species,
        alpha=0.7
    )
plt.title('Sepal Length vs Sepal Width')
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.legend()

# TODO: Try to visualize the other relationships
# Uncomment and complete the lines below, and remember to update NUM_OF_PLOTS
# plt.subplot(NUM_OF_PLOTS, 1, 2)  # select the next row of the grid first
# for i, species in enumerate(target_names):
#     plt.scatter(
#         df.loc[df["species"] == species, ...],
#         df.loc[df["species"] == species, ...],
#         label=species,
#         alpha=0.7
#     )
# plt.xlabel(...)
# plt.ylabel(...)
# plt.legend()
# plt.title(...)

plt.tight_layout()
plt.show()

## Correlation matrix

A correlation matrix is a table that shows the correlation coefficients between multiple variables.
Each cell in the matrix shows how strongly two variables are related — values range from -1 to +1:

- +1 → perfect positive correlation (they increase together)

- -1 → perfect negative correlation (one increases, the other decreases)

- 0 → no linear relationship

In [None]:
# Correlation matrix: pairwise correlation between the feature columns.
# A dedicated figure is created so the heatmap never lands on a figure left
# over from an earlier cell.
plt.figure(figsize=(8, 6))
sns.heatmap(df[feature_names].corr(), annot=True, cmap='coolwarm')
plt.title('Feature Correlations')

# Finish the cell like the other plotting cells: fit the labels and render
# explicitly instead of leaving the Text object as the cell's output.
plt.tight_layout()
plt.show()

## 3. Splitting Data into Training and Test Sets

We split our data so we can:
- **Train** our model on one part
- **Test** its performance on unseen data

This helps us check if our model can generalize to new examples.

In [None]:
# Hold out 20% of the samples for testing; the remaining 80% is for training.
# stratify=y asks for a split that is stratified on the class labels, and
# random_state pins the shuffle so reruns give the same split.
split_parts = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_test, y_train, y_test = split_parts

n_train, n_features = X_train.shape
n_test = X_test.shape[0]
print(f"Training set size: {n_train} samples")
print(f"Test set size: {n_test} samples")
print(f"Number of features: {n_features}")

## 4. Training Our Models

### Decision Tree Classifier

Decision trees make predictions by asking a series of questions about the features. They're like a flowchart!

**Key hyperparameters:**
- `max_depth`: How deep the tree can grow (controls complexity)
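- `min_samples_leaf`: Minimum number of samples that must end up in each leaf (the setting tuned in the cell below)
- `min_samples_split`: Minimum samples needed to split a node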

Let's see how different settings affect our model.

In [None]:
# TODO: Try to experiment with hyperparameters
# You can check this page: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

# Starting values are scikit-learn's defaults so the cell runs as-is;
# change them and re-run to see how the tree and the accuracies react.
MAX_DEPTH = None  # TODO 1: Try to experiment with max_depth (None = no depth limit)
MIN_SAMPLES_LEAF = 1  # TODO 2: Try to experiment with min_samples_leaf
# TODO 3: Try to experiment with the other hyperparameters

# Train the model
dt = DecisionTreeClassifier(max_depth=MAX_DEPTH, min_samples_leaf=MIN_SAMPLES_LEAF, random_state=42)
dt.fit(X_train, y_train)

# Accuracy on the data the tree was fitted on vs. on the held-out test set;
# a large gap between the two is a sign of overfitting.
train_accuracy = accuracy_score(y_train, dt.predict(X_train))
test_accuracy = accuracy_score(y_test, dt.predict(X_test))

# Plot the tree
plt.figure(figsize=(15, 10))
# class_names is passed as a plain list: target_names is a NumPy array, and
# NOTE(review): some scikit-learn releases appear to accept only a list here.
plot_tree(dt, feature_names=feature_names, class_names=list(target_names), filled=True)
plt.title(f'Train Acc: {train_accuracy:.3f}, Test Acc: {test_accuracy:.3f}')

plt.tight_layout()
plt.show()

### Support Vector Machine (SVM)

SVMs find the best "decision boundary" that separates different classes with the largest possible margin.

**Key hyperparameters:**
- `C`: Controls the trade-off between having a wide margin and classifying training points correctly
- `kernel`: The type of function used to separate data (linear, polynomial, etc.)

In [None]:
# TODO: Try to experiment with hyperparameters
# You can check this page: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

# Starting values are scikit-learn's SVC defaults so the cell runs as-is;
# change them and re-run to see how the boundary and the accuracies react.
C_VALUE = 1.0  # TODO 1: Try to experiment with C_VALUE
KERNEL = 'rbf'  # TODO 2: Try to experiment with kernel ('linear', 'poly', 'rbf', 'sigmoid')
# TODO 3: Try to experiment with the other hyperparameters

# Train the model on all features
svm = SVC(C=C_VALUE, kernel=KERNEL, random_state=42)
svm.fit(X_train, y_train)

# Calculate accuracies (these describe the all-feature model above)
train_accuracy = accuracy_score(y_train, svm.predict(X_train))
test_accuracy = accuracy_score(y_test, svm.predict(X_test))

# A decision boundary can only be drawn in two dimensions, so a second model
# is fitted on the first two features purely for the picture. It uses the
# same C and kernel as the model above so the plot reflects the chosen
# hyperparameters.
svm_vis = SVC(C=C_VALUE, kernel=KERNEL, random_state=42)
svm_vis.fit(X_train[:, :2], y_train)

# Create a mesh covering the range of the two plotted features (plus padding)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))

# Predict a class for every mesh point, then restore the grid shape
Z = svm_vis.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

# Single full-size axes: this cell draws exactly one plot
plt.figure(figsize=(15, 10))

# Plot the predicted class regions
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.coolwarm)

# Plot training points, one scatter series per class
for class_value, class_name in enumerate(target_names):
    class_rows = y_train == class_value
    plt.scatter(X_train[class_rows, 0],
                X_train[class_rows, 1],
                label=class_name,
                alpha=0.8,
                edgecolors='black')

plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.title(f'Train Acc: {train_accuracy:.3f}, Test Acc: {test_accuracy:.3f}')
plt.legend()

plt.tight_layout()
plt.show()

## 5. Evaluating Our Models

Let's compare our best models and see how they perform on the test data.

We'll use:
- **Accuracy**: Percentage of correct predictions
- **Confusion Matrix**: Shows which classes are being confused
- **Classification Report**: Detailed performance metrics

In [None]:
# Train the final models with the hyperparameters chosen in the cells above
best_dt = DecisionTreeClassifier(max_depth=MAX_DEPTH, min_samples_leaf=MIN_SAMPLES_LEAF, random_state=42)
best_svm = SVC(C=C_VALUE, kernel=KERNEL, random_state=42)

best_dt.fit(X_train, y_train)
best_svm.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred_dt = best_dt.predict(X_test)
y_pred_svm = best_svm.predict(X_test)

# Accuracy: fraction of test samples predicted correctly
dt_accuracy = accuracy_score(y_test, y_pred_dt)
svm_accuracy = accuracy_score(y_test, y_pred_svm)

print("=== Model Performance Comparison ===")
print(f"Decision Tree Accuracy: {dt_accuracy:.3f}")
print(f"SVM Accuracy: {svm_accuracy:.3f}")

# Classification report: per-class precision, recall and F1-score, labelled
# with the species names instead of the integer class ids
print("\n=== Decision Tree - Classification Report ===")
print(classification_report(y_test, y_pred_dt, target_names=target_names))
print("=== SVM - Classification Report ===")
print(classification_report(y_test, y_pred_svm, target_names=target_names))

In [None]:
# Side-by-side confusion matrices for the two models
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Counts of actual vs. predicted class on the test set, one matrix per model
cm_dt = confusion_matrix(y_test, y_pred_dt)
cm_svm = confusion_matrix(y_test, y_pred_svm)

# Each panel: (axes to draw on, matrix to show, panel title)
panels = (
    (ax1, cm_dt, 'Decision Tree - Confusion Matrix'),
    (ax2, cm_svm, 'SVM - Confusion Matrix'),
)
for axis, matrix, heading in panels:
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=target_names, yticklabels=target_names, ax=axis)
    axis.set_title(heading)
    axis.set_xlabel('Predicted')
    axis.set_ylabel('Actual')

plt.tight_layout()
plt.show()