# Feature Importance with Ensembles

In [17]:
# Imports
from sklearn.datasets import load_digits, fetch_openml
from sklearn.preprocessing import OrdinalEncoder
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Step 0 -- Data Exploration & Preprocessing

We will use three datasets throughout this tutorial:

1. **Adult** – Tabular classification
2. **Student Performance** – Tabular multi-target regression
3. **Digits** – Image classification

Let's explore each of them to understand their structure, features, and targets.

#### Adult: Tabular classification

In [13]:
# Adult dataset (OpenML, version 2), loaded as pandas objects
data_adult = fetch_openml("adult", version=2, as_frame=True)

# For illustration purposes we keep only a handful of the available features
feature_subset = ['age', 'education-num', 'race', 'sex', 'hours-per-week']

# Features (X) restricted to the chosen columns, and the target (y)
X_adult = data_adult.data[feature_subset]
y_adult = data_adult.target

In [None]:
# Adult features: dimensions first, then the leading rows
print("Shape:", X_adult.shape)
X_adult.head(n=5)

In [None]:
# Adult target: dimensions first, then the leading entries
print("Shape:", y_adult.shape)
y_adult.head(n=5)

In [None]:
# Preprocessing

# Identify categorical columns
categorical_cols = X_adult.select_dtypes(include="category").columns

# Apply ordinal encoding: every category of a column is replaced by a numeric code.
# The result is wrapped in a DataFrame that carries the original column names and
# row index so it can be aligned with X_adult when it is assigned back.
encoder = OrdinalEncoder()
encoded_cols = pd.DataFrame(
    encoder.fit_transform(X_adult[categorical_cols]),
    columns=categorical_cols,
    index=X_adult.index,
)

# Work on a copy so the frame produced by the earlier column selection is not modified.
X_adult = X_adult.copy()

# Replace the categorical columns outright instead of writing into them with
# `.loc[:, cols] = ...`. An in-place write cannot store the numeric codes in a
# `category` column, so pandas has to fall back to object dtype through a
# deprecated implicit upcast (whose FutureWarning is silenced in this notebook).
# Direct column assignment gives the encoded columns a proper numeric dtype.
X_adult[categorical_cols] = encoded_cols

<div style="border: 1px solid rgb(0, 0, 0); background-color:rgb(224, 179, 83); padding: 10px; border-radius: 5px; color: white;">
<b>❓ Question:</b> How did ordinal encoding change the data?
</div>

In [10]:
# your code (exercise placeholder: replace the `...` below with your own exploration)
...

#### Student Performance: Tabular multi-target regression

In [19]:
# Student performance dataset (UCI ML repository, id 320)
data_students = fetch_ucirepo(id=320)

# Features and targets are exposed as separate tables on the fetched object
X_students, Y_students = data_students.data.features, data_students.data.targets

In [None]:
# Student features: dimensions first, then the leading rows
print("Shape:", X_students.shape)
X_students.head(n=5)

In [None]:
# Student targets: dimensions first, then the leading rows
print("Shape:", Y_students.shape)
Y_students.head(n=5)

In [None]:
# Check the grade range: per-column summary statistics of the targets
# (assumes the target columns are numeric grades — TODO confirm)
Y_students.describe()

In [14]:
# Preprocessing

# One-hot encode the categorical features; get_dummies leaves numeric columns as they are
X_students = pd.get_dummies(data=X_students)

#### Digits: Image classification

In [9]:
# Digits dataset bundled with scikit-learn
data_digits = load_digits()

# Features and target in one step
X_digits, y_digits = data_digits.data, data_digits.target

In [None]:
# Dimensions of the feature matrix and the distinct labels found in the target
digit_classes = np.unique(y_digits)
print("Shape:", X_digits.shape)
print("Target classes:", digit_classes)

In [None]:
# Preview the first few digits next to each other, one axis per image
fig, axes = plt.subplots(1, 5, figsize=(10, 3))
# zip stops at the shortest input, so only as many images as there are axes are drawn
for ax, image, label in zip(axes, data_digits.images, y_digits):
    ax.imshow(image, cmap="gray")
    ax.set_title(f"Label: {label}")
    ax.axis("off")
fig.suptitle("Digits Dataset – Sample Images")
plt.show()