# Data

In [19]:
# Imports
from sklearn.datasets import load_digits, fetch_openml
from sklearn.preprocessing import OrdinalEncoder
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Ignore every FutureWarning raised from here on (this filter applies to the
# whole kernel session, not just this cell).
# NOTE(review): presumably meant to keep the tutorial output uncluttered; it
# also hides deprecation notices from pandas/scikit-learn -- confirm intended.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Step 0 -- Data Exploration & Preprocessing

We will use three datasets throughout this tutorial:

1. **Adult** – Tabular classification
2. **Student Performance** – Tabular multi-target regression
3. **Digits** – Image classification

Let's explore each of them to understand their structure, features, and targets.

#### Adult: Tabular classification

Contains demographic information extracted from the 1994 US Census database. The task is to predict whether a person makes over 50K USD a year. 

Source: [Preprocessed (we use this here)](https://www.openml.org/search?type=data&sort=runs&id=179&status=active) | [Original](https://archive.ics.uci.edu/dataset/2/adult)

In [20]:
# Adult dataset: version 2 of "adult" from OpenML, returned as pandas objects
data_adult = fetch_openml("adult", version=2, as_frame=True)

# To keep the illustration small, only these feature columns are used
feature_subset = ['age', 'education-num', 'race', 'sex', 'hours-per-week']

# Features (X), restricted to the subset above, and the target (y)
X_adult = data_adult.data[feature_subset]
y_adult = data_adult.target

In [None]:
# Feature table: print its dimensions, then display the first rows
print(f"Shape: {X_adult.shape}")
X_adult.head()

In [None]:
# Target column: print its dimensions, then display the first entries
print(f"Shape: {y_adult.shape}")
y_adult.head()

In [23]:
# Preprocessing

# Identify categorical columns
categorical_cols = X_adult.select_dtypes(include="category").columns

# Apply ordinal encoding: every category of a column is mapped to a numeric code
encoder = OrdinalEncoder()

# Work on a copy so the assignment below does not write into a selection of
# the originally loaded frame
X_adult = X_adult.copy()

# Replace the categorical columns with their encoded versions.
# `df[cols] = values` swaps in new columns that take the dtype of the encoded
# array. The previous `df.loc[:, cols] = values` form tries to write the
# numeric codes *into* the existing categorical columns, which are not valid
# categories; recent pandas versions only tolerate that through a deprecated
# fallback that emits a FutureWarning and is announced to become an error.
X_adult[categorical_cols] = encoder.fit_transform(X_adult[categorical_cols])

<div style="border: 1px solid #ffcc00; background-color: #fff8e1; padding: 10px; border-radius: 5px; color: black;">
<b>❓ Question:</b> How did ordinal encoding change the data?
</div>

In [24]:
# Your code here (hint: inspect the encoded columns of X_adult, e.g. with .head() or .dtypes)
...

#### Student Performance: Tabular multi-target regression

The dataset contains student achievement data from two Portuguese secondary schools, covering demographics, social factors, and school-related attributes.
Prediction targets are the grades G1, G2, and G3: the first-period grade, the second-period grade, and the final grade, respectively.

Source: [UCI](https://archive.ics.uci.edu/dataset/320/student+performance)

In [25]:
# Student Performance dataset, fetched from the UCI repository by its id (320)
data_students = fetch_ucirepo(id=320)

# The fetched object exposes features and targets as separate tables
X_students, Y_students = data_students.data.features, data_students.data.targets

In [None]:
# Feature table: print its dimensions, then display the first rows
print(f"Shape: {X_students.shape}")
X_students.head()

In [None]:
# Target table: print its dimensions, then display the first rows
print(f"Shape: {Y_students.shape}")
Y_students.head()

In [None]:
# Check the grade range: the summary statistics include the min and max of
# each target column (assumes the grade columns are numeric -- TODO confirm)
Y_students.describe()

In [29]:
# Preprocessing

# Turn categorical features into indicator (dummy) columns
X_students = pd.get_dummies(data=X_students)

#### Digits: Image classification

Images of hand-written digits from different people, here in low resolution (8x8) and grayscale (0-16). Note that this is not MNIST (which has 28x28 images) but a smaller, similar dataset: the UCI "Optical Recognition of Handwritten Digits" data.

Source: [preprocessed (we use this here)](https://scikit-learn.org/stable/datasets/toy_dataset.html#digits-dataset) | [original](https://archive.ics.uci.edu/dataset/80/optical+recognition+of+handwritten+digits)

In [30]:
# Digits dataset, loaded with scikit-learn's `load_digits`
data_digits = load_digits()

# Features and target are exposed as attributes of the loaded object
X_digits, y_digits = data_digits.data, data_digits.target

In [None]:
# Dimensions of the feature array and the distinct labels found in the target
print(f"Shape: {X_digits.shape}")
print(f"Target classes: {np.unique(y_digits)}")

In [None]:
# Preview the first five images side by side, each titled with its label
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(10, 3))
for i, ax in enumerate(axes):
    ax.imshow(data_digits.images[i], cmap="gray")
    ax.set_title(f"Label: {y_digits[i]}")
    ax.set_axis_off()  # hide ticks and frame; only the image matters here
fig.suptitle("Digits Dataset – Sample Images")
plt.show()