# Exercise 1 

In [1]:
import os
import sys
import numpy as np

from si.io.csv_file import read_csv

In [11]:
def solve_iris_indexing(iris_path=None):
    """Load the iris CSV and compute the Exercise 1 indexing answers.

    Parameters
    ----------
    iris_path : str or os.PathLike, optional
        Path of the iris CSV file handed to ``read_csv``. When omitted, the
        notebook's original location ``../datasets/iris/iris.csv`` (relative
        to the current working directory) is used, so existing no-argument
        calls behave as before.

    Returns
    -------
    dict
        ``dataset_shape``    -- value returned by ``dataset.shape()``
        ``X``                -- the dataset's feature matrix (``dataset.X``)
        ``features``         -- ``dataset.features``
        ``label``            -- ``dataset.label``
        ``penult_shape``     -- shape of the second-to-last column of ``X``
        ``means_last10``     -- per-column mean over the last 10 rows of ``X``
        ``mask_le6_count``   -- number of rows whose values are all <= 6
        ``not_setosa_count`` -- number of entries of ``dataset.y`` that differ
                                from ``'Iris-setosa'``
    """
    # Only fall back to the relative default when the caller gave no path, so
    # the function also works when the kernel was started in another folder.
    if iris_path is None:
        iris_path = os.path.join('..', 'datasets', 'iris', 'iris.csv')
    dataset = read_csv(filename=iris_path, sep=',', features=True, label=True)

    # 1.2) Penultimate feature: indexing a single column yields a 1-D array.
    x_penult = dataset.X[:, -2]
    penult_shape = np.shape(x_penult)

    # 1.3) Per-feature mean (axis=0) restricted to the last 10 samples.
    x_last10 = dataset.X[-10:, :]
    means_last10 = np.mean(x_last10, axis=0)

    # 1.4) A sample counts only if every one of its features is <= 6.
    mask_le6 = np.all(dataset.X <= 6, axis=1)
    count_le6 = int(np.sum(mask_le6))

    # 1.5) Samples whose label is anything other than Iris-setosa.
    mask_not_setosa = dataset.y != 'Iris-setosa'
    count_not_setosa = int(np.sum(mask_not_setosa))

    # Package results in a dict for the downstream display cells.
    return {
        'dataset_shape': dataset.shape(),
        'X': dataset.X,
        'features': dataset.features,
        'label': dataset.label,
        'penult_shape': penult_shape,
        'means_last10': means_last10,
        'mask_le6_count': count_le6,
        'not_setosa_count': count_not_setosa,
    }

In [12]:
# Run the Exercise 1 computations once; the cells that follow only read
# entries out of `results`.
results = solve_iris_indexing()

iris_shape = results['dataset_shape']
iris_X = results['X']

print('1.1) Load the iris.csv using the appropriate method:')
print('Shape of iris dataset:', iris_shape)
print(iris_X)

1.1) Load the iris.csv using the appropriate method:
Shape of iris dataset: (150, 4)
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.1 1.5 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5

In [13]:
# 1.2) Shape of the penultimate feature column, as computed by the solver.
penult_shape = results['penult_shape']
print('1.2) Select the penultimate independent variable. Dimension of the resulting array:', penult_shape)

1.2) Select the penultimate independent variable. Dimension of the resulting array: (150,)


In [14]:
# 1.3) Pair each feature name with its mean over the last 10 samples and
# print one bullet per feature.
print('1.3) Mean of the last 10 samples for each independent variable/feature:')
feature_means = zip(results['features'], results['means_last10'])
for feature_name, feature_mean in feature_means:
    # float() turns the NumPy scalar into a plain Python float for printing.
    mean_value = float(feature_mean)
    print(f'  - {feature_name}: {mean_value}')

1.3) Mean of the last 10 samples for each independent variable/feature:
  - sepal_length: 6.450000000000001
  - sepal_width: 3.0300000000000002
  - petal_length: 5.33
  - petal_width: 2.17


In [15]:
# 1.4) Count of samples in which every feature value is <= 6.
le6_count = results['mask_le6_count']
print('1.4) Number of samples with values <= 6 for all independent variables/features:', le6_count)

1.4) Number of samples with values <= 6 for all independent variables/features: 89


In [16]:
# 1.5) Count of samples whose label differs from "Iris-setosa".
not_setosa_count = results['not_setosa_count']
print('1.5) Number of samples with a class/label different from "Iris-setosa":', not_setosa_count)

1.5) Number of samples with a class/label different from "Iris-setosa": 100


# Exercise 2

In [4]:
from si.data.dataset import Dataset

In [5]:
# 2.1) dropna

# Toy data: rows 0 and 2 each contain one NaN, row 1 is complete.
drop_X = np.array(
    [
        [1.0, 2.0, np.nan],
        [3.0, 4.0, 5.0],
        [np.nan, 6.0, 7.0],
    ]
)
drop_y = np.array([10, 20, 30])
ds_drop = Dataset(drop_X, drop_y, label="Y")

print("-- 2.1) dropna --")

# State of the dataset as constructed.
x_shape_before = ds_drop.X.shape
y_shape_before = ds_drop.y.shape
print("Before:\n X shape:", x_shape_before, " y shape:", y_shape_before)
print("X:\n", ds_drop.X)
print("y:", ds_drop.y)

# dropna() is called for its effect on ds_drop; the prints below read the
# same object again to show what is left.
ds_drop.dropna()

# State of the dataset after the call.
x_shape_after = ds_drop.X.shape
y_shape_after = ds_drop.y.shape
print("After dropna:\n X shape:", x_shape_after, " y shape:", y_shape_after)
print("X:\n", ds_drop.X)
print("y:", ds_drop.y)

-- 2.1) dropna --
Before:
 X shape: (3, 3)  y shape: (3,)
X:
 [[ 1.  2. nan]
 [ 3.  4.  5.]
 [nan  6.  7.]]
y: [10 20 30]
After dropna:
 X shape: (1, 3)  y shape: (1,)
X:
 [[3. 4. 5.]]
y: [20]


In [6]:
# 2.2) fillna (median)

# Toy data with one NaN in the first column and one in the last column;
# no target vector is passed to Dataset here.
fill_X = np.array(
    [
        [30.0, 2.0, np.nan],
        [3.0, 4.0, 5.0],
        [np.nan, 6.0, 7.0],
    ]
)
ds_fill = Dataset(fill_X)

print("-- 2.2) fillna --")

# Data before imputation.
print("Before:\n", ds_fill.X)

# Reference values computed directly with NumPy (NaNs ignored, one median
# per column) so the result of fillna can be compared against them.
medians = np.nanmedian(ds_fill.X, axis=0)
print("Median per feature:", medians)

# Ask the dataset to impute using the "median" strategy.
ds_fill.fillna(value="median")

# Data after imputation.
print("After fillna(median):\n", ds_fill.X)

-- 2.2) fillna --
Before:
 [[30.  2. nan]
 [ 3.  4.  5.]
 [nan  6.  7.]]
Median per feature: [16.5  4.   6. ]
After fillna(median):
 [[30.   2.   6. ]
 [ 3.   4.   5. ]
 [16.5  6.   7. ]]


In [7]:
# 2.3) remove_by_index

# Toy data without NaNs, plus a matching target vector.
rm_X = np.array(
    [
        [30.0, 2.0, 0.0],
        [3.0, 4.0, 5.0],
        [1.0, 6.0, 7.0],
    ]
)
rm_y = np.array([10, 20, 30])
ds_rm = Dataset(rm_X, rm_y, label="Y")

print("-- 2.3) remove_by_index --")

# Full contents before any row is removed.
print("Before:\n X:\n", ds_rm.X, "\n y:", ds_rm.y)

# Remove the middle sample (index 1); the prints below read X and y from the
# same object to show the result.
row_to_remove = 1
ds_rm.remove_by_index(row_to_remove)

# Contents after the removal.
print("After remove_by_index(1):\n X:\n", ds_rm.X, "\n y:", ds_rm.y)

-- 2.3) remove_by_index --
Before:
 X:
 [[30.  2.  0.]
 [ 3.  4.  5.]
 [ 1.  6.  7.]] 
 y: [10 20 30]
After remove_by_index(1):
 X:
 [[30.  2.  0.]
 [ 1.  6.  7.]] 
 y: [10 30]
