In [4]:
# Exercise 1: mean imputation with SimpleImputer
import numpy as np
from sklearn.impute import SimpleImputer

# Small training matrix; the middle row is missing its second feature.
train_data = [
    [7, 6, 5],
    [4, np.nan, 5],
    [1, 20, 8],
]

# Learn the per-column fill values from the training matrix
# (fit() returns the imputer itself, so construction and fitting chain).
imputer = SimpleImputer(strategy="mean").fit(train_data)

# statistics_ exposes the value learned for each column.
print("Imputer statistics_:", imputer.statistics_)

# Fill the gaps in the training matrix with the learned values.
train_filled = imputer.transform(train_data)
print("\nTransformed train_data:\n", train_filled)

# Unseen rows that contain their own missing entries.
test_data = [
    [np.nan, 1, 2],
    [7, np.nan, 9],
    [np.nan, 2, 4],
]

# Reuse the already-fitted imputer: the fill values come from train_data,
# never from test_data.
test_filled = imputer.transform(test_data)
print("\nTransformed test_data:\n", test_filled)


Imputer statistics_: [ 4. 13.  6.]

Transformed train_data:
 [[ 7.  6.  5.]
 [ 4. 13.  5.]
 [ 1. 20.  8.]]

Transformed test_data:
 [[ 4.  1.  2.]
 [ 7. 13.  9.]
 [ 4.  2.  4.]]


In [13]:
# Exercise 2: Scaler
# Standardize features with StandardScaler: the scaler is fitted on the
# training matrix only and then reused, unchanged, on the test matrix.
import numpy as np
from sklearn.preprocessing import StandardScaler


# Training data
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])

# Create scaler
scaler = StandardScaler()

# Fit on training data and transform it in one step
X_train_scaled = scaler.fit_transform(X_train)
print("\nX_train:\n", repr(X_train_scaled))

# Test data
X_test = np.array([[ 2., -1.,  1.],
                   [ 3.,  3., -1.],
                   [ 1.,  1.,  1.]])

# Transform test data with the statistics learned from X_train
# (transform only -- the scaler is not re-fitted on X_test)
X_test_scaled = scaler.transform(X_test)
print("\nScaled X_test:\n", repr(X_test_scaled))


X_train:
 array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

Scaled X_test:
 array([[ 1.22474487, -1.22474487,  0.53452248],
       [ 2.44948974,  3.67423461, -1.06904497],
       [ 0.        ,  1.22474487,  0.53452248]])


In [17]:
# Exercise 3: one-hot encoding of a single categorical feature
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Training samples: one column holding a language name.
X_train = [['Python'], ['Java'], ['Java'], ['C++']]

# Categories unseen during fit are encoded as an all-zero row instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore')

# fit_transform returns a sparse matrix here; densify it for display.
train_sparse = encoder.fit_transform(X_train)
X_train_encoded = train_sparse.toarray()

# categories_ is a list with one array per input feature; there is a single
# feature here, so take its array of learned category labels.
categories = encoder.categories_[0]

# Label each indicator column with the category it stands for.
df_train = pd.DataFrame(X_train_encoded.astype(int), columns=categories)

print("Train one-hot encoded:\n", df_train)

# Test samples; 'C' was never seen while fitting.
X_test = [['Python'], ['Java'], ['C'], ['C++']]

# Encode the test samples with the fitted encoder (no re-fit).
test_sparse = encoder.transform(X_test)
X_test_encoded = test_sparse.toarray()

df_test = pd.DataFrame(X_test_encoded.astype(int), columns=categories)

print("\nTest one-hot encoded:\n", df_test)


Train one-hot encoded:
    C++  Java  Python
0    0     0       1
1    0     1       0
2    0     1       0
3    1     0       0

Test one-hot encoded:
    C++  Java  Python
0    0     0       1
1    0     1       0
2    0     0       0
3    1     0       0


In [21]:
# Exercise 4: ordinal encoding with an explicit category order
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

# Training samples: one column holding a rating label.
X_train = [['good'], ['bad'], ['neutral']]

# Position in this list becomes the encoded value: bad -> 0, neutral -> 1, good -> 2.
rating_order = ['bad', 'neutral', 'good']
encoder = OrdinalEncoder(categories=[rating_order])

# Fit the encoder and encode the training samples in one call.
X_train_encoded = encoder.fit_transform(X_train)

print("Transformed X_train:\n", repr(X_train_encoded))

# Show the category order the fitted encoder holds.
print("\nCategories:", encoder.categories_)

# Test samples, encoded with the already-fitted encoder.
X_test = [['good'], ['good'], ['bad']]

X_test_encoded = encoder.transform(X_test)
print("\nTransformed X_test:\n", repr(X_test_encoded))

Transformed X_train:
 array([[2.],
       [0.],
       [1.]])

Categories: [array(['bad', 'neutral', 'good'], dtype=object)]

Transformed X_test:
 array([[2.],
       [2.],
       [0.]])


In [54]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer

# 1. Read the header-less CSV, then attach the column names
column_names = [
    "age", "menopause", "tumor-size", "inv-nodes", "node-caps",
    "deg-malig", "breast", "breast-quad", "irradiat", "Class",
]
df = pd.read_csv("breast-cancer.csv", header=None)
df.columns = column_names

# Discard any row that contains a NaN
df = df.dropna()

# 2. Separate the target column from the features, then split
X = df.drop(columns="Class")
y = df["Class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=43
)

# 3. Question 1: number of distinct values per training feature
print(X_train.nunique())

# 4. Question 2: one-hot encode the nominal features
ohe_cols = ["node-caps", "breast", "breast-quad", "irradiat"]
# Dense output; categories unseen at fit time become all-zero rows.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
ohe.fit(X_train[ohe_cols])

ohe_test = ohe.transform(X_test[ohe_cols])
print("\n# Question 2 - First 10 rows of OHE transform on test set")
print(repr(ohe_test[:10]))

print("\n# OHE feature names")
print(ohe.get_feature_names_out(ohe_cols))

# 5. Question 3: ordinal-encode the ordered features
# Each column maps to its categories listed from lowest to highest; the dict's
# insertion order fixes the column order handed to the encoder.
ordinal_spec = {
    "menopause": ["lt40", "premeno", "ge40"],
    "age": [
        "10-19", "20-29", "30-39", "40-49", "50-59",
        "60-69", "70-79", "80-89", "90-99",
    ],
    "tumor-size": [
        "0-4", "5-9", "10-14", "15-19", "20-24", "25-29",
        "30-34", "35-39", "40-44", "45-49", "50-54", "55-59",
    ],
    "inv-nodes": [
        "0-2", "3-5", "6-8", "9-11", "12-14", "15-17", "18-20",
        "21-23", "24-26", "27-29", "30-32", "33-35", "36-39",
    ],
    "deg-malig": [1, 2, 3],
}
ordinal_cols = list(ordinal_spec)
ordinal_categories = list(ordinal_spec.values())

oe = OrdinalEncoder(categories=ordinal_categories)
oe.fit(X_train[ordinal_cols])

oe_test = oe.transform(X_test[ordinal_cols])
print("\n# First 10 rows of OrdinalEncoder transform on test set")
print(repr(oe_test[:10]))

# 6. Question 4: apply both encoders through a single ColumnTransformer
# Ordinal columns come first in the output, then the one-hot columns;
# columns listed in neither group are dropped.
preprocessor = make_column_transformer(
    (oe, ordinal_cols),
    (ohe, ohe_cols),
    remainder="drop",
)

preprocessor.fit(X_train)

combined_test = preprocessor.transform(X_test)
print("\n# First 2 rows of ColumnTransformer applied to test set")
print(repr(combined_test[:2]))


age             6
menopause       3
tumor-size     11
inv-nodes       6
node-caps       2
deg-malig       3
breast          2
breast-quad     5
irradiat        2
dtype: int64

# Question 2 - First 10 rows of OHE transform on test set
array([[1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.],
       [0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1.],
       [1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0.],
       [1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0.]])

# OHE feature names
['node-caps_no' 'node-caps_yes' 'breast_left' 'breast_right'
 'breast-quad_central' 'breast-quad_left_low' 'breast-quad_left_up'
 'breast-quad_right_low' 'breast-quad_right_up' 'irradiat_no'
 'irradiat_yes']

# First 10 rows of OrdinalEncoder transform on test set
[remaining output truncated in this export]

In [61]:
#Exercise 6: Pipeline
# Chain median imputation, standardization and logistic regression in one
# Pipeline so every preprocessing step is fitted on the training split only.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


# Load iris dataset
iris = load_iris()
X, y = iris['data'], iris['target']

# Add missing values (overwrites the selected cells of X in place)
X[[1, 20, 50, 100, 135], 0] = np.nan
X[[2, 5, 88, 135], 1] = np.nan
X[[4, 15], 2] = np.nan
X[[40, 135], 3] = np.nan

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=43
)

# Define pipeline: impute -> scale -> classify
pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(max_iter=1000, random_state=43))
])

# Fit pipeline
pipeline.fit(X_train, y_train)

# Evaluate on test set
score = pipeline.score(X_test, y_test)

# Use the rounding percent format rather than int(score * 100): int() truncates,
# so float error (e.g. 0.58 * 100 == 57.99999999999999) would under-report by a point.
print(f"Model score on test set: {score:.0%}")


Model score on test set: 98%
