From bec1a4a9e4d982336af742c5a0c1549530dc2d00 Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 10 Sep 2025 11:48:58 +0200
Subject: [PATCH 1/5] commented the n_jobs =2

---
 python_scripts/ensemble_hist_gradient_boosting.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python_scripts/ensemble_hist_gradient_boosting.py b/python_scripts/ensemble_hist_gradient_boosting.py
index 7a40a569d..7883366bd 100644
--- a/python_scripts/ensemble_hist_gradient_boosting.py
+++ b/python_scripts/ensemble_hist_gradient_boosting.py
@@ -58,7 +58,7 @@
     data,
     target,
     scoring="neg_mean_absolute_error",
-    n_jobs=2,
+    # n_jobs=2, # Uncomment this line if you run locally
 )
 
 # %%
@@ -122,7 +122,7 @@
     data,
     target,
     scoring="neg_mean_absolute_error",
-    n_jobs=2,
+    # n_jobs=2, # Uncomment this line if you run locally
 )
 
 # %%
@@ -161,7 +161,7 @@
     data,
     target,
     scoring="neg_mean_absolute_error",
-    n_jobs=2,
+    # n_jobs=2, # Uncomment this line if you run locally
 )
 
 # %%

From 01ca3a826508b8faf8a7495cd997174458f97392 Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 17 Sep 2025 10:17:17 +0200
Subject: [PATCH 2/5] added changes in other notebooks

---
 .../clustering/clustering_quiz_m4_01.md       | 89 +++++++++++++++++++
 .../clustering/clustering_quiz_m4_02.md       | 45 ++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 jupyter-book/clustering/clustering_quiz_m4_01.md
 create mode 100644 jupyter-book/clustering/clustering_quiz_m4_02.md

diff --git a/jupyter-book/clustering/clustering_quiz_m4_01.md b/jupyter-book/clustering/clustering_quiz_m4_01.md
new file mode 100644
index 000000000..71411e377
--- /dev/null
+++ b/jupyter-book/clustering/clustering_quiz_m4_01.md
@@ -0,0 +1,89 @@
+# ✅ Quiz M4.01
+
+```{admonition} Question
+Imagine you work for a music streaming platform that hosts a vast library of
+songs, playlists, and podcasts. You have access to detailed listening data from
+millions of users. For each user, you know their most-listened genres, the
+devices they use, their average session length, and how often they explore new
+content.
+
+You want to segment users based on their listening patterns to improve
+personalized recommendations, without relying on rigid, predefined labels like
+"pop fan" or "casual listener" which may fail to capture the complexity of
+their behavior.
+
+What kind of problem are you dealing with?
+
+- a) a supervised task
+- b) an unsupervised task
+- c) a classification task
+- d) a clustering task
+
+_Select all answers that apply_
+```
+
++++
+
+```{admonition} Question
+The plots below show the cluster labels as found by k-means with 3 clusters, only
+differing in the scaling step. Based on this, which conclusions can be obtained?
+
+![K-means on original features](../../figures/evaluation_quiz_kmeans_not_scaled.svg)
+![K-means on scaled features](../../figures/evaluation_quiz_kmeans_scaled.svg)
+
+- a) without scaling, cluster assignment is dominated by the feature in the vertical axis
+- b) without scaling, cluster assignment is dominated by the feature in the horizontal axis
+- c) without scaling, both features contribute equally to cluster assignment
+
+_Select a single answer_
+```
+
++++
+
+```{admonition} Question
+Which of the following statements correctly describe factors that affect the
+stability of k-means clustering across different resampling iterations of the data?
+
+- a) K-means can produce different results on resampled datasets due to
+  sensitivity to initialization.
+- b) If data is unevenly distributed, the stability improves when increasing the
+  parameter `n_init` in the "k-means++" initialization.
+- c) Stability under resampling is guaranteed after feature scaling.
+- d) Increasing the number of clusters always reduces the variability of
+  results across resamples.
+
+_Select all answers that apply_
+```
+
++++
+
+```{admonition} Question
+Which of the following statements correctly describe how WCSS (within-cluster
+sum of squares, or inertia) behaves in k-means clustering?
+
+- a) For a fixed number of clusters, WCSS is lower when clusters are compact.
+- b) For a fixed number of clusters, WCSS is lower for wider clusters.
+- c) For a fixed number of clusters, lower WCSS implies lower computational cost
+  during training.
+- d) Assuming `n_init` is large enough to ensure convergence, WCSS always
+  decreases as the number of clusters increases.
+
+_Select all answers that apply_
+```
+
++++
+
+```{admonition} Question
+Which of the following statements correctly describe differences between
+supervised and unsupervised clustering metrics?
+
+- a) Supervised clustering metrics such as ARI and AMI require access to ground
+  truth labels to evaluate clustering performance.
+- b) WCSS and the silhouette score evaluate internal cluster structure without
+  needing reference labels.
+- c) V-measure is zero when labels are assigned completely at random.
+- d) Supervised clustering metrics are not useful if the number of clusters does
+  not match the number of predefined classes.
+
+_Select all answers that apply_
+```
diff --git a/jupyter-book/clustering/clustering_quiz_m4_02.md b/jupyter-book/clustering/clustering_quiz_m4_02.md
new file mode 100644
index 000000000..20edb0115
--- /dev/null
+++ b/jupyter-book/clustering/clustering_quiz_m4_02.md
@@ -0,0 +1,45 @@
+# ✅ Quiz M4.02
+
+```{admonition} Question
+If we increase `min_cluster_size` in HDBSCAN, what happens to the number of
+points labeled as noise?
+
+- a) It decreases.
+- b) It increases.
+- c) It stays the same.
+- d) HDBSCAN fails to converge.
+
+_Select a single answer_
+
+```
+
++++
+
+```{admonition} Question
+What happens to k-means centroids in the presence of outliers?
+
+- a) They move towards the outliers assigned to their cluster.
+- b) They are not sensitive to outliers.
+- c) If a centroid is initialized on an outlier, it may remain isolated in
+  subsequent iterations.
+
+_Select all answers that apply_
+
+```
+
++++
+
+```{admonition} Question
+A `KMeans` instance with `n_clusters=10` is used to transform the latitude and
+longitude in a supervised learning pipeline. Provided the original dataset consists of
+`n_features`, including those two, how many features are passed to
+the final estimator of the pipeline?
+
+- a) `n_features` + 10
+- b) `n_features` + 8
+- c) `n_features` - 2
+- d) `n_features`
+
+_Select a single answer_
+
+```

From 9d10869183c158f733355280d6ac29ee3ebd77f6 Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 17 Sep 2025 10:34:59 +0200
Subject: [PATCH 3/5] Revert "added changes in other notebooks"

This reverts commit 01ca3a826508b8faf8a7495cd997174458f97392.
---
 .../clustering/clustering_quiz_m4_01.md       | 89 -------------------
 .../clustering/clustering_quiz_m4_02.md       | 45 ----------
 2 files changed, 134 deletions(-)
 delete mode 100644 jupyter-book/clustering/clustering_quiz_m4_01.md
 delete mode 100644 jupyter-book/clustering/clustering_quiz_m4_02.md

diff --git a/jupyter-book/clustering/clustering_quiz_m4_01.md b/jupyter-book/clustering/clustering_quiz_m4_01.md
deleted file mode 100644
index 71411e377..000000000
--- a/jupyter-book/clustering/clustering_quiz_m4_01.md
+++ /dev/null
@@ -1,89 +0,0 @@
-# ✅ Quiz M4.01
-
-```{admonition} Question
-Imagine you work for a music streaming platform that hosts a vast library of
-songs, playlists, and podcasts. You have access to detailed listening data from
-millions of users. For each user, you know their most-listened genres, the
-devices they use, their average session length, and how often they explore new
-content.
-
-You want to segment users based on their listening patterns to improve
-personalized recommendations, without relying on rigid, predefined labels like
-"pop fan" or "casual listener" which may fail to capture the complexity of
-their behavior.
-
-What kind of problem are you dealing with?
-
-- a) a supervised task
-- b) an unsupervised task
-- c) a classification task
-- d) a clustering task
-
-_Select all answers that apply_
-```
-
-+++
-
-```{admonition} Question
-The plots below show the cluster labels as found by k-means with 3 clusters, only
-differing in the scaling step. Based on this, which conclusions can be obtained?
-
-![K-means on original features](../../figures/evaluation_quiz_kmeans_not_scaled.svg)
-![K-means on scaled features](../../figures/evaluation_quiz_kmeans_scaled.svg)
-
-- a) without scaling, cluster assignment is dominated by the feature in the vertical axis
-- b) without scaling, cluster assignment is dominated by the feature in the horizontal axis
-- c) without scaling, both features contribute equally to cluster assignment
-
-_Select a single answer_
-```
-
-+++
-
-```{admonition} Question
-Which of the following statements correctly describe factors that affect the
-stability of k-means clustering across different resampling iterations of the data?
-
-- a) K-means can produce different results on resampled datasets due to
-  sensitivity to initialization.
-- b) If data is unevenly distributed, the stability improves when increasing the
-  parameter `n_init` in the "k-means++" initialization.
-- c) Stability under resampling is guaranteed after feature scaling.
-- d) Increasing the number of clusters always reduces the variability of
-  results across resamples.
-
-_Select all answers that apply_
-```
-
-+++
-
-```{admonition} Question
-Which of the following statements correctly describe how WCSS (within-cluster
-sum of squares, or inertia) behaves in k-means clustering?
-
-- a) For a fixed number of clusters, WCSS is lower when clusters are compact.
-- b) For a fixed number of clusters, WCSS is lower for wider clusters.
-- c) For a fixed number of clusters, lower WCSS implies lower computational cost
-  during training.
-- d) Assuming `n_init` is large enough to ensure convergence, WCSS always
-  decreases as the number of clusters increases.
-
-_Select all answers that apply_
-```
-
-+++
-
-```{admonition} Question
-Which of the following statements correctly describe differences between
-supervised and unsupervised clustering metrics?
-
-- a) Supervised clustering metrics such as ARI and AMI require access to ground
-  truth labels to evaluate clustering performance.
-- b) WCSS and the silhouette score evaluate internal cluster structure without
-  needing reference labels.
-- c) V-measure is zero when labels are assigned completely at random.
-- d) Supervised clustering metrics are not useful if the number of clusters does
-  not match the number of predefined classes.
-
-_Select all answers that apply_
-```
diff --git a/jupyter-book/clustering/clustering_quiz_m4_02.md b/jupyter-book/clustering/clustering_quiz_m4_02.md
deleted file mode 100644
index 20edb0115..000000000
--- a/jupyter-book/clustering/clustering_quiz_m4_02.md
+++ /dev/null
@@ -1,45 +0,0 @@
-# ✅ Quiz M4.02
-
-```{admonition} Question
-If we increase `min_cluster_size` in HDBSCAN, what happens to the number of
-points labeled as noise?
-
-- a) It decreases.
-- b) It increases.
-- c) It stays the same.
-- d) HDBSCAN fails to converge.
-
-_Select a single answer_
-
-```
-
-+++
-
-```{admonition} Question
-What happens to k-means centroids in the presence of outliers?
-
-- a) They move towards the outliers assigned to their cluster.
-- b) They are not sensitive to outliers.
-- c) If a centroid is initialized on an outlier, it may remain isolated in
-  subsequent iterations.
-
-_Select all answers that apply_
-
-```
-
-+++
-
-```{admonition} Question
-A `KMeans` instance with `n_clusters=10` is used to transform the latitude and
-longitude in a supervised learning pipeline. Provided the original dataset consists of
-`n_features`, including those two, how many features are passed to
-the final estimator of the pipeline?
-
-- a) `n_features` + 10
-- b) `n_features` + 8
-- c) `n_features` - 2
-- d) `n_features`
-
-_Select a single answer_
-
-```

From 17c6f50ebebc817de5ac31063c9ca4c50b4986be Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 17 Sep 2025 10:36:46 +0200
Subject: [PATCH 4/5] comment out n_jobs=2 in ensemble_hyperparameters and ensemble_sol_04

---
 python_scripts/ensemble_hyperparameters.py | 4 ++--
 python_scripts/ensemble_sol_04.py          | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py
index 460854001..bd9a14a58 100644
--- a/python_scripts/ensemble_hyperparameters.py
+++ b/python_scripts/ensemble_hyperparameters.py
@@ -96,7 +96,7 @@
     scoring="neg_mean_absolute_error",
     n_iter=10,
     random_state=0,
-    n_jobs=2,
+    # n_jobs=2, # Uncomment this line if you run locally
 )
 search_cv.fit(data_train, target_train)
 
@@ -184,7 +184,7 @@
     scoring="neg_mean_absolute_error",
     n_iter=20,
     random_state=0,
-    n_jobs=2,
+    # n_jobs=2,  # Uncomment this line if you run locally
 )
 search_cv.fit(data_train, target_train)
 
diff --git a/python_scripts/ensemble_sol_04.py b/python_scripts/ensemble_sol_04.py
index 071e5877a..96c37bcda 100644
--- a/python_scripts/ensemble_sol_04.py
+++ b/python_scripts/ensemble_sol_04.py
@@ -70,7 +70,7 @@
 
 cv = KFold(n_splits=5, shuffle=True, random_state=0)
 results = cross_validate(
-    search, data, target, cv=cv, return_estimator=True, n_jobs=2
+    search, data, target, cv=cv, return_estimator=True, # n_jobs=2 # Uncomment this if you run locally
 )
 
 # %% [markdown]

From 3b3307258db0cd322e8496e327f3ec16740928a0 Mon Sep 17 00:00:00 2001
From: SebastienMelo <seastien.melo@polytechnique.edu>
Date: Wed, 17 Sep 2025 10:39:57 +0200
Subject: [PATCH 5/5] line length fix + double space

---
 python_scripts/ensemble_hist_gradient_boosting.py | 6 +++---
 python_scripts/ensemble_hyperparameters.py        | 2 +-
 python_scripts/ensemble_sol_04.py                 | 7 ++++++-
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/python_scripts/ensemble_hist_gradient_boosting.py b/python_scripts/ensemble_hist_gradient_boosting.py
index 7883366bd..7f0d7b47b 100644
--- a/python_scripts/ensemble_hist_gradient_boosting.py
+++ b/python_scripts/ensemble_hist_gradient_boosting.py
@@ -58,7 +58,7 @@
     data,
     target,
     scoring="neg_mean_absolute_error",
-    # n_jobs=2, # Uncomment this line if you run locally
+    # n_jobs=2,  # Uncomment this line if you run locally
 )
 
 # %%
@@ -122,7 +122,7 @@
     data,
     target,
     scoring="neg_mean_absolute_error",
-    # n_jobs=2, # Uncomment this line if you run locally
+    # n_jobs=2,  # Uncomment this line if you run locally
 )
 
 # %%
@@ -161,7 +161,7 @@
     data,
     target,
     scoring="neg_mean_absolute_error",
-    # n_jobs=2, # Uncomment this line if you run locally
+    # n_jobs=2,  # Uncomment this line if you run locally
 )
 
 # %%
diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py
index bd9a14a58..c7af37728 100644
--- a/python_scripts/ensemble_hyperparameters.py
+++ b/python_scripts/ensemble_hyperparameters.py
@@ -96,7 +96,7 @@
     scoring="neg_mean_absolute_error",
     n_iter=10,
     random_state=0,
-    # n_jobs=2, # Uncomment this line if you run locally
+    # n_jobs=2,  # Uncomment this line if you run locally
 )
 search_cv.fit(data_train, target_train)
 
diff --git a/python_scripts/ensemble_sol_04.py b/python_scripts/ensemble_sol_04.py
index 96c37bcda..b31393077 100644
--- a/python_scripts/ensemble_sol_04.py
+++ b/python_scripts/ensemble_sol_04.py
@@ -70,7 +70,12 @@
 
 cv = KFold(n_splits=5, shuffle=True, random_state=0)
 results = cross_validate(
-    search, data, target, cv=cv, return_estimator=True, # n_jobs=2 # Uncomment this if you run locally
+    search,
+    data,
+    target,
+    cv=cv,
+    return_estimator=True,
+    # n_jobs=2,  # Uncomment this line if you run locally
 )
 
 # %% [markdown]