From bec1a4a9e4d982336af742c5a0c1549530dc2d00 Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 10 Sep 2025 11:48:58 +0200 Subject: [PATCH 1/5] commented the n_jobs =2 --- python_scripts/ensemble_hist_gradient_boosting.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python_scripts/ensemble_hist_gradient_boosting.py b/python_scripts/ensemble_hist_gradient_boosting.py index 7a40a569d..7883366bd 100644 --- a/python_scripts/ensemble_hist_gradient_boosting.py +++ b/python_scripts/ensemble_hist_gradient_boosting.py @@ -58,7 +58,7 @@ data, target, scoring="neg_mean_absolute_error", - n_jobs=2, + # n_jobs=2, # Uncomment this line if you run locally ) # %% @@ -122,7 +122,7 @@ data, target, scoring="neg_mean_absolute_error", - n_jobs=2, + # n_jobs=2, # Uncomment this line if you run locally ) # %% @@ -161,7 +161,7 @@ data, target, scoring="neg_mean_absolute_error", - n_jobs=2, + # n_jobs=2, # Uncomment this line if you run locally ) # %% From 01ca3a826508b8faf8a7495cd997174458f97392 Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 17 Sep 2025 10:17:17 +0200 Subject: [PATCH 2/5] added changes in other notebooks --- .../clustering/clustering_quiz_m4_01.md | 89 +++++++++++++++++++ .../clustering/clustering_quiz_m4_02.md | 45 ++++++++++ 2 files changed, 134 insertions(+) create mode 100644 jupyter-book/clustering/clustering_quiz_m4_01.md create mode 100644 jupyter-book/clustering/clustering_quiz_m4_02.md diff --git a/jupyter-book/clustering/clustering_quiz_m4_01.md b/jupyter-book/clustering/clustering_quiz_m4_01.md new file mode 100644 index 000000000..71411e377 --- /dev/null +++ b/jupyter-book/clustering/clustering_quiz_m4_01.md @@ -0,0 +1,89 @@ +# ✅ Quiz M4.01 + +```{admonition} Question +Imagine you work for a music streaming platform that hosts a vast library of +songs, playlists, and podcasts. You have access to detailed listening data from +millions of users. 
For each user, you know their most-listened genres, the +devices they use, their average session length, and how often they explore new +content. + +You want to segment users based on their listening patterns to improve +personalized recommendations, without relying on rigid, predefined labels like +"pop fan" or "casual listener" which may fail to capture the complexity of +their behavior. + +What kind of problem are you dealing with? + +- a) a supervised task +- b) an unsupervised task +- c) a classification task +- d) a clustering task + +_Select all answers that apply_ +``` + ++++ + +```{admonition} Question +The plots below show the cluster labels as found by k-means with 3 clusters, only +differing in the scaling step. Based on this, which conclusions can be obtained? + +![K-means on original features](../../figures/evaluation_quiz_kmeans_not_scaled.svg) +![K-means on scaled features](../../figures/evaluation_quiz_kmeans_scaled.svg) + +- a) without scaling, cluster assignment is dominated by the feature in the vertical axis +- b) without scaling, cluster assignment is dominated by the feature in the horizontal axis +- c) without scaling, both features contribute equally to cluster assignment + +_Select a single answer_ +``` + ++++ + +```{admonition} Question +Which of the following statements correctly describe factors that affect the +stability of k-means clustering across different resampling iterations of the data? + +- a) K-means can produce different results on resampled datasets due to + sensitivity to initialization. +- b) If data is unevenly distributed, the stability improves when increasing the + parameter `n_init` in the "k-means++" initialization. +- c) Stability under resampling is guaranteed after feature scaling. +- d) Increasing the number of clusters always reduces the variability of + results across resamples. 
+ +_Select all answers that apply_ +``` + ++++ + +```{admonition} Question +Which of the following statements correctly describe how WCSS (within-cluster +sum of squares, or inertia) behaves in k-means clustering? + +- a) For a fixed number of clusters, WCSS is lower when clusters are compact. +- b) For a fixed number of clusters, WCSS is lower for wider clusters. +- c) For a fixed number of clusters, lower WCSS implies lower computational cost + during training. +- d) Assuming `n_init` is large enough to ensure convergence, WCSS always + decreases as the number of clusters increases. + +_Select all answers that apply_ +``` + ++++ + +```{admonition} Question +Which of the following statements correctly describe differences between +supervised and unsupervised clustering metrics? + +- a) Supervised clustering metrics such as ARI and AMI require access to ground + truth labels to evaluate clustering performance. +- b) WCSS and the silhouette score evaluate internal cluster structure without + needing reference labels. +- c) V-measure is zero when labels are assigned completely at random. +- d) Supervised clustering metrics are not useful if the number of clusters does + not match the number of predefined classes. + +_Select all answers that apply_ +``` diff --git a/jupyter-book/clustering/clustering_quiz_m4_02.md b/jupyter-book/clustering/clustering_quiz_m4_02.md new file mode 100644 index 000000000..20edb0115 --- /dev/null +++ b/jupyter-book/clustering/clustering_quiz_m4_02.md @@ -0,0 +1,45 @@ +# ✅ Quiz M4.02 + +```{admonition} Question +If we increase `min_cluster_size` in HDBSCAN, what happens to the number of +points labeled as noise? + +- a) It decreases. +- b) It increases. +- c) It stays the same. +- d) HDBSCAN fails to converge. + +_Select a single answer_ + +``` + ++++ + +```{admonition} Question +What happens to k-means centroids in the presence of outliers? + +- a) They move towards the outliers assigned to their cluster. 
+- b) They are not sensitive to outliers. +- c) If a centroid is initialized on an outlier, it may remain isolated in + subsequent iterations. + +_Select all answers that apply_ + +``` + ++++ + +```{admonition} Question +A `KMeans` instance with `n_clusters=10` is used to transform the latitude and +longitude in a supervised learning pipeline. Provided the original dataset consists of +`n_features`, including those two, how many features are passed to +the final estimator of the pipeline? + +- a) `n_features` + 10 +- b) `n_features` + 8 +- c) `n_features` - 2 +- d) `n_features` + +_Select a single answer_ + +``` From 9d10869183c158f733355280d6ac29ee3ebd77f6 Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 17 Sep 2025 10:34:59 +0200 Subject: [PATCH 3/5] Revert "added changes in other notebooks" This reverts commit 01ca3a826508b8faf8a7495cd997174458f97392. --- .../clustering/clustering_quiz_m4_01.md | 89 ------------------- .../clustering/clustering_quiz_m4_02.md | 45 ---------- 2 files changed, 134 deletions(-) delete mode 100644 jupyter-book/clustering/clustering_quiz_m4_01.md delete mode 100644 jupyter-book/clustering/clustering_quiz_m4_02.md diff --git a/jupyter-book/clustering/clustering_quiz_m4_01.md b/jupyter-book/clustering/clustering_quiz_m4_01.md deleted file mode 100644 index 71411e377..000000000 --- a/jupyter-book/clustering/clustering_quiz_m4_01.md +++ /dev/null @@ -1,89 +0,0 @@ -# ✅ Quiz M4.01 - -```{admonition} Question -Imagine you work for a music streaming platform that hosts a vast library of -songs, playlists, and podcasts. You have access to detailed listening data from -millions of users. For each user, you know their most-listened genres, the -devices they use, their average session length, and how often they explore new -content. 
- -You want to segment users based on their listening patterns to improve -personalized recommendations, without relying on rigid, predefined labels like -"pop fan" or "casual listener" which may fail to capture the complexity of -their behavior. - -What kind of problem are you dealing with? - -- a) a supervised task -- b) an unsupervised task -- c) a classification task -- d) a clustering task - -_Select all answers that apply_ -``` - -+++ - -```{admonition} Question -The plots below show the cluster labels as found by k-means with 3 clusters, only -differing in the scaling step. Based on this, which conclusions can be obtained? - -![K-means on original features](../../figures/evaluation_quiz_kmeans_not_scaled.svg) -![K-means on scaled features](../../figures/evaluation_quiz_kmeans_scaled.svg) - -- a) without scaling, cluster assignment is dominated by the feature in the vertical axis -- b) without scaling, cluster assignment is dominated by the feature in the horizontal axis -- c) without scaling, both features contribute equally to cluster assignment - -_Select a single answer_ -``` - -+++ - -```{admonition} Question -Which of the following statements correctly describe factors that affect the -stability of k-means clustering across different resampling iterations of the data? - -- a) K-means can produce different results on resampled datasets due to - sensitivity to initialization. -- b) If data is unevenly distributed, the stability improves when increasing the - parameter `n_init` in the "k-means++" initialization. -- c) Stability under resampling is guaranteed after feature scaling. -- d) Increasing the number of clusters always reduces the variability of - results across resamples. - -_Select all answers that apply_ -``` - -+++ - -```{admonition} Question -Which of the following statements correctly describe how WCSS (within-cluster -sum of squares, or inertia) behaves in k-means clustering? 
- -- a) For a fixed number of clusters, WCSS is lower when clusters are compact. -- b) For a fixed number of clusters, WCSS is lower for wider clusters. -- c) For a fixed number of clusters, lower WCSS implies lower computational cost - during training. -- d) Assuming `n_init` is large enough to ensure convergence, WCSS always - decreases as the number of clusters increases. - -_Select all answers that apply_ -``` - -+++ - -```{admonition} Question -Which of the following statements correctly describe differences between -supervised and unsupervised clustering metrics? - -- a) Supervised clustering metrics such as ARI and AMI require access to ground - truth labels to evaluate clustering performance. -- b) WCSS and the silhouette score evaluate internal cluster structure without - needing reference labels. -- c) V-measure is zero when labels are assigned completely at random. -- d) Supervised clustering metrics are not useful if the number of clusters does - not match the number of predefined classes. - -_Select all answers that apply_ -``` diff --git a/jupyter-book/clustering/clustering_quiz_m4_02.md b/jupyter-book/clustering/clustering_quiz_m4_02.md deleted file mode 100644 index 20edb0115..000000000 --- a/jupyter-book/clustering/clustering_quiz_m4_02.md +++ /dev/null @@ -1,45 +0,0 @@ -# ✅ Quiz M4.02 - -```{admonition} Question -If we increase `min_cluster_size` in HDBSCAN, what happens to the number of -points labeled as noise? - -- a) It decreases. -- b) It increases. -- c) It stays the same. -- d) HDBSCAN fails to converge. - -_Select a single answer_ - -``` - -+++ - -```{admonition} Question -What happens to k-means centroids in the presence of outliers? - -- a) They move towards the outliers assigned to their cluster. -- b) They are not sensitive to outliers. -- c) If a centroid is initialized on an outlier, it may remain isolated in - subsequent iterations. 
- -_Select all answers that apply_ - -``` - -+++ - -```{admonition} Question -A `KMeans` instance with `n_clusters=10` is used to transform the latitude and -longitude in a supervised learning pipeline. Provided the original dataset consists of -`n_features`, including those two, how many features are passed to -the final estimator of the pipeline? - -- a) `n_features` + 10 -- b) `n_features` + 8 -- c) `n_features` - 2 -- d) `n_features` - -_Select a single answer_ - -``` From 17c6f50ebebc817de5ac31063c9ca4c50b4986be Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 17 Sep 2025 10:36:46 +0200 Subject: [PATCH 4/5] corrected the mistake <3 --- python_scripts/ensemble_hyperparameters.py | 4 ++-- python_scripts/ensemble_sol_04.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index 460854001..bd9a14a58 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -96,7 +96,7 @@ scoring="neg_mean_absolute_error", n_iter=10, random_state=0, - n_jobs=2, + # n_jobs=2, # Uncomment this line if you run locally ) search_cv.fit(data_train, target_train) @@ -184,7 +184,7 @@ scoring="neg_mean_absolute_error", n_iter=20, random_state=0, - n_jobs=2, + # n_jobs=2, # Uncomment this line if you run locally ) search_cv.fit(data_train, target_train) diff --git a/python_scripts/ensemble_sol_04.py b/python_scripts/ensemble_sol_04.py index 071e5877a..96c37bcda 100644 --- a/python_scripts/ensemble_sol_04.py +++ b/python_scripts/ensemble_sol_04.py @@ -70,7 +70,7 @@ cv = KFold(n_splits=5, shuffle=True, random_state=0) results = cross_validate( - search, data, target, cv=cv, return_estimator=True, n_jobs=2 + search, data, target, cv=cv, return_estimator=True, # n_jobs=2 # Uncomment this if you run locally ) # %% [markdown] From 3b3307258db0cd322e8496e327f3ec16740928a0 Mon Sep 17 00:00:00 2001 From: SebastienMelo Date: Wed, 17 Sep 
2025 10:39:57 +0200 Subject: [PATCH 5/5] line length fix + double space --- python_scripts/ensemble_hist_gradient_boosting.py | 6 +++--- python_scripts/ensemble_hyperparameters.py | 2 +- python_scripts/ensemble_sol_04.py | 7 ++++++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/python_scripts/ensemble_hist_gradient_boosting.py b/python_scripts/ensemble_hist_gradient_boosting.py index 7883366bd..7f0d7b47b 100644 --- a/python_scripts/ensemble_hist_gradient_boosting.py +++ b/python_scripts/ensemble_hist_gradient_boosting.py @@ -58,7 +58,7 @@ data, target, scoring="neg_mean_absolute_error", - # n_jobs=2, # Uncomment this line if you run locally + # n_jobs=2, # Uncomment this line if you run locally ) # %% @@ -122,7 +122,7 @@ data, target, scoring="neg_mean_absolute_error", - # n_jobs=2, # Uncomment this line if you run locally + # n_jobs=2, # Uncomment this line if you run locally ) # %% @@ -161,7 +161,7 @@ data, target, scoring="neg_mean_absolute_error", - # n_jobs=2, # Uncomment this line if you run locally + # n_jobs=2, # Uncomment this line if you run locally ) # %% diff --git a/python_scripts/ensemble_hyperparameters.py b/python_scripts/ensemble_hyperparameters.py index bd9a14a58..c7af37728 100644 --- a/python_scripts/ensemble_hyperparameters.py +++ b/python_scripts/ensemble_hyperparameters.py @@ -96,7 +96,7 @@ scoring="neg_mean_absolute_error", n_iter=10, random_state=0, - # n_jobs=2, # Uncomment this line if you run locally + # n_jobs=2, # Uncomment this line if you run locally ) search_cv.fit(data_train, target_train) diff --git a/python_scripts/ensemble_sol_04.py b/python_scripts/ensemble_sol_04.py index 96c37bcda..b31393077 100644 --- a/python_scripts/ensemble_sol_04.py +++ b/python_scripts/ensemble_sol_04.py @@ -70,7 +70,12 @@ cv = KFold(n_splits=5, shuffle=True, random_state=0) results = cross_validate( - search, data, target, cv=cv, return_estimator=True, # n_jobs=2 # Uncomment this if you run locally + search, + data, + target, + cv=cv, + 
return_estimator=True, + # n_jobs=2, # Uncomment this line if you run locally ) # %% [markdown]