In [1]:
import sklearn, numpy
print("scikit-learn version:", sklearn.__version__)
print("numpy version:", numpy.__version__)

scikit-learn version: 1.7.1
numpy version: 2.1.3


In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsClassifier

# Prepare data
from sklearn.datasets import load_iris
import numpy as np
data = load_iris()
X, y = data.data.copy(), data.target
X[0:10, 0] = np.nan  # simulate missing

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Step 1: Impute
imputer = SimpleImputer(strategy='mean')
try:
    X_imp = imputer.fit_transform(X_train)
    print("Imputer OK, shape:", X_imp.shape)
except Exception as e:
    print("Imputer error:", e)

# Step 2: Scale
scaler = StandardScaler()
try:
    X_std = scaler.fit_transform(X_imp)
    print("Scaler OK, shape:", X_std.shape)
except Exception as e:
    print("Scaler error:", e)

# Step 3: Feature selector
selector = SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear'))
try:
    selector.fit(X_std, y_train)
    X_sel = selector.transform(X_std)
    print("Selector OK, features selected:", X_sel.shape[1])
except Exception as e:
    print("Selector error:", e)

# Step 4: Classifier
knn = KNeighborsClassifier(n_neighbors=3)
try:
    knn.fit(X_sel, y_train)
    print("Classifier OK")
except Exception as e:
    print("Classifier error:", e)

Imputer OK, shape: (120, 4)
Scaler OK, shape: (120, 4)
Selector OK, features selected: 4
Classifier OK




In [3]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load and simulate missingness
iris = load_iris()
X, y = iris.data.copy(), iris.target
X[0:10, 0] = np.nan

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('feature_selector',
        SelectFromModel(
            LogisticRegression(penalty='l1', solver='liblinear', C=1.0)
        )
    ),
    ('classifier', KNeighborsClassifier(n_neighbors=3))
])

# Fit and evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("Pipeline accuracy:", accuracy_score(y_test, y_pred))



Pipeline accuracy: 0.9333333333333333


In [None]:
# integrated GridSearchCV into pipeline for hyperparameter tuning of both the L1-based feature selector (via C)
#  and the KNN classifier (n_neighbors) using the Iris dataset with simulated missing values.



import numpy as np
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# 1. Load Iris and simulate missing values
iris = load_iris()
X, y = iris.data.copy(), iris.target
X[:10, 0] = np.nan  # Simulate missing in first column

# 2. Define the pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('feature_selection',
        SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear'))
    ),
    ('classifier', KNeighborsClassifier())
])

# 3. Set up hyperparameter grid
param_grid = {
    'feature_selection__estimator__C': [0.01, 0.1, 1.0, 10.0],
    'classifier__n_neighbors': [3, 5, 7]
}

# 4. Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5)

# 5. Fit and find the best parameters
grid_search.fit(X, y)

# 6. Results
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated Accuracy:", np.round(grid_search.best_score_, 4))

15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\TSHIFHIWA AUSTIN\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\TSHIFHIWA AUSTIN\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\

Best Parameters: {'classifier__n_neighbors': 5, 'feature_selection__estimator__C': 0.1}
Best Cross-Validated Accuracy: 0.9667


 0.96666667 0.96              nan 0.96666667 0.96666667 0.96      ]
