In [1]:
!pip install xgboost --quiet

import numpy as np, pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_wine
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

np.random.seed(42)


In [2]:
data = load_wine(as_frame=True)
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

X.shape, X_train.shape, X_test.shape


((178, 13), (142, 13), (36, 13))

In [3]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

baseline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=500, multi_class="multinomial"))
])

scores_acc = cross_val_score(baseline, X, y, cv=cv, scoring="accuracy")
scores_f1  = cross_val_score(baseline, X, y, cv=cv, scoring="f1_macro")

print(f"Baseline Accuracy: {scores_acc.mean():.4f} ± {scores_acc.std():.4f}")
print(f"Baseline F1-macro: {scores_f1.mean():.4f} ± {scores_f1.std():.4f}")




Baseline Accuracy: 0.9833 ± 0.0136
Baseline F1-macro: 0.9829 ± 0.0140


In [4]:
results = []

for k in [3, 5, 7, 9, 11, X.shape[1]]:
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("select", SelectKBest(score_func=mutual_info_classif, k=k)),
        ("clf", LogisticRegression(max_iter=500, multi_class="multinomial"))
    ])
    acc = cross_val_score(pipe, X, y, cv=cv, scoring="accuracy").mean()
    f1  = cross_val_score(pipe, X, y, cv=cv, scoring="f1_macro").mean()
    results.append(("SelectKBest", k, acc, f1))

pd.DataFrame(results, columns=["Method","K","Accuracy","F1_macro"]).sort_values("F1_macro", ascending=False)




Unnamed: 0,Method,K,Accuracy,F1_macro
5,SelectKBest,13,0.983333,0.982905
2,SelectKBest,7,0.977619,0.978385
3,SelectKBest,9,0.97746,0.977861
4,SelectKBest,11,0.972063,0.972578
1,SelectKBest,5,0.943968,0.957477
0,SelectKBest,3,0.949365,0.936778


In [5]:
results_rfe = []

for k in [3, 5, 7, 9, 11]:
    estimator = LogisticRegression(max_iter=500, multi_class="multinomial")
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("rfe", RFE(estimator=estimator, n_features_to_select=k)),
        ("clf", LogisticRegression(max_iter=500, multi_class="multinomial"))
    ])
    acc = cross_val_score(pipe, X, y, cv=cv, scoring="accuracy").mean()
    f1  = cross_val_score(pipe, X, y, cv=cv, scoring="f1_macro").mean()
    results_rfe.append(("RFE_LogReg", k, acc, f1))

pd.DataFrame(results_rfe, columns=["Method","K","Accuracy","F1_macro"]).sort_values("F1_macro", ascending=False)




Unnamed: 0,Method,K,Accuracy,F1_macro
4,RFE_LogReg,11,0.988889,0.988713
3,RFE_LogReg,9,0.988889,0.988713
2,RFE_LogReg,7,0.971905,0.973712
1,RFE_LogReg,5,0.971905,0.97306
0,RFE_LogReg,3,0.93254,0.934473


In [6]:
# L1-regularized Logistic Regression
pipe_l1 = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(penalty="l1", solver="saga", max_iter=1000, multi_class="multinomial"))
])
acc_l1 = cross_val_score(pipe_l1, X, y, cv=cv, scoring="accuracy").mean()
f1_l1  = cross_val_score(pipe_l1, X, y, cv=cv, scoring="f1_macro").mean()
print(f"L1-LogReg  Accuracy={acc_l1:.4f}  F1-macro={f1_l1:.4f}")

# Random Forest Feature Importance
rf = RandomForestClassifier(n_estimators=400, random_state=42)
acc_rf = cross_val_score(rf, X, y, cv=cv, scoring="accuracy").mean()
f1_rf  = cross_val_score(rf, X, y, cv=cv, scoring="f1_macro").mean()
print(f"RandomForest Accuracy={acc_rf:.4f}  F1-macro={f1_rf:.4f}")

# XGBoost Model
xgb = XGBClassifier(n_estimators=400, max_depth=4, learning_rate=0.05,
                    subsample=0.9, colsample_bytree=0.9,
                    eval_metric="mlogloss", random_state=42)
acc_xgb = cross_val_score(xgb, X, y, cv=cv, scoring="accuracy").mean()
f1_xgb  = cross_val_score(xgb, X, y, cv=cv, scoring="f1_macro").mean()
print(f"XGBoost     Accuracy={acc_xgb:.4f}  F1-macro={f1_xgb:.4f}")




L1-LogReg  Accuracy=0.9775  F1-macro=0.9784
RandomForest Accuracy=0.9830  F1-macro=0.9840
XGBoost     Accuracy=0.9776  F1-macro=0.9778


In [7]:
df_kbest = pd.DataFrame(results, columns=["Method","K","Accuracy","F1_macro"])
best_k = int(df_kbest.sort_values("F1_macro", ascending=False).iloc[0]["K"])

final_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("select", SelectKBest(score_func=mutual_info_classif, k=best_k)),
    ("clf", LogisticRegression(max_iter=1000, multi_class="multinomial"))
])

final_model = final_pipe.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

print("Best K from SelectKBest:", best_k)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Test F1-macro:", f1_score(y_test, y_pred, average='macro'))
print(classification_report(y_test, y_pred, target_names=data.target_names))


Best K from SelectKBest: 13
Test Accuracy: 0.9722222222222222
Test F1-macro: 0.9709618874773139
              precision    recall  f1-score   support

     class_0       1.00      1.00      1.00        12
     class_1       0.93      1.00      0.97        14
     class_2       1.00      0.90      0.95        10

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.97        36
weighted avg       0.97      0.97      0.97        36





In [8]:
selector = final_model.named_steps["select"]
mask = selector.get_support()
selected_features = X.columns[mask]
scores = selector.scores_[mask]

feat_importance = pd.DataFrame({"feature": selected_features, "score": scores}).sort_values("score", ascending=False)
feat_importance.reset_index(drop=True).head(10)


Unnamed: 0,feature,score
0,flavanoids,0.68218
1,color_intensity,0.612972
2,proline,0.543798
3,od280/od315_of_diluted_wines,0.501474
4,alcohol,0.452803
5,hue,0.432519
6,total_phenols,0.419806
7,malic_acid,0.328711
8,magnesium,0.261513
9,proanthocyanins,0.260724
