In [3]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.metrics import  f1_score, make_scorer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import TomekLinks

# F1 scorer that treats label 1 as the positive class.
f1_scorer = make_scorer(f1_score, pos_label=1)

DF_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
RANDOM_SEED = 2025

# Read the semicolon-delimited CSV, discard exact duplicate rows, and append
# a binary `target` column: 0 where quality is below 7, 1 otherwise.
df = (
    pd.read_csv(DF_URL, sep=";")
    .drop_duplicates()
    .assign(target=lambda frame: np.where(frame["quality"] < 7, 0, 1))
)

# Features are every column except the raw quality score and the derived label.
X = df.drop(columns=["quality", "target"])
y = df["target"]

# 70% of the rows go to training; stratifying on y keeps the class
# proportions the same in both partitions, and the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    stratify=y,
    random_state=RANDOM_SEED,
)

# Hyper-parameters addressed as "<pipeline step name>__<parameter>"; they are
# not applied here, only collected for a later `set_params` call.
params = dict(
    knn__n_neighbors=25,
    knn__weights="uniform",
    oversampler__sampling_strategy=0.4,
    oversampler__shrinkage=2,
)

# Standard-scale the three listed columns. No `remainder` is passed, so the
# ColumnTransformer default applies to every other column.
# NOTE(review): the transformer label says "2" but three columns are listed;
# the label is kept as-is because it is part of the parameter namespace.
column_selector = ColumnTransformer(
    [
        (
            "select_best_2",
            StandardScaler(),
            ["volatile acidity", "sulphates", "alcohol"],
        ),
    ]
)

# Steps, in order: column selection + scaling, power transform, PCA,
# TomekLinks under-sampling, seeded random over-sampling, and a k-nearest
# neighbours classifier as the final estimator.
model = Pipeline(
    steps=[
        ("column_selector", column_selector),
        ("unskew", PowerTransformer()),
        ("pca", PCA()),
        ("undersampler", TomekLinks()),
        ("oversampler", RandomOverSampler(random_state=RANDOM_SEED)),
        ("knn", KNeighborsClassifier()),
    ]
)

# Apply the chosen hyper-parameters, then train on the training partition.
model.set_params(**params).fit(X_train, y_train)

# Score the fitted pipeline on the training data first, then on the held-out
# test data, using the same F1 scorer for both.
f1_in_sample, f1_oo_sample = (
    f1_scorer(model, features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

# Report the out-of-sample score before the in-sample one, one per line.
print(
    f"Model's out of sample f1 score: {f1_oo_sample}",
    f"Model's in sample f1 score: {f1_in_sample}",
    sep="\n",
)

Model's out of sample f1 score: 0.54421768707483
Model's in sample f1 score: 0.580441640378549
