In [30]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Read the dataset from 'modified.csv' (path is relative to the notebook's
# working directory) and preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='modified.csv')
df.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,2.128232,0.530628,0.0,1.064711,0.07325,2.484907,3.555348,0.9978,3.51,0.444686,2.341806,5
1,2.174752,0.631272,0.0,1.280934,0.09349,3.258097,4.219508,0.9968,3.2,0.518794,2.379546,5
2,2.174752,0.565314,0.04,1.193922,0.088011,2.772589,4.007333,0.997,3.26,0.500775,2.379546,5
3,2.501436,0.24686,0.56,1.064711,0.072321,2.890372,4.110874,0.998,3.16,0.457425,2.379546,6
4,2.128232,0.506818,0.0,1.029619,0.072321,2.639057,3.713572,0.9978,3.51,0.444686,2.341806,5


In [31]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PowerTransformer, RobustScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures

# Target is the "quality" column; every other column of `df` is a predictor.
y_all = df["quality"].copy()
X_all = df.drop(columns=["quality"]).copy()

# Hold out 20% of the rows for the final test; the seed fixes the split.
X_tr, X_te, y_tr, y_te = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    random_state=42,
)

# Fit a throw-away degree-2 expansion on the training predictors only, purely
# to find out how many columns the expansion produces.
poly = PolynomialFeatures(degree=2, include_bias=False).fit(X_tr)
all_poly_features = poly.get_feature_names_out(X_tr.columns)
max_poly_features = len(all_poly_features)

# Keep 40% of the expanded columns, clamped to the range [10, 60].
select_k = max(10, min(60, int(0.4 * max_poly_features)))

print("Number of polynomial features:", max_poly_features)
print("Selecting k =", select_k)



Number of polynomial features: 77
Selecting k = 30


In [32]:
# Modelling pipeline:
#   Yeo-Johnson power transform -> robust scaling -> degree-2 polynomial
#   expansion -> keep the `select_k` columns with the best f_regression
#   scores -> histogram-based gradient boosting.
# All preprocessing lives inside the Pipeline, so during cross-validation it
# is re-fitted on each fold's training portion together with the model.
preprocessing_steps = [
    ("power", PowerTransformer(method="yeo-johnson", standardize=False)),
    ("scale", RobustScaler()),
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("select", SelectKBest(score_func=f_regression, k=select_k)),
]

# Starting hyper-parameters; max_depth, learning_rate and l2_regularization
# are overridden by the grid below. Early stopping is enabled with
# validation_fraction=0.1.
booster = HistGradientBoostingRegressor(
    random_state=42,
    max_iter=500,
    early_stopping=True,
    validation_fraction=0.1,
    max_depth=6,
    learning_rate=0.07,
    l2_regularization=0.01,
)

pipe = Pipeline(steps=preprocessing_steps + [("hgb", booster)])

# 3 x 3 x 3 = 27 candidate settings for the boosting step.
param_grid = {
    "hgb__max_depth": [4, 6, 8],
    "hgb__learning_rate": [0.03, 0.07, 0.12],
    "hgb__l2_regularization": [0.0, 0.01, 0.1],
}

# Exhaustive search scored by R^2 with 5-fold CV, using all available cores.
search = GridSearchCV(
    pipe,
    param_grid,
    scoring="r2",
    cv=5,
    n_jobs=-1,
    verbose=0,
)
search.fit(X_tr, y_tr)

# Score the held-out test split with the fitted search object.
y_pred_te = search.predict(X_te)
r2, mse, mae = (
    metric(y_te, y_pred_te)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

print(f"Best params: {search.best_params_}")
print(f"Test R2: {r2:.4f}")
print(f"Test MSE: {mse:.4f}")
print(f"Test MAE: {mae:.4f}")

Best params: {'hgb__l2_regularization': 0.0, 'hgb__learning_rate': 0.12, 'hgb__max_depth': 6}
Test R2: 0.4451
Test MSE: 0.3931
Test MAE: 0.4713
