## Import libraries

In [121]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from imblearn.over_sampling import RandomOverSampler


from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV



## 1. Data import

In [64]:
df = pd.read_csv('wine-quality.csv')

## 2. Data_cleaning

In [65]:
(
    df
    .drop(['density','free sulfur dioxide'],axis = 1)
    .dropna()
    .drop_duplicates()
)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,total sulfur dioxide,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,170.0,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,132.0,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,97.0,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,186.0,3.19,0.40,9.9,6
6,6.2,0.32,0.16,7.0,0.045,136.0,3.18,0.47,9.6,6
...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,92.0,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,168.0,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,111.0,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,110.0,3.34,0.38,12.8,7


In [66]:
def data_cleaning_function(df):
    df_clean = (
            df
            .drop(['density','free sulfur dioxide'],axis = 1)
            .dropna()
            .drop_duplicates()
        )
    return df_clean
df_clean = data_cleaning_function(df)

### split the clean data

In [67]:

def train_test_val_split_function(df):
    X = df.drop(['quality'], axis=1).copy(deep=True)
    y = df['quality'].copy(deep=True)

    X_train_set, X_test, y_train_set, y_test = train_test_split(X, y, test_size=0.1, stratify= y,random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(X_train_set, y_train_set, test_size=0.3, stratify=y, random_state=23)

    return X_train, X_val, y_train, y_val, X_test, y_test

# Splitting Data and Debugging
X_train, X_val, y_train, y_val, X_test, y_test = train_test_val_split_function(df_clean)

print("Shapes:")
print(f"X_train: {X_train.shape}, X_val: {X_val.shape}, X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}, y_val: {y_val.shape}, y_test: {y_test.shape}")

Shapes:
X_train: (2494, 9), X_val: (1070, 9), X_test: (397, 9)
y_train: (2494,), y_val: (1070,), y_test: (397,)


## 3. Data transformations

In [68]:
( 
    X_train
)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,total sulfur dioxide,pH,sulphates,alcohol
2379,6.4,0.270,0.19,2.00,0.084,191.0,3.49,0.63,9.6
251,8.5,0.260,0.21,16.20,0.074,197.0,3.02,0.50,9.8
3020,8.4,0.320,0.35,11.70,0.029,46.0,3.02,0.34,11.8
1224,7.2,0.230,0.39,2.30,0.033,102.0,3.26,0.54,12.3
1896,7.5,0.290,0.67,8.10,0.037,166.0,2.90,0.41,8.9
...,...,...,...,...,...,...,...,...,...
2227,7.7,0.270,0.61,12.00,0.046,179.0,3.07,0.46,8.9
2544,6.9,0.320,0.30,1.80,0.036,117.0,3.24,0.48,11.0
1127,6.4,0.125,0.29,5.85,0.042,99.0,3.23,0.32,12.0
4211,7.1,0.380,0.42,11.80,0.041,193.0,3.04,0.49,10.0


### SMOTE

In [109]:
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,total sulfur dioxide,pH,sulphates,alcohol
2379,6.4,0.27,0.19,2.0,0.084,191.0,3.49,0.63,9.6
251,8.5,0.26,0.21,16.2,0.074,197.0,3.02,0.5,9.8
3020,8.4,0.32,0.35,11.7,0.029,46.0,3.02,0.34,11.8
1224,7.2,0.23,0.39,2.3,0.033,102.0,3.26,0.54,12.3
1896,7.5,0.29,0.67,8.1,0.037,166.0,2.9,0.41,8.9


In [110]:
y_train.head()

2379    4
251     3
3020    6
1224    7
1896    6
Name: quality, dtype: int64

### Random forest regressor

In [123]:

from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Assuming your pipeline setup with ROS and RandomForestRegressor
pipeline = ImbPipeline(steps=[
    ('ros', RandomOverSampler(random_state=42)),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Parameter grid for GridSearchCV
param_grid = {
    'regressor__n_estimators': [200,400],
    'regressor__max_depth': [None, 10]
}

# Example GridSearchCV setup
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2')

# Fit the GridSearchCV
grid_search.fit(X_train, y_train)

# Best params and best score
print(f'Best params: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')

Best params: {'regressor__max_depth': None, 'regressor__n_estimators': 200}
Best score: 0.24047650674476823


### Elastic net regressor

In [126]:
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import ElasticNet


# Assuming your pipeline setup with ROS and RandomForestRegressor
pipeline = ImbPipeline(steps=[
    ('ros', RandomOverSampler(random_state=42)),
    ('regressor', ElasticNet(random_state=42))
    ])

# Parameter grid for GridSearchCV
# Parameter grid for GridSearchCV
param_grid = {

    'regressor__alpha': [0.1, 1.0,1.3,1.9],
    'regressor__l1_ratio': [0.2,0.4,0.6]
}

# Example GridSearchCV setup
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2')

# Fit the GridSearchCV
grid_search.fit(X_train, y_train)

# Best params and best score
print(f'Best params: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')

Best params: {'regressor__alpha': 1.0, 'regressor__l1_ratio': 0.4}
Best score: 0.07385624360601642


### Xgb_booster

In [132]:
import numpy as np
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), slice(0, X_train.shape[1]))
    ])

# Create the pipeline
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('feature_selection', SelectKBest(f_regression, k='all')),
    ('smote', SMOTE(random_state=42)),
    ('regressor', XGBRegressor(random_state=42))
])

# Define a wider parameter grid
param_grid = {
    'poly__degree': [1, 2],
    'feature_selection__k': [5, 7, 9, 'all'],
    'smote__k_neighbors': [3, 5, 7],
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [3, 5, 7, None],
    'regressor__learning_rate': [0.01, 0.1, 0.3],
    'regressor__subsample': [0.8, 1.0],
    'regressor__colsample_bytree': [0.8, 1.0],
}

# Use RandomizedSearchCV for faster tuning
random_search = RandomizedSearchCV(
    pipeline, param_distributions=param_grid, n_iter=100, cv=5, 
    scoring='r2', n_jobs=-1, random_state=42, verbose=2
)

# Fit the RandomizedSearchCV
random_search.fit(X_train, y_train)

# Print results
print(f'Best params: {random_search.best_params_}')
print(f'Best score: {random_search.best_score_}')

# Evaluate on validation set
val_score = random_search.score(X_val, y_val)
print(f'Validation R2 score: {val_score}')

# Feature importance
feature_importance = random_search.best_estimator_.named_steps['regressor'].feature_importances_
feature_names = random_search.best_estimator_.named_steps['feature_selection'].get_feature_names_out()

for name, importance in sorted(zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True):
    print(f'{name}: {importance}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END feature_selection__k=9, poly__degree=1, regressor__colsample_bytree=0.8, regressor__learning_rate=0.01, regressor__max_depth=7, regressor__n_estimators=200, regressor__subsample=1.0, smote__k_neighbors=7; total time=   0.0s
[CV] END feature_selection__k=9, poly__degree=1, regressor__colsample_bytree=0.8, regressor__learning_rate=0.01, regressor__max_depth=7, regressor__n_estimators=200, regressor__subsample=1.0, smote__k_neighbors=7; total time=   0.0s
[CV] END feature_selection__k=5, poly__degree=1, regressor__colsample_bytree=0.8, regressor__learning_rate=0.01, regressor__max_depth=7, regressor__n_estimators=300, regressor__subsample=1.0, smote__k_neighbors=3; total time=   0.0s
[CV] END feature_selection__k=9, poly__degree=1, regressor__colsample_bytree=0.8, regressor__learning_rate=0.01, regressor__max_depth=7, regressor__n_estimators=200, regressor__subsample=1.0, smote__k_neighbors=7; total time=   0.0s
[CV] 

ValueError: 
All the 500 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
96 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 322, in fit
    Xt, yt = self._fit(X, y, routed_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 258, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
                               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 1050, in _fit_resample_one
    X_res, y_res = sampler.fit_resample(X, y, **params.get("fit_resample", {}))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/base.py", line 208, in fit_resample
    return super().fit_resample(X, y)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/base.py", line 112, in fit_resample
    output = self._fit_resample(X, y)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/over_sampling/_smote/base.py", line 382, in _fit_resample
    nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/neighbors/_base.py", line 835, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 8, n_samples_fit = 2, n_samples = 2

--------------------------------------------------------------------------------
64 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 322, in fit
    Xt, yt = self._fit(X, y, routed_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 258, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
                               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 1050, in _fit_resample_one
    X_res, y_res = sampler.fit_resample(X, y, **params.get("fit_resample", {}))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/base.py", line 208, in fit_resample
    return super().fit_resample(X, y)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/base.py", line 112, in fit_resample
    output = self._fit_resample(X, y)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/over_sampling/_smote/base.py", line 382, in _fit_resample
    nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/neighbors/_base.py", line 835, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 8, n_samples_fit = 3, n_samples = 3

--------------------------------------------------------------------------------
84 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 322, in fit
    Xt, yt = self._fit(X, y, routed_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 258, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
                               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 1050, in _fit_resample_one
    X_res, y_res = sampler.fit_resample(X, y, **params.get("fit_resample", {}))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/base.py", line 208, in fit_resample
    return super().fit_resample(X, y)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/base.py", line 112, in fit_resample
    output = self._fit_resample(X, y)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/over_sampling/_smote/base.py", line 382, in _fit_resample
    nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/neighbors/_base.py", line 835, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 4, n_samples_fit = 2, n_samples = 2

--------------------------------------------------------------------------------
56 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 322, in fit
    Xt, yt = self._fit(X, y, routed_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 258, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
                               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 1050, in _fit_resample_one
    X_res, y_res = sampler.fit_resample(X, y, **params.get("fit_resample", {}))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/base.py", line 208, in fit_resample
    return super().fit_resample(X, y)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/base.py", line 112, in fit_resample
    output = self._fit_resample(X, y)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/over_sampling/_smote/base.py", line 382, in _fit_resample
    nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/neighbors/_base.py", line 835, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 4, n_samples_fit = 3, n_samples = 3

--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 322, in fit
    Xt, yt = self._fit(X, y, routed_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 258, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
                               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 1050, in _fit_resample_one
    X_res, y_res = sampler.fit_resample(X, y, **params.get("fit_resample", {}))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/base.py", line 208, in fit_resample
    return super().fit_resample(X, y)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/base.py", line 112, in fit_resample
    output = self._fit_resample(X, y)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/over_sampling/_smote/base.py", line 382, in _fit_resample
    nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/neighbors/_base.py", line 835, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 6, n_samples_fit = 2, n_samples = 2

--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 322, in fit
    Xt, yt = self._fit(X, y, routed_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 258, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
                               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 1050, in _fit_resample_one
    X_res, y_res = sampler.fit_resample(X, y, **params.get("fit_resample", {}))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/base.py", line 208, in fit_resample
    return super().fit_resample(X, y)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/base.py", line 112, in fit_resample
    output = self._fit_resample(X, y)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/over_sampling/_smote/base.py", line 382, in _fit_resample
    nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/neighbors/_base.py", line 835, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 6, n_samples_fit = 3, n_samples = 3


In [131]:
import numpy as np
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), slice(0, X_train.shape[1]))
    ])

# Create the pipeline
pipeline = ImbPipeline(steps=[
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('feature_selection', SelectKBest(f_regression, k='all')),
    ('regressor', XGBRegressor(random_state=42))
])

# Define a wider parameter grid
param_grid = {
    'poly__degree': [1, 2],
    'feature_selection__k': [5, 7, 9, 'all'],
    'regressor__n_estimators': [50,100],
    'regressor__max_depth': [2,3, 5, None],
    'regressor__learning_rate': [0.009, 0.02],
    'regressor__subsample': [0.6, 0.8],
    'regressor__colsample_bytree': [1.0, 1.4],
}

# Use RandomizedSearchCV for faster tuning
random_search = RandomizedSearchCV(
    pipeline, param_distributions=param_grid, n_iter=100, cv=5, 
    scoring='r2', n_jobs=-1, random_state=42, verbose=2
)

# Fit the RandomizedSearchCV
random_search.fit(X_train, y_train)

# Print results
print(f'Best params: {random_search.best_params_}')
print(f'Best score: {random_search.best_score_}')

# Evaluate on validation set
val_score = random_search.score(X_val, y_val)
print(f'Validation R2 score: {val_score}')

# Feature importance
feature_importance = random_search.best_estimator_.named_steps['regressor'].feature_importances_
feature_names = random_search.best_estimator_.named_steps['feature_selection'].get_feature_names_out()

for name, importance in sorted(zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True):
    print(f'{name}: {importance}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END feature_selection__k=9, poly__degree=1, regressor__colsample_bytree=1.4, regressor__learning_rate=0.02, regressor__max_depth=2, regressor__n_estimators=50, regressor__subsample=0.6; total time=   0.0s
[CV] END feature_selection__k=9, poly__degree=1, regressor__colsample_bytree=1.4, regressor__learning_rate=0.02, regressor__max_depth=2, regressor__n_estimators=50, regressor__subsample=0.6; total time=   0.0s
[CV] END feature_selection__k=9, poly__degree=1, regressor__colsample_bytree=1.4, regressor__learning_rate=0.02, regressor__max_depth=2, regressor__n_estimators=50, regressor__subsample=0.6; total time=   0.0s
[CV] END feature_selection__k=9, poly__degree=1, regressor__colsample_bytree=1.4, regressor__learning_rate=0.02, regressor__max_depth=2, regressor__n_estimators=50, regressor__subsample=0.6; total time=   0.0s
[CV] END feature_selection__k=9, poly__degree=1, regressor__colsample_bytree=1.4, regressor__lear

235 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
235 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/imblearn/pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "/opt/anaconda3/lib/python3.11/site-packages/xgboost/cor