Import Libraries

In [2]:
import pandas as pd
from sklearn import datasets
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler,PolynomialFeatures
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.svm import SVR 
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error


Load Data

In [3]:

diabetes = datasets.load_diabetes(as_frame=True)

y = diabetes.target
X = diabetes.data
X


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


Split The Data For Train And Test

In [4]:

from sklearn.model_selection import train_test_split 

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=13)
X_train.shape, X_val.shape, y_train.shape, y_val.shape, y_val


((397, 10),
 (45, 10),
 (397,),
 (45,),
 135    272.0
 358     90.0
 194     86.0
 399    232.0
 405    281.0
 151     88.0
 142    235.0
 0      151.0
 169    152.0
 90      98.0
 27      85.0
 71     270.0
 79     113.0
 341    263.0
 51     225.0
 69     178.0
 425    152.0
 240    275.0
 245    125.0
 369    167.0
 35     102.0
 7       63.0
 334     72.0
 196     72.0
 66     150.0
 29     283.0
 269     87.0
 119    200.0
 118    179.0
 170     47.0
 44     259.0
 304    253.0
 30     129.0
 395    258.0
 382    132.0
 125    161.0
 191    178.0
 231    154.0
 61     144.0
 12     179.0
 105     53.0
 407    140.0
 361    182.0
 18      97.0
 152    292.0
 Name: target, dtype: float64)

Building The Pipeline For A Model

In [4]:
GBR_pipeline = Pipeline([('scaler', StandardScaler()),("select", SelectKBest(k=7)),
('logr',TransformedTargetRegressor(regressor=GradientBoostingRegressor(max_depth=1), transformer=MinMaxScaler()))]) 



GBR_pipeline.fit(X_train, y_train)
GBR_y_pred=GBR_pipeline.predict(X_val)


rms = mean_squared_error(y_val, GBR_y_pred, squared=True) 
GBR_pipeline.score(X_val, y_val),rms

(0.39480216429984816, 3151.402308666783)

In [5]:

svr_pipeline=Pipeline(steps=[('standardscaler', StandardScaler()),('svr', SVR(C=20,epsilon=1))])
svr_pipeline.fit(X_train, y_train)
svr_pipeline_pred=svr_pipeline.predict(X_val)
rms = mean_squared_error(y_val, svr_pipeline_pred, squared=True) 
svr_pipeline.score(X_val, y_val),rms

(0.39573618770862806, 3146.538637065293)

Comparing The Models

In [6]:
pipelines=[GBR_pipeline,svr_pipeline]
pipe_dict={0:'logr',1:'svr'} 
for i,model in enumerate(pipelines):
    print("{} test accuracy is: {}".format(pipe_dict[i],model.score(X_val,y_val)))


logr test accuracy is: 0.39480216429984816
svr test accuracy is: 0.39573618770862806


GradientBoostingRegressor için GridSearchCV kullanımı

In [9]:


# Defining estimator 
model = GradientBoostingRegressor()

# Defining param_grid
params = {
    'n_estimators'      : range(100,500,50),
    'max_depth'         : [8, 9, 10, 11, 12],
    'max_features': ['auto'],
    'criterion' :['friedman_mse']
}

gsearch = GridSearchCV(estimator = model, param_grid = params)
gsearch.fit(X_train, y_train)



GridSearchCV(estimator=GradientBoostingRegressor(),
             param_grid={'criterion': ['friedman_mse'],
                         'max_depth': [8, 9, 10, 11, 12],
                         'max_features': ['auto'],
                         'n_estimators': range(100, 500, 50)})

In [10]:
gsearch.score(X_val,y_val)

0.2708488021575739

In [11]:
gsearch.best_params_

{'criterion': 'friedman_mse',
 'max_depth': 8,
 'max_features': 'auto',
 'n_estimators': 450}

Finding Best Model and Parameters with GridSearchCV

Alternative 1

In [10]:
pipeline = Pipeline([
    # handle missing values
    ("imputer", SimpleImputer()),
    # transform the input
    ("scaler", StandardScaler()),
    # select best features
    ("select", SelectKBest()),
    # fit to transformed outputs
    ("model", TransformedTargetRegressor())
])

In [11]:
pipeline_grid = GridSearchCV(pipeline, [
    
    
    {
        "scaler": ["passthrough", StandardScaler(), MinMaxScaler()],
        "select__k": [4, 6, 8, 10],
        "model__regressor": [LinearRegression()],
        "model__transformer": ["passthrough", StandardScaler(), MinMaxScaler()]
    },
    {
        "scaler": ["passthrough", StandardScaler(), MinMaxScaler()],
        "model__regressor": [SVR()],
        "model__regressor__kernel": ["linear", "rbf", "sigmoid"],
        "model__transformer": ["passthrough", StandardScaler(), MinMaxScaler()]
    },
    {
        "scaler": [MinMaxScaler((-1, 1))],
        "select__k": [4, 6, 8, 10],
        "model__regressor": [MLPRegressor()],
        "model__regressor__hidden_layer_sizes": [(30, ), (10, 10, 10), (10, 20, 10)],
        "model__regressor__activation": ["relu", "tanh"],
        "model__regressor__solver": ["adam"],
        "model__regressor__max_iter": [1000],
        "model__transformer": [MinMaxScaler((-1, 1))]
    },
    {
        "scaler": [StandardScaler(), MinMaxScaler()],
        "select__k": [4, 6, 8, 10],
        "model__regressor": [GradientBoostingRegressor()],
        "model__regressor__n_estimators": [50, 100, 150, 200],
        "model__regressor__max_depth": [3, 6, 9, 12],
        "model__transformer": [StandardScaler(), MinMaxScaler()]
    },
], scoring="r2")  # scoring = "neg_mean_squared_error"



In [12]:
pipeline_grid.fit(X_train, y_train)


105 fits failed out of a total of 1715.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
105 fits failed with the following error:
Traceback (most recent call last):
  File "/root/.local/share/virtualenvs/machine-learning-3gRytY6Y/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/root/.local/share/virtualenvs/machine-learning-3gRytY6Y/lib/python3.9/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/root/.local/share/virtualenvs/machine-learning-3gRytY6Y/lib/python3.9/site-packages/sklearn/compose/_target.py", line 235, in fit
    self._fit_transformer(y_2d)
  File

In [13]:

pipeline_grid.best_estimator_

Alternative 2

In [14]:


model_params={

  'Gradient_Boosting':{

    'model':GradientBoostingRegressor(),

    'params':{
               'n_estimators': [50, 100, 150, 200],
               'max_depth': [3, 6, 9, 12]

}
},
  'Logistic_Regression':{

    'model':LogisticRegression(),

    'params':{
               'C': [1,5,10,20,40],
               

}
},

  'RandomForestRegression':{

    'model':RandomForestRegressor(),

    'params':{
               'n_estimators': [50, 100, 150, 200],
               'max_depth': [3, 6, 9, 12]

}
}
}



In [15]:
scores = []

for model_name, mp in model_params.items():
    gscv =  GridSearchCV(mp['model'], mp['params'])
    gscv.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': gscv.best_score_,
        'best_params': gscv.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df



Unnamed: 0,model,best_score,best_params
0,Gradient_Boosting,0.464967,"{'max_depth': 3, 'n_estimators': 50}"
1,Logistic_Regression,0.012563,{'C': 1}
2,RandomForestRegression,0.461109,"{'max_depth': 6, 'n_estimators': 150}"


In [16]:
gscv.best_params_

{'max_depth': 6, 'n_estimators': 150}

Predict The New Data

In [17]:
new_data = pd.DataFrame({ 'age':0.2 , 'sex': 0.050680, 'bmi':0.011595, 'bp':-0.036656, 's1':0.012191,
       's2':-0.036038, 's3':0.034309, 's4':0.022692, 's5':0.627, 's6':0.0093}, index = [0])


       

In [18]:
result = pipeline_grid.predict(new_data)
result

array([304.07163098])

Polynomial Regression with a Pipeline

In [19]:

poly_reg_model=Pipeline([('scaler2', StandardScaler()),('pca',PCA(n_components=2)),
('poly',PolynomialFeatures(degree=7,include_bias=False)),('regr',LinearRegression())])

poly_reg_model.fit(X_train,y_train)
poly_reg_model_pred=poly_reg_model.predict(X_val)
rms = mean_squared_error(y_val, poly_reg_model_pred, squared=True)
rms


3437.575184446335