In [1]:
import os
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

# Define gridsearch and load data


In [2]:
data_path = "../datasets/"
X_train = pd.read_csv(os.path.join(data_path,"X_train.csv"))
y_train = pd.read_csv(os.path.join(data_path,"y_train.csv")).values.ravel()

X_test = pd.read_csv(os.path.join(data_path,"X_test.csv"))
y_test = pd.read_csv(os.path.join(data_path,"y_test.csv")).values.ravel()

In [3]:
X_train.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [5]:
X_train.dtypes

carat      float64
cut          int64
color        int64
clarity      int64
depth      float64
table      float64
x          float64
y          float64
z          float64
dtype: object

In [4]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,2.04,3,2,3,60.2,58.0,8.24,8.17,4.94
1,0.9,4,3,2,62.9,56.0,6.11,6.17,3.86
2,0.7,4,1,2,62.7,55.0,5.63,5.67,3.54
3,0.54,3,4,6,62.0,57.0,5.25,5.2,3.24
4,0.5,3,0,2,61.2,60.0,5.14,5.09,3.13


## First iteration: Linear regression, Random Forest and XGBoost without Normalization

I choose these 3 models for the following reasons:

1) Linear regression is the most simple and straightfoward betweeen the relationships between price and the diamonds characteristics
2) XGboost and Random forest can adapt to non-linear relationships, are robust to outliers and are known to achieve good prediction power with tabular data, plus provide feature analysis.


In [114]:
pipeline_reg = Pipeline([
    ('feature_selection', SelectFromModel(LinearRegression())),
    ('model', LinearRegression())  
])

pipeline_rf = Pipeline([
    ('feature_selection', SelectFromModel(RandomForestRegressor())),
    ('model', RandomForestRegressor())
])

pipeline_xgb = Pipeline([
    ('feature_selection', SelectFromModel(XGBRegressor())),
    ('model', XGBRegressor())  
])

param_grid_reg= {
    'feature_selection__estimator': [LinearRegression()],
    'feature_selection__estimator__fit_intercept': [True, False],
    'model__fit_intercept': [True, False]
}

param_grid_rf = {
    'feature_selection__estimator': [RandomForestRegressor()],
    'feature_selection__estimator__max_depth': [None, 5, 10],
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 5, 10]
}

param_grid_xgb = {
    'feature_selection__estimator': [XGBRegressor()],
    'feature_selection__estimator__max_depth': [None, 1, 2, 3, 6, 9],
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.1, 0.3],
    'feature_selection__estimator__reg_alpha': [0, 0.1, 0.5],
    'feature_selection__estimator__reg_lambda': [0, 0.1, 0.5]
}


In [115]:
grid_search_reg_5 = GridSearchCV(pipeline_reg, param_grid_reg, cv=5, scoring='neg_mean_squared_error')
grid_search_rf_5 = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, scoring='neg_mean_squared_error')
grid_search_xgb_5 = GridSearchCV(pipeline_xgb, param_grid_xgb, cv=5, scoring='neg_mean_squared_error')

In [116]:
grid_search_reg_5.fit(X_train, y_train)
grid_search_rf_5.fit(X_train, y_train)
grid_search_xgb_5.fit(X_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl

In [117]:
models = [grid_search_reg_5, grid_search_rf_5, grid_search_xgb_5]
model_names = ['Linear regression','Random Forest','XGBoost']
i = 0
model_score = []
for model in models:
    predictor = model.best_estimator_
    train_predictions = predictor.predict(X_train)
    test_predictions = predictor.predict(X_test)
    model_score.append({
        'name': model_names[i],
        'iteration': "No normalization, Negative RMSE as score for best model",
        'best_parameters': model.best_params_,        
        'MAPE_train': mean_absolute_percentage_error(y_train, train_predictions),
        'RMSE_train':  np.sqrt(mean_squared_error(y_train.astype(float), train_predictions)),
        'MAE_train': mean_absolute_error(y_train, train_predictions),
        'R2_train': r2_score(y_train, train_predictions),
        'MAPE_test': mean_absolute_percentage_error(y_test, test_predictions),
        'RMSE_test':  np.sqrt(mean_squared_error(y_test, test_predictions)),
        'MAE_test': mean_absolute_error(y_test, test_predictions),
        'R2_test': r2_score(y_test, test_predictions)
    })
    i+=1
model_score

[{'name': 'Linear regression',
  'iteration': 'No normalization, Negative RMSE as score for best model',
  'best_parameters': {'feature_selection__estimator': LinearRegression(),
   'feature_selection__estimator__fit_intercept': False,
   'model__fit_intercept': False},
  'MAPE_train': 0.3289853336409557,
  'RMSE_train': 1511.2631182212017,
  'MAE_train': 953.0958916672677,
  'R2_train': 0.8536215306510724,
  'MAPE_test': 0.32420899137920367,
  'RMSE_test': 1578.0928564115052,
  'MAE_test': 978.8254970352671,
  'R2_test': 0.851039631414336},
 {'name': 'Random Forest',
  'iteration': 'No normalization, Negative RMSE as score for best model',
  'best_parameters': {'feature_selection__estimator': RandomForestRegressor(),
   'feature_selection__estimator__max_depth': 10,
   'model__max_depth': 5,
   'model__n_estimators': 50},
  'MAPE_train': 0.20321107767931754,
  'RMSE_train': 1308.8964451377974,
  'MAE_train': 767.3968273926444,
  'R2_train': 0.8901986675068398,
  'MAPE_test': 0.2177156

In [None]:
metrics = pd.DataFrame(model_score)

## Feature analysis

It will only be made with the first iteration

In [50]:
grid_search_rf_5.best_params_, grid_search_xgb_5.best_params_

({'feature_selection__estimator': RandomForestRegressor(),
  'feature_selection__estimator__max_depth': None,
  'model__max_depth': 5,
  'model__n_estimators': 100},
 {'feature_selection__estimator': XGBRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=None, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=None, n_jobs=None,
               num_parallel_tree=None, random_state=None, ...),
  'feature_selection__estimator_

#### RF

In [54]:
rf_model_analysis = RandomForestClassifier(n_estimators=100,max_depth=5, random_state=42)
rf_model_analysis.fit(X_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [52]:
importances_rf = rf_model_analysis.feature_importances_
feature_names_rf = X_train.columns

# Create a DataFrame to display feature importances
feature_importance_rf_df = pd.DataFrame({'Feature': feature_names_rf, 'Importance': importances_rf})
feature_importance_rf_df = feature_importance_rf_df.sort_values(by='Importance', ascending=False)

# Print or visualize the top N features
print("RandomForest Feature Importances:")
print(feature_importance_rf_df.head(10))  # Display top 10 features

RandomForest Feature Importances:
   Feature  Importance
0    carat    0.184980
7        y    0.172042
6        x    0.171058
8        z    0.106201
2    color    0.093004
3  clarity    0.092104
4    depth    0.087895
5    table    0.066521
1      cut    0.026194


In [53]:
fig = go.Figure(go.Bar(
    x=feature_importance_rf_df['Importance'][:10],
    y=feature_importance_rf_df['Feature'][:10],
    orientation='h'
))

# Customize layout
fig.update_layout(
    title='RandomForest Feature Importances',
    xaxis_title='Importance',
    yaxis_title='Feature',
    yaxis_categoryorder = 'total ascending'  # Reverse the order of features
)

# Show plot
fig.show()

#### XGB

In [61]:
xgb_model_analysis = XGBRegressor(n_estimators=50, learning_rate=0.1, max_depth = 9, reg_alpha = 0, reg_lambda = 0.1 , random_state=42)
xgb_model_analysis.fit(X_train, y_train)

In [62]:
importances_xgb = xgb_model_analysis.feature_importances_
feature_names_xgb = X_train.columns

# Create a DataFrame to display feature importances
feature_importance_xgb_df = pd.DataFrame({'Feature': feature_names_xgb, 'Importance': importances_xgb})
feature_importance_xgb_df = feature_importance_xgb_df.sort_values(by='Importance', ascending=False)

# Print or visualize the top N features
print("RandomForest Feature Importances:")
print(feature_importance_xgb_df.head(10))  # Display top 10 features

RandomForest Feature Importances:
   Feature  Importance
0    carat    0.700336
7        y    0.162835
3  clarity    0.066117
2    color    0.037341
6        x    0.010616
8        z    0.009474
1      cut    0.005417
4    depth    0.004354
5    table    0.003511


In [64]:
fig = go.Figure(go.Bar(
    x=feature_importance_xgb_df['Importance'][:10],
    y=feature_importance_xgb_df['Feature'][:10],
    orientation='h'
))

# Customize layout
fig.update_layout(
    title='XGB Feature Importances',
    xaxis_title='Importance',
    yaxis_title='Feature',
    yaxis_categoryorder = 'total ascending'  # Reverse the order of features
)

# Show plot
fig.show()

# Second iteration: change the score to select best model

In [65]:
pipeline_reg = Pipeline([
    ('feature_selection', SelectFromModel(LinearRegression())),
    ('model', LinearRegression())  
])

pipeline_rf = Pipeline([
    ('feature_selection', SelectFromModel(RandomForestRegressor())),
    ('model', RandomForestRegressor())
])

pipeline_xgb = Pipeline([
    ('feature_selection', SelectFromModel(XGBRegressor())),
    ('model', XGBRegressor())  
])

param_grid_reg= {
    'feature_selection__estimator': [LinearRegression()],
    'feature_selection__estimator__fit_intercept': [True, False],
    'model__fit_intercept': [True, False]
}

param_grid_rf = {
    'feature_selection__estimator': [RandomForestRegressor()],
    'feature_selection__estimator__max_depth': [None, 5, 10],
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 5, 10]
}

param_grid_xgb = {
    'feature_selection__estimator': [XGBRegressor()],
    'feature_selection__estimator__max_depth': [None, 1, 2, 3, 6, 9],
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.1, 0.3],
    'feature_selection__estimator__reg_alpha': [0, 0.1, 0.5],
    'feature_selection__estimator__reg_lambda': [0, 0.1, 0.5]
}


In [66]:
grid_search_reg_5 = GridSearchCV(pipeline_reg, param_grid_reg, cv=5, scoring='neg_mean_absolute_error')
grid_search_rf_5 = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, scoring='neg_mean_absolute_error')
grid_search_xgb_5 = GridSearchCV(pipeline_xgb, param_grid_xgb, cv=5, scoring='neg_mean_absolute_error')

In [67]:
grid_search_reg_5.fit(X_train, y_train)
grid_search_rf_5.fit(X_train, y_train)
grid_search_xgb_5.fit(X_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl

In [69]:
models = [grid_search_reg_5, grid_search_rf_5, grid_search_xgb_5]
model_names = ['Linear regression','Random Forest','XGBoost']
i = 0
model_score = []
for model in models:
    predictor = model.best_estimator_
    train_predictions = predictor.predict(X_train)
    test_predictions = predictor.predict(X_test)
    model_score.append({
        'name': model_names[i],
        'iteration': "No normalization, Negative MAE as score for best model",
        'best_parameters': model.best_params_,        
        'MAPE_train': mean_absolute_percentage_error(y_train, train_predictions),
        'RMSE_train':  np.sqrt(mean_squared_error(y_train.astype(float), train_predictions)),
        'MAE_train': mean_absolute_error(y_train, train_predictions),
        'R2_train': r2_score(y_train, train_predictions),
        'MAPE_test': mean_absolute_percentage_error(y_test, test_predictions),
        'RMSE_test':  np.sqrt(mean_squared_error(y_test, test_predictions)),
        'MAE_test': mean_absolute_error(y_test, test_predictions),
        'R2_test': r2_score(y_test, test_predictions)
    })
    i+=1

aux = pd.DataFrame(model_score)

In [74]:
metrics = pd.concat([metrics, aux]).reset_index(drop=True)
metrics

Unnamed: 0,name,iteration,best_parameters,MAPE_train,RMSE_train,MAE_train,R2_train,MAPE_test,RMSE_test,MAE_test,R2_test
0,Linear regression,"No normalization, Negative RMSE as score for b...",{'feature_selection__estimator': LinearRegress...,0.328985,1511.263118,953.095892,0.853622,0.324209,1578.092856,978.825497,0.85104
1,Random Forest,"No normalization, Negative RMSE as score for b...",{'feature_selection__estimator': RandomForestR...,0.203758,1308.946761,768.586595,0.89019,0.218341,1540.539696,874.428326,0.858045
2,XGBoost,"No normalization, Negative RMSE as score for b...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487
3,Linear regression,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': LinearRegress...,0.292532,1508.546527,922.305318,0.854147,0.289468,1568.435688,948.014313,0.852857
4,Random Forest,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': RandomForestR...,0.203515,1309.482802,767.748521,0.8901,0.218155,1539.515061,873.192321,0.858234
5,XGBoost,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487


# Third iteration: Normalize input

In [75]:
from sklearn.preprocessing import QuantileTransformer, StandardScaler

### Standard Scaler


In [84]:
pipeline_reg = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', SelectFromModel(LinearRegression())),
    ('model', LinearRegression())  
])

pipeline_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', SelectFromModel(RandomForestRegressor())),
    ('model', RandomForestRegressor())
])

pipeline_xgb = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', SelectFromModel(XGBRegressor())),
    ('model', XGBRegressor())  
])

param_grid_reg= {
    'feature_selection__estimator': [LinearRegression()],
    'feature_selection__estimator__fit_intercept': [True, False],
    'model__fit_intercept': [True, False]
}

param_grid_rf = {
    'feature_selection__estimator': [RandomForestRegressor()],
    'feature_selection__estimator__max_depth': [None, 5, 10],
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 5, 10]
}

param_grid_xgb = {
    'feature_selection__estimator': [XGBRegressor()],
    'feature_selection__estimator__max_depth': [None, 1, 2, 3, 6, 9],
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.1, 0.3],
    'feature_selection__estimator__reg_alpha': [0, 0.1, 0.5],
    'feature_selection__estimator__reg_lambda': [0, 0.1, 0.5]
}

In [85]:
grid_search_reg_5 = GridSearchCV(pipeline_reg, param_grid_reg, cv=5, scoring='neg_mean_absolute_error')
grid_search_rf_5 = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, scoring='neg_mean_absolute_error')
grid_search_xgb_5 = GridSearchCV(pipeline_xgb, param_grid_xgb, cv=5, scoring='neg_mean_absolute_error')

In [86]:
grid_search_reg_5.fit(X_train, y_train)
grid_search_rf_5.fit(X_train, y_train)
grid_search_xgb_5.fit(X_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl

In [87]:
models = [grid_search_reg_5, grid_search_rf_5, grid_search_xgb_5]
model_names = ['Linear regression','Random Forest','XGBoost']
i = 0
model_score = []
for model in models:
    predictor = model.best_estimator_
    train_predictions = predictor.predict(X_train)
    test_predictions = predictor.predict(X_test)
    model_score.append({
        'name': model_names[i],
        'iteration': "Standard Normalization, Negative MAE as score for best model",
        'best_parameters': model.best_params_,        
        'MAPE_train': mean_absolute_percentage_error(y_train, train_predictions),
        'RMSE_train':  np.sqrt(mean_squared_error(y_train.astype(float), train_predictions)),
        'MAE_train': mean_absolute_error(y_train, train_predictions),
        'R2_train': r2_score(y_train, train_predictions),
        'MAPE_test': mean_absolute_percentage_error(y_test, test_predictions),
        'RMSE_test':  np.sqrt(mean_squared_error(y_test, test_predictions)),
        'MAE_test': mean_absolute_error(y_test, test_predictions),
        'R2_test': r2_score(y_test, test_predictions)
    })
    i+=1

aux = pd.DataFrame(model_score)


In [88]:
metrics

Unnamed: 0,name,iteration,best_parameters,MAPE_train,RMSE_train,MAE_train,R2_train,MAPE_test,RMSE_test,MAE_test,R2_test
0,Linear regression,"No normalization, Negative RMSE as score for b...",{'feature_selection__estimator': LinearRegress...,0.328985,1511.263118,953.095892,0.853622,0.324209,1578.092856,978.825497,0.85104
1,Random Forest,"No normalization, Negative RMSE as score for b...",{'feature_selection__estimator': RandomForestR...,0.203758,1308.946761,768.586595,0.89019,0.218341,1540.539696,874.428326,0.858045
2,XGBoost,"No normalization, Negative RMSE as score for b...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487
3,Linear regression,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': LinearRegress...,0.292532,1508.546527,922.305318,0.854147,0.289468,1568.435688,948.014313,0.852857
4,Random Forest,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': RandomForestR...,0.203515,1309.482802,767.748521,0.8901,0.218155,1539.515061,873.192321,0.858234
5,XGBoost,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487


In [89]:
metrics = pd.concat([metrics, aux]).reset_index(drop=True)
metrics

Unnamed: 0,name,iteration,best_parameters,MAPE_train,RMSE_train,MAE_train,R2_train,MAPE_test,RMSE_test,MAE_test,R2_test
0,Linear regression,"No normalization, Negative RMSE as score for b...",{'feature_selection__estimator': LinearRegress...,0.328985,1511.263118,953.095892,0.853622,0.324209,1578.092856,978.825497,0.85104
1,Random Forest,"No normalization, Negative RMSE as score for b...",{'feature_selection__estimator': RandomForestR...,0.203758,1308.946761,768.586595,0.89019,0.218341,1540.539696,874.428326,0.858045
2,XGBoost,"No normalization, Negative RMSE as score for b...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487
3,Linear regression,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': LinearRegress...,0.292532,1508.546527,922.305318,0.854147,0.289468,1568.435688,948.014313,0.852857
4,Random Forest,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': RandomForestR...,0.203515,1309.482802,767.748521,0.8901,0.218155,1539.515061,873.192321,0.858234
5,XGBoost,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487
6,Linear regression,"Standard Normalization, Negative MAE as score ...",{'feature_selection__estimator': LinearRegress...,0.30868,1512.08446,938.883855,0.853462,0.298414,1566.270032,952.970628,0.853263
7,Random Forest,"Standard Normalization, Negative MAE as score ...",{'feature_selection__estimator': RandomForestR...,0.203312,1310.092883,767.669435,0.889998,0.217971,1540.685143,873.267207,0.858018
8,XGBoost,"Standard Normalization, Negative MAE as score ...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487


### Quantile Tranformation

In [90]:
pipeline_reg = Pipeline([
    ('scaler', QuantileTransformer(random_state=42, output_distribution='normal')),
    ('feature_selection', SelectFromModel(LinearRegression())),
    ('model', LinearRegression())  
])

pipeline_rf = Pipeline([
    ('scaler', QuantileTransformer(random_state=42, output_distribution='normal')),
    ('feature_selection', SelectFromModel(RandomForestRegressor())),
    ('model', RandomForestRegressor())
])

pipeline_xgb = Pipeline([
    ('scaler', QuantileTransformer(random_state=42, output_distribution='normal')),
    ('feature_selection', SelectFromModel(XGBRegressor())),
    ('model', XGBRegressor())  
])

param_grid_reg= {
    'feature_selection__estimator': [LinearRegression()],
    'feature_selection__estimator__fit_intercept': [True, False],
    'model__fit_intercept': [True, False]
}

param_grid_rf = {
    'feature_selection__estimator': [RandomForestRegressor()],
    'feature_selection__estimator__max_depth': [None, 5, 10],
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 5, 10]
}

param_grid_xgb = {
    'feature_selection__estimator': [XGBRegressor()],
    'feature_selection__estimator__max_depth': [None, 1, 2, 3, 6, 9],
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.1, 0.3],
    'feature_selection__estimator__reg_alpha': [0, 0.1, 0.5],
    'feature_selection__estimator__reg_lambda': [0, 0.1, 0.5]
}

In [91]:
grid_search_reg_5 = GridSearchCV(pipeline_reg, param_grid_reg, cv=5, scoring='neg_mean_absolute_error')
grid_search_rf_5 = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, scoring='neg_mean_absolute_error')
grid_search_xgb_5 = GridSearchCV(pipeline_xgb, param_grid_xgb, cv=5, scoring='neg_mean_absolute_error')

In [92]:
grid_search_reg_5.fit(X_train, y_train)
grid_search_rf_5.fit(X_train, y_train)
grid_search_xgb_5.fit(X_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl

In [93]:
models = [grid_search_reg_5, grid_search_rf_5, grid_search_xgb_5]
model_names = ['Linear regression','Random Forest','XGBoost']
i = 0
model_score = []
for model in models:
    predictor = model.best_estimator_
    train_predictions = predictor.predict(X_train)
    test_predictions = predictor.predict(X_test)
    model_score.append({
        'name': model_names[i],
        'iteration': "Quantile Normalization, Negative MAE as score for best model",
        'best_parameters': model.best_params_,        
        'MAPE_train': mean_absolute_percentage_error(y_train, train_predictions),
        'RMSE_train':  np.sqrt(mean_squared_error(y_train.astype(float), train_predictions)),
        'MAE_train': mean_absolute_error(y_train, train_predictions),
        'R2_train': r2_score(y_train, train_predictions),
        'MAPE_test': mean_absolute_percentage_error(y_test, test_predictions),
        'RMSE_test':  np.sqrt(mean_squared_error(y_test, test_predictions)),
        'MAE_test': mean_absolute_error(y_test, test_predictions),
        'R2_test': r2_score(y_test, test_predictions)
    })
    i+=1

aux = pd.DataFrame(model_score)

In [94]:
metrics = pd.concat([metrics, aux]).reset_index(drop=True)
metrics

Unnamed: 0,name,iteration,best_parameters,MAPE_train,RMSE_train,MAE_train,R2_train,MAPE_test,RMSE_test,MAE_test,R2_test
0,Linear regression,"No normalization, Negative RMSE as score for b...",{'feature_selection__estimator': LinearRegress...,0.328985,1511.263118,953.095892,0.853622,0.324209,1578.092856,978.825497,0.85104
1,Random Forest,"No normalization, Negative RMSE as score for b...",{'feature_selection__estimator': RandomForestR...,0.203758,1308.946761,768.586595,0.89019,0.218341,1540.539696,874.428326,0.858045
2,XGBoost,"No normalization, Negative RMSE as score for b...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487
3,Linear regression,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': LinearRegress...,0.292532,1508.546527,922.305318,0.854147,0.289468,1568.435688,948.014313,0.852857
4,Random Forest,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': RandomForestR...,0.203515,1309.482802,767.748521,0.8901,0.218155,1539.515061,873.192321,0.858234
5,XGBoost,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487
6,Linear regression,"Standard Normalization, Negative MAE as score ...",{'feature_selection__estimator': LinearRegress...,0.30868,1512.08446,938.883855,0.853462,0.298414,1566.270032,952.970628,0.853263
7,Random Forest,"Standard Normalization, Negative MAE as score ...",{'feature_selection__estimator': RandomForestR...,0.203312,1310.092883,767.669435,0.889998,0.217971,1540.685143,873.267207,0.858018
8,XGBoost,"Standard Normalization, Negative MAE as score ...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487
9,Linear regression,"Quantile Normalization, Negative MAE as score ...",{'feature_selection__estimator': LinearRegress...,0.920922,2120.931629,1575.253891,0.711696,0.903987,2117.777272,1594.924167,0.731734


In [96]:
metrics.sort_values(by=['R2_test'],ascending=False)

Unnamed: 0,name,iteration,best_parameters,MAPE_train,RMSE_train,MAE_train,R2_train,MAPE_test,RMSE_test,MAE_test,R2_test
2,XGBoost,"No normalization, Negative RMSE as score for b...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487
5,XGBoost,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487
8,XGBoost,"Standard Normalization, Negative MAE as score ...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487
11,XGBoost,"Quantile Normalization, Negative MAE as score ...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487
10,Random Forest,"Quantile Normalization, Negative MAE as score ...",{'feature_selection__estimator': RandomForestR...,0.20348,1308.792935,767.890131,0.890216,0.21796,1536.961143,872.850278,0.858703
4,Random Forest,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': RandomForestR...,0.203515,1309.482802,767.748521,0.8901,0.218155,1539.515061,873.192321,0.858234
1,Random Forest,"No normalization, Negative RMSE as score for b...",{'feature_selection__estimator': RandomForestR...,0.203758,1308.946761,768.586595,0.89019,0.218341,1540.539696,874.428326,0.858045
7,Random Forest,"Standard Normalization, Negative MAE as score ...",{'feature_selection__estimator': RandomForestR...,0.203312,1310.092883,767.669435,0.889998,0.217971,1540.685143,873.267207,0.858018
6,Linear regression,"Standard Normalization, Negative MAE as score ...",{'feature_selection__estimator': LinearRegress...,0.30868,1512.08446,938.883855,0.853462,0.298414,1566.270032,952.970628,0.853263
3,Linear regression,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': LinearRegress...,0.292532,1508.546527,922.305318,0.854147,0.289468,1568.435688,948.014313,0.852857


# Fourth iteration: Only use best features

In [97]:
pipeline_reg = Pipeline([
    ('scaler', QuantileTransformer(random_state=42, output_distribution='normal')),
    ('feature_selection', SelectFromModel(LinearRegression())),
    ('model', LinearRegression())  
])

pipeline_rf = Pipeline([
    ('scaler', QuantileTransformer(random_state=42, output_distribution='normal')),
    ('feature_selection', SelectFromModel(RandomForestRegressor())),
    ('model', RandomForestRegressor())
])

pipeline_xgb = Pipeline([
    ('scaler', QuantileTransformer(random_state=42, output_distribution='normal')),
    ('feature_selection', SelectFromModel(XGBRegressor())),
    ('model', XGBRegressor())  
])

param_grid_reg= {
    'feature_selection__estimator': [LinearRegression()],
    'feature_selection__estimator__fit_intercept': [True, False],
    'model__fit_intercept': [True, False]
}

param_grid_rf = {
    'feature_selection__estimator': [RandomForestRegressor()],
    'feature_selection__estimator__max_depth': [None, 5, 10],
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 5, 10]
}

param_grid_xgb = {
    'feature_selection__estimator': [XGBRegressor()],
    'feature_selection__estimator__max_depth': [None, 1, 2, 3, 6, 9],
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.1, 0.3],
    'feature_selection__estimator__reg_alpha': [0, 0.1, 0.5],
    'feature_selection__estimator__reg_lambda': [0, 0.1, 0.5]
}

In [98]:
grid_search_reg_5 = GridSearchCV(pipeline_reg, param_grid_reg, cv=5, scoring='neg_mean_absolute_error')
grid_search_rf_5 = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, scoring='neg_mean_absolute_error')
grid_search_xgb_5 = GridSearchCV(pipeline_xgb, param_grid_xgb, cv=5, scoring='neg_mean_absolute_error')

In [99]:
X_train.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [112]:
grid_search_reg_5.fit(X_train[['carat', 'y', 'x','z']], y_train)
grid_search_rf_5.fit(X_train[['carat', 'y', 'x','z']], y_train)
grid_search_xgb_5.fit(X_train[['carat', 'y', 'x','z']], y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl

In [113]:
models = [grid_search_reg_5, grid_search_rf_5, grid_search_xgb_5]
model_names = ['Linear regression','Random Forest','XGBoost']
i = 0
model_score = []
for model in models:
    predictor = model.best_estimator_
    train_predictions = predictor.predict(X_train[['carat', 'y', 'x','z']])
    test_predictions = predictor.predict(X_test[['carat', 'y', 'x','z']])
    model_score.append({
        'name': model_names[i],
        'iteration': "Best features, Quantile Normalization, Negative MAE as score for best model",
        'best_parameters': model.best_params_,        
        'MAPE_train': mean_absolute_percentage_error(y_train, train_predictions),
        'RMSE_train':  np.sqrt(mean_squared_error(y_train.astype(float), train_predictions)),
        'MAE_train': mean_absolute_error(y_train, train_predictions),
        'R2_train': r2_score(y_train, train_predictions),
        'MAPE_test': mean_absolute_percentage_error(y_test, test_predictions),
        'RMSE_test':  np.sqrt(mean_squared_error(y_test, test_predictions)),
        'MAE_test': mean_absolute_error(y_test, test_predictions),
        'R2_test': r2_score(y_test, test_predictions)
    })
    i+=1

aux = pd.DataFrame(model_score)
aux

Unnamed: 0,name,iteration,best_parameters,MAPE_train,RMSE_train,MAE_train,R2_train,MAPE_test,RMSE_test,MAE_test,R2_test
0,Linear regression,"Best features, Quantile Normalization, Negativ...",{'feature_selection__estimator': LinearRegress...,0.931776,2135.407268,1582.638407,0.707747,0.904987,2128.835068,1599.381937,0.728925
1,Random Forest,"Best features, Quantile Normalization, Negativ...",{'feature_selection__estimator': RandomForestR...,0.209001,1362.998529,793.990603,0.880934,0.223427,1558.703163,893.447572,0.854678
2,XGBoost,"Best features, Quantile Normalization, Negativ...",{'feature_selection__estimator': XGBRegressor(...,0.194881,1181.587351,699.959058,0.910519,0.219235,1582.370724,881.165193,0.850231


In [111]:
metrics = pd.concat([metrics, aux]).reset_index(drop=True)
metrics.sort_values(by=['R2_test'],ascending=False)

Unnamed: 0,name,iteration,best_parameters,MAPE_train,RMSE_train,MAE_train,R2_train,MAPE_test,RMSE_test,MAE_test,R2_test
2,XGBoost,"No normalization, Negative RMSE as score for b...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487
5,XGBoost,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487
8,XGBoost,"Standard Normalization, Negative MAE as score ...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487
11,XGBoost,"Quantile Normalization, Negative MAE as score ...",{'feature_selection__estimator': XGBRegressor(...,0.131714,804.14136,462.875985,0.958556,0.149803,1108.612609,593.908177,0.926487
16,Random Forest,"Best features, Quantile Normalization, Negativ...",{'feature_selection__estimator': RandomForestR...,0.203533,1308.690504,768.058685,0.890233,0.218198,1535.793958,872.866289,0.858918
10,Random Forest,"Quantile Normalization, Negative MAE as score ...",{'feature_selection__estimator': RandomForestR...,0.20348,1308.792935,767.890131,0.890216,0.21796,1536.961143,872.850278,0.858703
13,Random Forest,"Best features, Quantile Normalization, Negativ...",{'feature_selection__estimator': RandomForestR...,0.203308,1308.29229,767.526279,0.8903,0.218116,1537.796019,873.567228,0.85855
4,Random Forest,"No normalization, Negative MAE as score for be...",{'feature_selection__estimator': RandomForestR...,0.203515,1309.482802,767.748521,0.8901,0.218155,1539.515061,873.192321,0.858234
1,Random Forest,"No normalization, Negative RMSE as score for b...",{'feature_selection__estimator': RandomForestR...,0.203758,1308.946761,768.586595,0.89019,0.218341,1540.539696,874.428326,0.858045
7,Random Forest,"Standard Normalization, Negative MAE as score ...",{'feature_selection__estimator': RandomForestR...,0.203312,1310.092883,767.669435,0.889998,0.217971,1540.685143,873.267207,0.858018


In [108]:
metrics.to_csv("./results_best_models.csv")

# Save model

In [118]:
grid_search_xgb_5.best_params_

{'feature_selection__estimator': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...),
 'feature_selection__estimator__max_depth': 9,
 'feature_selection__estimator__reg_alpha': 0,
 'feature_selection__estimator__reg_lambda': 0.1,
 'model__learning_rate': 0.1,
 'model__n_estimators': 50}

In [120]:
model_score[2]

{'name': 'XGBoost',
 'iteration': 'No normalization, Negative RMSE as score for best model',
 'best_parameters': {'feature_selection__estimator': XGBRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=None, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=None, n_jobs=None,
               num_parallel_tree=None, random_state=None, ...),
  'feature_selection__estimator__max_depth': 9,
  'feature_selection__estimator__reg_a

In [121]:
predictor = grid_search_xgb_5.best_estimator_

In [122]:
import joblib

joblib.dump(predictor, '../model_files/xgb_model.pkl')

['../model_files/xgb_model.pkl']

Aportan info las caracteristicas, no mejora porque puede ser la particion