In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [3]:
diamonds_df = sns.load_dataset("diamonds")
X = diamonds_df[["carat", "depth", "table", "x", "y", "z"]]
y = diamonds_df["price"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
X_train.sample(5)

Unnamed: 0,carat,depth,table,x,y,z
2317,0.7,62.6,55.0,5.66,5.69,3.55
2607,0.9,60.5,58.0,6.22,6.27,3.78
24034,0.29,62.7,61.0,4.2,4.22,2.64
50715,0.67,62.3,62.0,5.5,5.59,3.46
44802,0.5,61.3,58.0,5.06,5.12,3.12


#### Ridge Regression
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html

In [23]:
ridge = Ridge(normalize=True)

In [28]:
hyperparameters = {"alpha":np.linspace(0, 0.03, 10)}

In [29]:
grid = GridSearchCV(ridge, hyperparameters, cv=5, verbose=True, scoring="neg_root_mean_squared_error")

In [30]:
grid.fit(X_train, y_train.values)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


GridSearchCV(cv=5, estimator=Ridge(normalize=True),
             param_grid={'alpha': array([0.        , 0.00333333, 0.00666667, 0.01      , 0.01333333,
       0.01666667, 0.02      , 0.02333333, 0.02666667, 0.03      ])},
             scoring='neg_root_mean_squared_error', verbose=True)

In [31]:
pd.DataFrame(np.abs(grid.cv_results_["mean_test_score"]), columns=["Loss"], index = hyperparameters["alpha"])

Unnamed: 0,Loss
0.0,1546.244133
0.003333,1511.026975
0.006667,1515.21154
0.01,1521.518457
0.013333,1528.44663
0.016667,1535.592196
0.02,1542.740017
0.023333,1549.760959
0.026667,1556.578374
0.03,1563.149323


In [32]:
y_pred = grid.predict(X_test)

In [33]:
print("RMSE: ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print("MAE: ", metrics.mean_absolute_error(y_test, y_pred))

RMSE:  1462.9903772059579
MAE:  906.3943411670763


## Advanced Model

In [34]:
rs = RobustScaler()
poly = PolynomialFeatures()
lasso  = Lasso()

pipe = Pipeline([
    ("poly", poly),
    ("rs",rs),
    ("lasso",lasso)
])

In [35]:
hyperparameter_grid = {
    "poly__degree": [1, 2, 3],
    "lasso__alpha": np.linspace(0,0.5,11)
}

In [36]:
grid = GridSearchCV(pipe, hyperparameter_grid, verbose=True, n_jobs=4,
                   scoring="neg_root_mean_squared_error")

In [37]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 33 candidates, totalling 165 fits


GridSearchCV(estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                       ('rs', RobustScaler()),
                                       ('lasso', Lasso())]),
             n_jobs=4,
             param_grid={'lasso__alpha': array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ]),
                         'poly__degree': [1, 2, 3]},
             scoring='neg_root_mean_squared_error', verbose=True)

In [38]:
grid_df = pd.DataFrame(grid.cv_results_["params"])

In [40]:
grid_df

Unnamed: 0,lasso__alpha,poly__degree
0,0.0,1
1,0.0,2
2,0.0,3
3,0.05,1
4,0.05,2
5,0.05,3
6,0.1,1
7,0.1,2
8,0.1,3
9,0.15,1


In [41]:
grid_df["loss"] = -grid.cv_results_["mean_test_score"]
grid_df

Unnamed: 0,lasso__alpha,poly__degree,loss
0,0.0,1,1544.441247
1,0.0,2,2555.218909
2,0.0,3,7115.220698
3,0.05,1,1542.20872
4,0.05,2,2480.738685
5,0.05,3,6772.066501
6,0.1,1,1540.041891
7,0.1,2,2404.025072
8,0.1,3,6420.602595
9,0.15,1,1537.933589


In [42]:
grid.best_params_

{'lasso__alpha': 0.5, 'poly__degree': 1}