# Basic Regression with sklearn

## Setup

In [1]:
# utils
from IPython.display import display

# data manipulation
import numpy as np
import pandas as pd

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

# data
from sklearn.datasets import make_regression

# modeling
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet

# performance
from sklearn.metrics import mean_squared_error

# set options
pd.set_option("max_columns", 60)

In [2]:
X, y, coef = make_regression(n_samples=1000, n_features=30, n_informative=30, random_state=0, coef=True)

In [3]:
df = pd.DataFrame(X)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,-1.264248,-0.244264,-0.278784,-0.767433,-0.671935,1.078734,0.112192,-0.826115,-1.228228,1.058188,1.02476,2.612479,-0.184346,-0.32024,0.424841,-2.10371,1.379503,-1.327718,-0.401036,0.710128,-0.075117,-0.141831,1.048305,0.57097,-1.631035,-0.131695,0.858342,0.63441,0.995936,0.405763
1,0.938585,-0.374126,0.152177,-1.019251,-1.058572,0.144106,1.239941,-0.509006,0.233458,1.925806,1.004419,-0.547356,-0.172459,-0.359117,0.410602,0.512562,-0.013451,0.792041,-0.63543,0.115665,0.633534,-0.550799,-0.608815,1.951761,-0.726366,0.48097,-0.226122,0.815472,0.85283,0.669562
2,0.313068,0.400157,1.764052,-0.854096,0.864436,1.532779,-0.205158,-0.742165,1.469359,0.333674,0.410599,-1.454366,1.454274,0.653619,1.867558,-2.55299,0.978738,-0.187184,0.950088,2.269755,0.761038,0.045759,0.121675,1.494079,0.144044,-0.977278,0.443863,2.240893,-0.151357,-0.103219
3,-1.389653,0.443729,1.417079,-0.078631,-0.494727,-1.247777,-0.245858,-1.672523,-0.177184,1.206265,-0.367859,-0.67231,-0.220107,-0.207132,-0.437457,0.979513,-0.076973,-0.900317,-2.924153,0.215002,-2.501424,-0.216854,-0.603821,-1.037853,0.137375,1.67401,0.726356,-1.091534,1.68253,-0.147113
4,-0.323417,0.49413,-0.348048,1.228573,1.283534,-0.451845,0.179905,-0.215307,-0.695814,-0.283548,-1.485289,1.064252,0.171403,0.495161,1.133165,-0.249483,-0.842771,0.625803,-1.998865,-1.565542,-0.1498,-0.446577,-1.716757,0.681017,1.16975,0.544585,0.207748,-1.092027,-0.513664,1.566981


In [4]:
df.mean().describe()

count    30.000000
mean     -0.004370
std       0.034647
min      -0.055655
25%      -0.029974
50%      -0.004965
75%       0.012623
max       0.074658
dtype: float64

In [5]:
y[:3]

array([-254.39414185,  290.4027336 ,  765.69313083])

In [6]:
coef

array([94.34866054, 86.47850207, 79.94907627, 46.49917981, 95.9850028 ,
       56.36727307, 25.69357988, 64.95937541, 72.80095115, 73.35527748,
        9.02187519, 17.24602994, 99.20688235, 25.58312342, 43.21101348,
       64.35689422, 62.10721952, 90.73674042, 26.15068806,  5.49210859,
       41.52606921, 53.45292154, 42.88284994, 14.00781566, 40.58265554,
       94.3032678 , 84.70484564, 95.1113458 , 23.39489465, 51.39913928])

In [7]:
mask = coef != 0

In [8]:
display(y[:3], np.matmul(X[:, mask], coef[mask])[:3])

array([-254.39414185,  290.4027336 ,  765.69313083])

array([-254.39414185,  290.4027336 ,  765.69313083])

In [9]:
np.isclose(np.matmul(X[:, mask], coef[mask]), y, rtol=.1).all()

True

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
display(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(800, 30)

(200, 30)

(800,)

(200,)

In [11]:
pca = PCA(svd_solver="full")
pca_fit = pca.fit(X_train)

In [12]:
n_components = (pca_fit.explained_variance_ratio_.cumsum() < 0.8).sum()

In [13]:
pca = PCA(n_components=n_components, svd_solver="full")
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

display(X_train_pca.shape, X_test_pca.shape)

(800, 22)

(200, 22)

In [61]:
# Set up possible values of parameters to optimize over
p_grid = {"n_estimators": [20, 100, 200],
          "max_depth": [2, 4, 6],
          "learning_rate": [0.001, 0.01, 0.1, 0.5]}

gbr = GradientBoostingRegressor()

inner_cv = KFold(n_splits=5, shuffle=True, random_state=0)

# Non_nested parameter search and scoring
gs = GridSearchCV(estimator=gbr, param_grid=p_grid,
                   cv=inner_cv, scoring="neg_mean_squared_error", 
                  n_jobs=-2, return_train_score=True, 
                  verbose=1)
gs.fit(X_train, y_train)

# Save results of grid search
gs_res = pd.DataFrame(gs.cv_results_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [62]:
gs_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.095665,0.003592,0.001597,0.00174,0.001,2,20,"{'learning_rate': 0.001, 'max_depth': 2, 'n_es...",-136748.534495,-120804.449833,-97459.445646,-113972.774153,-137739.446317,-121344.930089,15041.332123,36,-116664.629304,-120794.774432,-126559.244164,-122412.133657,-116562.490795,-120598.65447,3758.270095
1,0.47514,0.015867,0.001196,0.000399,0.001,2,100,"{'learning_rate': 0.001, 'max_depth': 2, 'n_es...",-134835.677328,-119349.98938,-96378.993351,-112257.557149,-136684.677389,-119901.37892,14944.878996,33,-113901.768767,-117831.530614,-123426.827093,-119373.336824,-113593.917703,-117625.4762,3657.119313
2,0.927157,0.016027,0.001199,0.000403,0.001,2,200,"{'learning_rate': 0.001, 'max_depth': 2, 'n_es...",-132609.883931,-117372.690411,-94691.062343,-110358.823998,-135192.107537,-118044.913644,14907.010744,32,-110738.784473,-114353.185907,-119702.935197,-115867.747394,-110071.872111,-114146.905017,3524.045984
3,0.167152,0.004869,0.0002,0.000399,0.001,4,20,"{'learning_rate': 0.001, 'max_depth': 4, 'n_es...",-136238.256385,-119920.847384,-97047.01555,-113523.550895,-137239.178427,-120793.769728,15010.063589,35,-115600.370541,-119605.373693,-125348.775699,-121190.781689,-115546.477231,-119458.355771,3685.289799
4,0.819609,0.017152,0.001199,0.000397,0.001,4,100,"{'learning_rate': 0.001, 'max_depth': 4, 'n_es...",-132341.839782,-116503.602446,-94352.505513,-109928.309797,-134593.209763,-117543.89346,14876.681059,30,-108636.274504,-112338.124587,-117532.11469,-113712.250684,-108903.069007,-112224.366694,3295.583478


In [63]:
gs_res.iloc[gs_res.mean_test_score.idxmax()]

mean_fit_time                                                   1.180245
std_fit_time                                                    0.004397
mean_score_time                                                 0.001196
std_score_time                                                  0.000398
param_learning_rate                                                  0.5
param_max_depth                                                        2
param_n_estimators                                                   200
params                 {'learning_rate': 0.5, 'max_depth': 2, 'n_esti...
split0_test_score                                          -26185.100038
split1_test_score                                           -26455.28671
split2_test_score                                          -28291.876035
split3_test_score                                          -20899.959445
split4_test_score                                          -33993.567612
mean_test_score                                    

In [59]:
display(gs.best_params_, gs.best_score_, gs.scorer_)

{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}

-35286.884219059044

make_scorer(mean_squared_error, greater_is_better=False)

In [60]:
mean_squared_error(y_test, gs.predict(X_test_pca))

47490.07670486927

In [19]:
mean_squared_error(y_test, np.ones_like(y_test)*y.mean())

122236.0734452304

In [50]:
# Set up possible values of parameters to optimize over
p_grid = {"alpha": [1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0],
          "l1_ratio": [0, .1, .5, 1]}

enr = ElasticNet(max_iter=1e4)

inner_cv = KFold(n_splits=10, shuffle=True, random_state=0)

# Non_nested parameter search and scoring
gs = GridSearchCV(estimator=enr, param_grid=p_grid,
                   cv=inner_cv, scoring="neg_mean_squared_error", 
                  n_jobs=-2, return_train_score=True, 
                  verbose=1)
gs.fit(X_train, y_train)

# Save results of grid search
gs_res = pd.DataFrame(gs.cv_results_)

Fitting 10 folds for each of 28 candidates, totalling 280 fits


  self.best_estimator_.fit(X, y, **fit_params)
  model = cd_fast.enet_coordinate_descent(


In [51]:
gs_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.011572,0.006545,0.000697,0.000456,0.0001,0.0,"{'alpha': 0.0001, 'l1_ratio': 0}",-0.001314929,-0.002103,-0.001192935,-0.001367,-0.000941,-0.001094,-0.001378,-0.00137,-0.001613955,-0.001482,-0.001386,0.0003,9,-0.001234613,-0.001383,-0.001225374,-0.001259,-0.001203,-0.001220051,-0.001294,-0.001253,-0.001253381,-0.001289,-0.001261,4.9e-05
1,0.002094,0.000537,0.000798,0.000399,0.0001,0.1,"{'alpha': 0.0001, 'l1_ratio': 0.1}",-0.001069837,-0.001723,-0.0009696344,-0.001112,-0.000764,-0.00089,-0.001126,-0.001116,-0.001312589,-0.001207,-0.001129,0.000248,8,-0.001003919,-0.001133,-0.0009963895,-0.001026,-0.000978,-0.0009923864,-0.001058,-0.00102,-0.001018923,-0.001053,-0.001028,4.3e-05
2,0.001795,0.000746,0.0005,0.0005,0.0001,0.5,"{'alpha': 0.0001, 'l1_ratio': 0.5}",-0.000342044,-0.000583,-0.0003076801,-0.000356,-0.000241,-0.000285,-0.000374,-0.000361,-0.0004183281,-0.000392,-0.000366,8.8e-05,7,-0.0003195438,-0.000385,-0.0003171295,-0.000334,-0.000312,-0.0003166974,-0.000352,-0.000328,-0.0003236732,-0.000347,-0.000334,2.1e-05
3,0.001296,0.000459,0.000698,0.000457,0.0001,1.0,"{'alpha': 0.0001, 'l1_ratio': 1}",-7.325253e-07,-1.4e-05,-6.533537e-07,-8e-06,-3e-06,-1e-06,-7e-06,-2e-06,-8.787477e-07,-7e-06,-4e-06,4e-06,5,-5.866001e-07,-1.3e-05,-6.947711e-07,-8e-06,-2e-06,-8.825553e-07,-6e-06,-1e-06,-5.484332e-07,-8e-06,-4e-06,4e-06
4,0.297304,0.023467,0.0005,0.0005,0.001,0.0,"{'alpha': 0.001, 'l1_ratio': 0}",-0.1296119,-0.194954,-0.1185317,-0.135898,-0.094787,-0.107699,-0.130469,-0.133043,-0.159982,-0.145122,-0.13501,0.026474,13,-0.1222856,-0.128121,-0.1213923,-0.122721,-0.119117,-0.1205157,-0.122538,-0.12265,-0.1244557,-0.123571,-0.122737,0.002296


In [52]:
display(gs.best_params_, gs.best_score_, gs.scorer_)

{'alpha': 0.0, 'l1_ratio': 0}

-3.6741561717018975e-06

make_scorer(mean_squared_error, greater_is_better=False)

In [53]:
gs_res.iloc[gs_res.mean_test_score.idxmax()]

mean_fit_time                              0.002393
std_fit_time                               0.002647
mean_score_time                            0.000999
std_score_time                             0.000004
param_alpha                                     0.0
param_l1_ratio                                    0
params                {'alpha': 0.0, 'l1_ratio': 0}
split0_test_score                              -0.0
split1_test_score                         -0.000012
split2_test_score                              -0.0
split3_test_score                         -0.000008
split4_test_score                         -0.000003
split5_test_score                         -0.000001
split6_test_score                         -0.000006
split7_test_score                         -0.000001
split8_test_score                              -0.0
split9_test_score                         -0.000006
mean_test_score                           -0.000004
std_test_score                             0.000004
rank_test_sc

In [55]:
mean_squared_error(y_test, gs.predict(X_test))

8.537134574496583e-06

In [25]:
mean_squared_error(y_test, np.ones_like(y_test)*y.mean())

122236.0734452304

In [34]:
display(y_test[:10].round(0), gs.predict(X_test_pca)[:10].round(0))

array([ 387.,   -7.,  -40.,  -34.,  135., -365.,   41., -188.,   10.,
        243.])

array([ 318.,   29., -306.,   -0.,  192., -230.,  158.,   21., -148.,
         69.])

In [26]:
from sklearn.linear_model import LinearRegression

In [30]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [31]:
mean_squared_error(y_test, lr.predict(X_test))

2.0446959118966538e-25

In [64]:
gbr = GradientBoostingRegressor(learning_rate=.01, max_depth=2, n_estimators=1000)
gbr.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.01, max_depth=2, n_estimators=1000)

In [65]:
mean_squared_error(y_test, gbr.predict(X_test))

40466.926977231626