In [22]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
import sklearn.model_selection as skm
import sklearn.linear_model as skl
from sklearn.preprocessing import StandardScaler
from ISLP import load_data
from ISLP.models import ModelSpec as MS
from functools import partial


from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from ISLP.models import \
                (Stepwise,
                sklearn_selected,
                sklearn_selection_path,sklearn_sm)
from l0bnb import fit_path

# Load the College data set and discard any rows with missing values.
College = load_data('College').dropna()

-- Linear Regression --

In [23]:
# Hold out 512 rows for validation; the remaining rows form the training set.
College_train, College_valid = skm.train_test_split(College,
                                        test_size=512,
                                        random_state=1)

# Every column except the response `Apps` is used as a predictor.
allvars = College.columns.drop(['Apps'])
design = MS(allvars)

# Least-squares fit on the training rows only.
X_train = design.fit_transform(College_train)
y_train = College_train['Apps']
model = sm.OLS(y_train, X_train)
results = model.fit()

# Validation-set MSE. The design is only transformed here (not refit) so the
# validation rows are encoded exactly as the training rows were.
X_valid = design.transform(College_valid)
y_valid = College_valid['Apps']
valid_pred = results.predict(X_valid)
valid_mse = np.mean((y_valid - valid_pred)**2)

# Cross-validation on the full data with one fold per row
# (cv=College.shape[0], i.e. leave-one-out); cv_err is the mean fold score.
hp_model = sklearn_sm(sm.OLS, MS(allvars))
X, Y = College.drop(columns=['Apps']), College['Apps']
cv_results = skm.cross_validate(hp_model,X,Y,cv=College.shape[0])
cv_err = np.mean(cv_results['test_score'])

-- Linear regression test error --

In [30]:
# Mean of the per-fold test scores from the leave-one-out cross-validation of
# the OLS model (presumably an MSE, as scored by sklearn_sm — confirm).
cv_err

1276986.7007954565

-- Ridge Regression --

In [None]:
# Response vector and predictor matrix as NumPy arrays. The model spec is
# fitted once and then applied; its intercept column is dropped before scaling.
design = MS(College.columns.drop('Apps')).fit(College)
Y = np.array(College['Apps'])
D = design.transform(College).drop(columns='intercept')
X = np.asarray(D)

# Candidate penalties: 100 log-spaced values from 1e8 down to 1e-2, rescaled
# by the standard deviation of the response.
lambdas = 10**np.linspace(8, -2, 100) / Y.std()

# Predictors are centred and scaled inside the pipeline, so the scaler is
# refitted on each training portion only.
scaler = StandardScaler(with_mean=True, with_std=True)

# Outer split: a single random hold-out of 23% of the rows for testing.
outer_valid = skm.ShuffleSplit(
    n_splits=1,
    test_size=0.23,
    random_state=1,
)

# Inner split: 5 shuffled folds used to choose the penalty.
inner_cv = skm.KFold(n_splits=5, shuffle=True, random_state=2)

# l1_ratio=0 turns the elastic net into a pure ridge penalty.
ridgeCV = skl.ElasticNetCV(
    alphas=lambdas,
    l1_ratio=0,
    cv=inner_cv,
)

pipeCV = Pipeline(steps=[('scaler', scaler), ('ridge', ridgeCV)])

# Tune on the training portion and score (negated MSE) on the held-out part.
results = skm.cross_validate(
    pipeCV,
    X,
    Y,
    cv=outer_valid,
    scoring='neg_mean_squared_error',
)

-- Ridge Regression test error --

In [31]:
# Held-out MSE: cross_validate was scored with 'neg_mean_squared_error', so the
# sign is flipped to show a positive MSE (one value, from the single split).
# NOTE(review): `results` is reassigned by several cells, so this shows
# whichever one ran last — confirm it is the ridge run.
-results['test_score']

array([683056.70168986])

-- Lasso Regression --

In [32]:
# Response vector and predictor matrix as NumPy arrays. The model spec is
# fitted once and then applied; its intercept column is dropped before scaling.
design = MS(College.columns.drop('Apps')).fit(College)
Y = np.array(College['Apps'])
D = design.transform(College).drop(columns='intercept')
X = np.asarray(D)

# Standardize predictors inside the pipeline; candidate penalties are 100
# log-spaced values from 1e8 down to 1e-2, rescaled by the response's std.
scaler = StandardScaler(with_mean=True, with_std=True)
lambdas = 10**np.linspace(8, -2, 100) / Y.std()

# Outer split: a single random hold-out of 25% of the rows for testing.
# NOTE(review): the ridge cell holds out 23% — confirm whether the two
# splits are meant to match before comparing the errors.
outer_valid = skm.ShuffleSplit(
    n_splits=1,
    test_size=0.25,
    random_state=1,
)

# Inner split: 5 shuffled folds used to choose the penalty.
inner_cv = skm.KFold(n_splits=5, shuffle=True, random_state=2)

# l1_ratio=1 turns the elastic net into a pure lasso penalty.
lassoCV = skl.ElasticNetCV(
    alphas=lambdas,
    l1_ratio=1,
    cv=inner_cv,
)

pipeCV = Pipeline(steps=[('scaler', scaler), ('lasso', lassoCV)])

# Fit on all rows to obtain the tuned lasso whose coefficients are inspected
# later; cross_validate below works on clones, so this fit is left untouched.
tuned_lasso = pipeCV.fit(X, Y).named_steps['lasso']

# Tune on the training portion and score (negated MSE) on the held-out part.
results = skm.cross_validate(
    pipeCV,
    X,
    Y,
    cv=outer_valid,
    scoring='neg_mean_squared_error',
)


-- Lasso Regression test error --

In [33]:
# Held-out MSE: cross_validate was scored with 'neg_mean_squared_error', so the
# sign is flipped to show a positive MSE (one value, from the single split).
# NOTE(review): the recorded output below is identical to the ridge cell's
# output, which suggests it is stale — re-run the lasso cell and this one.
-results['test_score']

array([683056.70168986])

-- Number of non-zero coefficient estimates --

In [34]:
# Coefficients of the lasso step from the pipeline fitted on all rows (the
# predictors pass through the pipeline's scaler first). The number of non-zero
# entries in this array is the count the heading asks for.
tuned_lasso.coef_

array([-2.20104552e+02,  3.88449204e+03, -8.17789624e+02,  8.80150745e+02,
       -2.81729194e+02,  2.78187455e+02,  6.76232779e+01, -3.45234195e+02,
        1.65524240e+02,  3.44800431e+00,  2.10466854e+01, -1.41612622e+02,
       -4.90035469e+01,  6.08782188e+01,  2.21254485e+00,  4.06499887e+02,
        1.48794108e+02])

-- PCR model --

In [37]:
# Five shuffled folds, shared by the component search and the null baseline.
kfold = skm.KFold(5,
                random_state=0,
                shuffle=True)

# PCR pipeline: standardize -> PCA -> least squares on the component scores.
# `scaler`, `X` and `Y` come from the preceding regression cells.
# (An earlier unscaled pca+linreg pipeline was fitted and then immediately
# overwritten by this one, so it has been removed.)
pca = PCA(n_components=2)
linreg = skl.LinearRegression()
pipe = Pipeline([('scaler', scaler),
                 ('pca', pca),
                 ('linreg', linreg)])
pipe.fit(X, Y)

# Search over the number of components. PCA cannot extract more components
# than there are predictors, so the grid stops at X.shape[1]; larger values
# would only produce failed fits inside the search.
# The search is only constructed here; grid.fit(X, Y) runs it.
param_grid = {'pca__n_components': range(1, X.shape[1] + 1)}
grid = skm.GridSearchCV(pipe,
                        param_grid,
                        cv=kfold,
                        scoring='neg_mean_squared_error')

# Null-model baseline: regressing on a single all-zero column leaves only the
# intercept, so cv_null is the CV error of predicting the training mean.
Xn = np.zeros((X.shape[0], 1))
cv_null = skm.cross_validate(linreg,
                             Xn,
                             Y,
                             cv=kfold,
                             scoring='neg_mean_squared_error')

-- PCR model test error --

In [None]:
# `cv_null` is the intercept-only baseline, not PCR, so it does not answer the
# heading. Run the component search (no earlier cell fits `grid`) and report
# the cross-validated MSE of the best number of components; scores are negated
# MSE, hence the sign flip.
-grid.fit(X, Y).best_score_