In [1]:
import numpy as np
import pandas as pd

In [82]:
# Read housing_pp2.csv, using its first column as the row index,
# then display the (rows, columns) shape of the resulting frame.
df = pd.read_csv("housing_pp2.csv", index_col=0)
df.shape

(2576, 81)

In [83]:
# Regression target: the SalePrice column.
y = df["SalePrice"]
# Predictors: everything except the target and the 'PID' column, as an independent copy.
X = df.drop(columns=["SalePrice", "PID"]).copy()

In [84]:
# Second dataset (housing_pp.csv), loaded the same way: first column becomes the index.
df2 = pd.read_csv("housing_pp.csv", index_col=0)

In [85]:
# Target / predictor split for the second dataset, mirroring the X / y split above.
y2 = df2["SalePrice"]
X2 = df2.drop(columns=["SalePrice", "PID"]).copy()

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PowerTransformer, QuantileTransformer

In [5]:
from sklearn.compose import make_column_selector as selector

# Selectors are callables: applied to a DataFrame they return the matching column names.
# Object-dtype columns are treated as categorical ...
categorical_columns_selector = selector(dtype_include=object)
# ... and every other dtype is treated as numerical.
numerical_columns_selector = selector(dtype_exclude=object)

In [8]:
# Resolve the selectors against X to get concrete column-name lists for the transformer.
categorical_columns = categorical_columns_selector(X)
numerical_columns = numerical_columns_selector(X)

In [6]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns are standardized.
numerical_preprocessor = StandardScaler()
# Categorical columns are one-hot encoded with the first level of each feature dropped.
# NOTE(review): per the OneHotEncoder docs, with drop set and handle_unknown="ignore" a
# category unseen during fit is encoded as all zeros, i.e. the same as the dropped
# first level -- confirm this is intended.
categorical_preprocessor = OneHotEncoder(drop="first", handle_unknown="ignore")

In [9]:
from sklearn.compose import ColumnTransformer
# Route each group of columns to its own preprocessing step:
# scaling for the numerical columns, one-hot encoding for the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
    ],
)

In [14]:
from sklearn.model_selection import train_test_split

# Hold out a test set using the library's default split proportions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [15]:
from sklearn import linear_model 
from sklearn import compose, pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
import warnings
warnings.filterwarnings('ignore')

In [28]:
# Lasso with a fixed, small L1 penalty, chained after the column preprocessor.
lasso = linear_model.Lasso(alpha=1e-4)
lasso_pipe = pipeline.make_pipeline(preprocessor, lasso)
# Wrap the pipeline so the target is passed through np.log before fitting and
# predictions are passed through np.exp on the way out.
lasso_regr = compose.TransformedTargetRegressor(
    regressor=lasso_pipe,
    func=np.log,
    inverse_func=np.exp,
)

In [29]:
# Fit on the training split, then report the estimator's score on both splits
# so the train/test gap is visible.
lasso_regr.fit(X_train, y_train)
print("train", lasso_regr.score(X_train, y_train))
print("test", lasso_regr.score(X_test, y_test))

train 0.9569066006609651
test 0.9326990516350205


In [38]:
# 10-fold cross-validation of the full estimator on all of X / y, scored with R^2;
# training-fold scores are kept alongside the test-fold scores.
scores = cross_validate(
    lasso_regr,
    X,
    y,
    scoring="r2",
    cv=10,
    return_train_score=True,
)

scores

{'fit_time': array([0.34397411, 0.38401699, 0.3893714 , 0.34048295, 0.3630774 ,
        0.43002629, 0.34903765, 0.41570282, 0.38502049, 0.33544588]),
 'score_time': array([0.01615787, 0.01669836, 0.02376103, 0.01820016, 0.01594281,
        0.018435  , 0.0138936 , 0.01502442, 0.0179708 , 0.01643848]),
 'test_score': array([0.94458271, 0.83840902, 0.93355105, 0.95472027, 0.94593148,
        0.94572645, 0.95201745, 0.93532131, 0.94392486, 0.94101715]),
 'train_score': array([0.95350853, 0.95443831, 0.95354274, 0.95311454, 0.95342821,
        0.95445115, 0.95255451, 0.95474214, 0.95425611, 0.95415984])}

In [34]:
# Average R^2 across the ten held-out folds.
np.mean(scores["test_score"])

0.9335201736995831

In [41]:
# 10-fold splitter that shuffles rows before splitting; the fixed random_state keeps
# the fold assignment repeatable between runs.
# (The previous `kf.get_n_splits(X)` call was dropped: it only returns the number of
# splits and its result was discarded, so it had no effect.)
kf = KFold(n_splits=10, shuffle=True, random_state=100)
kf

KFold(n_splits=10, random_state=100, shuffle=True)

In [42]:
# Same cross-validation as before, but using the shuffled KFold splitter `kf`
# instead of an integer fold count.
scores2 = cross_validate(
    lasso_regr,
    X,
    y,
    scoring="r2",
    cv=kf,
    return_train_score=True,
)

scores2

{'fit_time': array([0.35026765, 0.38734746, 0.37555766, 0.38536716, 0.3623271 ,
        0.3246038 , 0.31889272, 0.33012795, 0.36216426, 0.39217353]),
 'score_time': array([0.01668835, 0.01767159, 0.02613163, 0.01771808, 0.0167942 ,
        0.01921391, 0.01428485, 0.01979065, 0.01670456, 0.01483965]),
 'test_score': array([0.94152665, 0.93700377, 0.82091437, 0.93994981, 0.95642198,
        0.95559753, 0.93788955, 0.92857566, 0.94342085, 0.96104467]),
 'train_score': array([0.9537167 , 0.95461802, 0.95369298, 0.95435425, 0.95281349,
        0.95272707, 0.95521988, 0.95485861, 0.95357391, 0.95231499])}

In [43]:
# Average R^2 across the held-out folds of the shuffled cross-validation.
np.mean(scores2["test_score"])

0.9322344855549124

# Now try the grid search

In [45]:
from sklearn.pipeline import Pipeline

In [46]:
# Rebuild the pipeline with explicitly named steps and an untuned Lasso; the step name
# "Lasso" is what the grid-search key "regressor__Lasso__alpha" refers to.
lasso = linear_model.Lasso()
lasso_pipe = Pipeline(
    [
        ("Preprocess", preprocessor),
        ("Lasso", lasso),
    ]
)

In [73]:
# 100 evenly spaced candidate penalties from 1e-5 to 1e-3 (both endpoints included).
# NOTE(review): the param_grid defined further down lists its own values and does not
# reference this array -- confirm whether it was meant to be used.
alphas = np.linspace(start=1e-5, stop=1e-3, num=100)
alphas

array([1.0e-05, 2.0e-05, 3.0e-05, 4.0e-05, 5.0e-05, 6.0e-05, 7.0e-05,
       8.0e-05, 9.0e-05, 1.0e-04, 1.1e-04, 1.2e-04, 1.3e-04, 1.4e-04,
       1.5e-04, 1.6e-04, 1.7e-04, 1.8e-04, 1.9e-04, 2.0e-04, 2.1e-04,
       2.2e-04, 2.3e-04, 2.4e-04, 2.5e-04, 2.6e-04, 2.7e-04, 2.8e-04,
       2.9e-04, 3.0e-04, 3.1e-04, 3.2e-04, 3.3e-04, 3.4e-04, 3.5e-04,
       3.6e-04, 3.7e-04, 3.8e-04, 3.9e-04, 4.0e-04, 4.1e-04, 4.2e-04,
       4.3e-04, 4.4e-04, 4.5e-04, 4.6e-04, 4.7e-04, 4.8e-04, 4.9e-04,
       5.0e-04, 5.1e-04, 5.2e-04, 5.3e-04, 5.4e-04, 5.5e-04, 5.6e-04,
       5.7e-04, 5.8e-04, 5.9e-04, 6.0e-04, 6.1e-04, 6.2e-04, 6.3e-04,
       6.4e-04, 6.5e-04, 6.6e-04, 6.7e-04, 6.8e-04, 6.9e-04, 7.0e-04,
       7.1e-04, 7.2e-04, 7.3e-04, 7.4e-04, 7.5e-04, 7.6e-04, 7.7e-04,
       7.8e-04, 7.9e-04, 8.0e-04, 8.1e-04, 8.2e-04, 8.3e-04, 8.4e-04,
       8.5e-04, 8.6e-04, 8.7e-04, 8.8e-04, 8.9e-04, 9.0e-04, 9.1e-04,
       9.2e-04, 9.3e-04, 9.4e-04, 9.5e-04, 9.6e-04, 9.7e-04, 9.8e-04,
       9.9e-04, 1.0e-03])

In [77]:
# Candidate L1 penalties for the grid search, one per decade from 1e-1 down to 1e-6.
# The key addresses the `alpha` parameter of the pipeline step named "Lasso" inside the
# wrapped `regressor`.
# alpha=0 is intentionally not a candidate: the scikit-learn Lasso documentation advises
# against alpha=0 for numerical reasons (it reduces to ordinary least squares, for which
# LinearRegression should be used instead).
param_grid = {
    "regressor__Lasso__alpha": [.1, .01, .001, .0001, .00001, .000001]
}

In [62]:
# Wrap the named-step pipeline so the target goes through np.log before fitting and
# predictions go through np.exp afterwards; display the resulting estimator.
lasso_regr = compose.TransformedTargetRegressor(
    regressor=lasso_pipe,
    func=np.log,
    inverse_func=np.exp,
)
lasso_regr

In [78]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over param_grid with 10-fold cross-validation, using two parallel
# workers; fitting on the full X / y selects the best alpha.
search = GridSearchCV(
    estimator=lasso_regr,
    param_grid=param_grid,
    cv=10,
    n_jobs=2,
)
search.fit(X, y)

In [79]:
# Best mean cross-validated score, then the parameter setting that achieved it.
print(search.best_score_, search.best_params_, sep="\n")

0.9341246803641974
{'regressor__Lasso__alpha': 0.001}


In [86]:
# Re-run the same grid search on the second dataset. This refits the existing `search`
# object, so its best_score_ / best_params_ from the first dataset are replaced.
search.fit(X=X2, y=y2)

In [87]:
# Best mean cross-validated score and winning parameters for the second dataset.
print(search.best_score_, search.best_params_, sep="\n")

0.8834249274007423
{'regressor__Lasso__alpha': 0.01}
