In [1]:
import numpy as np
import pandas as pd

In [89]:
df = pd.read_csv('housing_pp2.csv', index_col=0)
df.shape

(2579, 81)

In [83]:
y = df.SalePrice
X = df.drop(['SalePrice', 'PID'], axis =1).copy()

In [84]:
df2 = pd.read_csv('housing_pp.csv', index_col=0)

In [85]:
y2 = df2.SalePrice
X2 = df2.drop(['SalePrice', 'PID'], axis =1).copy()

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PowerTransformer, QuantileTransformer

In [5]:
from sklearn.compose import make_column_selector as selector

numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

In [101]:
numerical_columns = numerical_columns_selector(X3)
categorical_columns = categorical_columns_selector(X3)

In [102]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", drop= 'first')
numerical_preprocessor = StandardScaler()

In [103]:
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    [
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
    ]
)

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42
)

In [15]:
from sklearn import linear_model 
from sklearn import compose, pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
import warnings
warnings.filterwarnings('ignore')

In [142]:
lasso = linear_model.Lasso(alpha = 0.0001)
lasso_pipe = pipeline.make_pipeline(preprocessor, lasso)
lasso_regr = compose.TransformedTargetRegressor(regressor= lasso_pipe,
                                                func=np.log, inverse_func=np.exp)

In [143]:
lasso_regr.fit(X_train, y_train)
print('train', lasso_regr.score(X_train, y_train))
print('test', lasso_regr.score(X_test, y_test))

ValueError: A given column is not a column of the dataframe

In [130]:
scores = cross_validate(lasso_regr, X3, y3, cv=10,
                        scoring=('r2'),
                        return_train_score=True)

scores

{'fit_time': array([0.07233953, 0.06416726, 0.07290483, 0.06693125, 0.07445621,
        0.06675601, 0.06865144, 0.07854128, 0.07520986, 0.06830645]),
 'score_time': array([0.00992417, 0.0161407 , 0.02554345, 0.0163126 , 0.01705074,
        0.01834893, 0.02164483, 0.02257133, 0.01697111, 0.01685715]),
 'test_score': array([-0.04146559, -0.05348027, -0.04638392, -0.03457991, -0.06846534,
        -0.06648949, -0.00328911, -0.00502418, -0.02160837, -0.00729967]),
 'train_score': array([-0.03001796, -0.02977234, -0.03000571, -0.0307179 , -0.03092543,
        -0.03076134, -0.03115498, -0.0310065 , -0.03072728, -0.03115376])}

In [106]:
scores['test_score'].mean()

0.8835823195168111

In [179]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf.get_n_splits(X)
kf

KFold(n_splits=5, random_state=42, shuffle=True)

In [184]:
scores2 = cross_validate(lasso_regr, X3, y3, cv=kf,
                        scoring=('r2'),
                        return_train_score=True)

scores2

{'fit_time': array([0.27703023, 0.24295402, 0.25887561, 0.26721382, 0.26637721]),
 'score_time': array([0.0166254 , 0.01514053, 0.01735711, 0.01671958, 0.01664615]),
 'test_score': array([0.94777668, 0.94873649, 0.94873863, 0.83342114, 0.9464846 ]),
 'train_score': array([0.95452648, 0.95578283, 0.9556384 , 0.95594767, 0.95572235])}

In [185]:
scores2['test_score'].mean()

0.9250315093472679

In [150]:
df3

Unnamed: 0,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,...,PoolYN,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,GarageYN
0,909176150,856,126000,30,RL,62.472926,7890,Pave,0.0,1,...,no,0.0,,,0,3,2010,WD,Normal,yes
1,905476230,1049,139500,120,RL,42.000000,4235,Pave,0.0,1,...,no,0.0,,,0,2,2009,WD,Normal,yes
2,911128020,1001,124900,30,C (all),60.000000,6060,Pave,0.0,1,...,no,0.0,,,0,11,2007,WD,Normal,yes
3,535377150,1039,114000,70,RL,80.000000,8146,Pave,0.0,1,...,no,0.0,,,0,5,2009,WD,Normal,yes
4,534177230,1665,227000,60,RL,70.000000,8400,Pave,0.0,1,...,no,0.0,,,0,11,2009,WD,Normal,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2575,903205040,952,121000,30,RL,70.105866,8854,Pave,0.0,1,...,no,0.0,,,0,5,2009,WD,Normal,yes
2576,905402060,1733,139600,20,RL,108.318076,13680,Pave,0.0,2,...,no,0.0,,,0,6,2009,WD,Normal,yes
2577,909275030,2002,145000,90,RH,82.000000,6270,Pave,0.0,1,...,no,0.0,,,0,8,2007,WD,Normal,yes
2578,907192040,1842,217500,60,RL,69.884162,8826,Pave,0.0,1,...,no,0.0,,,0,7,2007,WD,Normal,yes


# Now try the grid search

In [113]:
from sklearn.pipeline import Pipeline

In [114]:
lasso = linear_model.Lasso()
lasso_pipe = Pipeline(steps=[("Preprocess", preprocessor), ("Lasso", lasso)])

In [192]:
alphas = np.linspace(193, , 20)
alphas

array([166.        , 171.47368421, 176.94736842, 182.42105263,
       187.89473684, 193.36842105, 198.84210526, 204.31578947,
       209.78947368, 215.26315789, 220.73684211, 226.21052632,
       231.68421053, 237.15789474, 242.63157895, 248.10526316,
       253.57894737, 259.05263158, 264.52631579, 270.        ])

In [170]:
param_grid = {
    "regressor__Lasso__alpha": [0, .1, .01, .001, .0001, .00001, .000001]
}

In [171]:
lasso_regr = compose.TransformedTargetRegressor(regressor= lasso_pipe,
                                                func=np.log, inverse_func=np.exp)
lasso_regr

In [173]:
from sklearn.model_selection import GridSearchCV
search = GridSearchCV(lasso_regr, param_grid, n_jobs=2, cv = kf)
#search.fit(X3, y3)
search

In [120]:
print(search.best_score_)
print(search.best_params_)

0.8895404334643274
{'regressor__Lasso__alpha': 0.001}


In [86]:
search.fit(X2, y2)

In [87]:
print(search.best_score_)
print(search.best_params_)

0.8834249274007423
{'regressor__Lasso__alpha': 0.01}


In [161]:
df3 = pd.read_csv('housing_partial_abnormal.csv', index_col=0)
df3.shape

(2554, 81)

In [138]:
df4 = pd.read_csv('housing_normal.csv', index_col=0)
df4.shape

(2413, 81)

In [139]:
y4 = df4.SalePrice
X4 = df4.drop(['SalePrice', 'PID'], axis =1).copy()

In [162]:
y3 = df3.SalePrice
X3 = df3.drop(['SalePrice', 'PID'], axis =1).copy()

In [174]:
search.fit(X3, y3)

ValueError: Invalid parameter 'Lasso' for estimator Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standard_scaler',
                                                  StandardScaler(),
                                                  ['GrLivArea', 'MSSubClass',
                                                   'LotFrontage', 'LotArea',
                                                   'Alley', 'LotShape',
                                                   'LandContour', 'Utilities',
                                                   'LandSlope', 'OverallQual',
                                                   'OverallCond', 'YearBuilt',
                                                   'YearRemodAdd', 'MasVnrArea',
                                                   'ExterQual', 'ExterCond',
                                                   'BsmtQual', 'BsmtCond',
                                                   'BsmtExposure',
                                                   'BsmtFinT...
                                                                handle_unknown='ignore'),
                                                  ['MSZoning', 'Street',
                                                   'LotConfig', 'Neighborhood',
                                                   'Condition1', 'Condition2',
                                                   'BldgType', 'HouseStyle',
                                                   'RoofStyle', 'RoofMatl',
                                                   'Exterior1st', 'Exterior2nd',
                                                   'MasVnrType', 'Foundation',
                                                   'Heating', 'CentralAir',
                                                   'Electrical', 'GarageType',
                                                   'PoolYN', 'Fence',
                                                   'MiscFeature', 'SaleType',
                                                   'SaleCondition',
                                                   'GarageYN'])])),
                ('lasso', Lasso(alpha=0.0001))]). Valid parameters are: ['memory', 'steps', 'verbose'].

In [166]:
print(search.best_score_)
print(search.best_params_)

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'