In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn import model_selection
from sklearn import pipeline
from sklearn import preprocessing
from sklearn import impute
from sklearn import compose
from sklearn import linear_model
from sklearn import metrics
from sklearn import set_config

# img quality
mpl.rcParams["figure.dpi"] = 150

In [4]:
train = pd.read_csv("./sk_learn_pipes_net_code/data/train.csv")
X_test = pd.read_csv("./sk_learn_pipes_net_code/data/test.csv")

In [3]:
train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [6]:
X = train.drop(['SalePrice'], axis=1)
y = train.SalePrice

X_train, x_valid, y_train, y_valid = model_selection.train_test_split(X, y, test_size=.3)

In [8]:
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1022.0,733.670254,419.523287,1.0,375.25,724.0,1098.25,1460.0
MSSubClass,1022.0,56.981409,42.874354,20.0,20.0,50.0,70.0,190.0
LotFrontage,837.0,70.216249,25.293726,21.0,59.0,70.0,80.0,313.0
LotArea,1022.0,10728.437378,11314.324978,1300.0,7685.75,9501.5,11700.0,215245.0
OverallQual,1022.0,6.08317,1.375571,1.0,5.0,6.0,7.0,10.0
OverallCond,1022.0,5.59002,1.116705,1.0,5.0,5.0,6.0,9.0
YearBuilt,1022.0,1970.963796,29.801755,1872.0,1954.0,1972.0,2000.0,2010.0
YearRemodAdd,1022.0,1984.755382,20.704744,1950.0,1967.0,1993.0,2004.0,2010.0
MasVnrArea,1016.0,104.475394,180.067632,0.0,0.0,0.0,165.25,1378.0
BsmtFinSF1,1022.0,448.036204,462.712739,0.0,0.0,392.0,718.25,5644.0


In [10]:
X_train.describe(include=object).T

Unnamed: 0,count,unique,top,freq
MSZoning,1022,5,RL,805
Street,1022,2,Pave,1016
Alley,60,2,Grvl,31
LotShape,1022,4,Reg,650
LandContour,1022,4,Lvl,915
Utilities,1022,2,AllPub,1021
LotConfig,1022,5,Inside,746
LandSlope,1022,3,Gtl,967
Neighborhood,1022,25,NAmes,167
Condition1,1022,9,Norm,886


In [12]:
above_0_missing = X_train.isnull().sum() > 0

X_train.isnull().sum()[above_0_missing]

LotFrontage      185
Alley            962
MasVnrType         6
MasVnrArea         6
BsmtQual          24
BsmtCond          24
BsmtExposure      24
BsmtFinType1      24
BsmtFinType2      24
Electrical         1
FireplaceQu      492
GarageType        54
GarageYrBlt       54
GarageFinish      54
GarageQual        54
GarageCond        54
PoolQC          1019
Fence            831
MiscFeature      984
dtype: int64

In [19]:
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

In [18]:
# let's preprocessing our data with a pipeline
numeric_pipeline = pipeline.Pipeline(
    steps=[
        ('impute', impute.SimpleImputer(strategy='mean')),
        ('scale', preprocessing.MinMaxScaler())
    ]
)

categorical_pipeline = pipeline.Pipeline(
    steps=[
        ('impute', impute.SimpleImputer(strategy='most_frequent')),
        ('one-hot', preprocessing.OneHotEncoder(handle_unknown='ignore', sparse=False))
    ]
)


sklearn.pipeline.Pipeline class takes a tuple of transformers for its steps argument. Each tuple should have this pattern:

- `('name_of_transformer', transformer)`

These two pipelines are useless if we don't tell which columns they should be applied to. For that, we will use another transformer, `ColumnTransformer`.

In [22]:
full_processor = compose.ColumnTransformer(
    transformers=[ 
        ('number', numeric_pipeline, numerical_features),
        ('category', categorical_pipeline, categorical_features)
    ]
)

Let's combine our preprocessing steps, with a modeling phase too.

In [24]:
lasso = linear_model.Lasso(alpha=0.1)

lasso_pipeline = pipeline.Pipeline(
    steps=[ 
        ('preprocess', full_processor),
        ('model', lasso)  # estimator
    ]
)

In [27]:
set_config(display='diagram')
lasso_pipeline

In [29]:
_ = lasso_pipeline.fit(X_train, y_train)
preds = lasso_pipeline.predict(x_valid)
print(metrics.mean_absolute_error(y_valid, preds))

print(lasso_pipeline.score(x_valid, y_valid))

19032.971981269493
0.8387400317987823


  model = cd_fast.enet_coordinate_descent(


In [32]:
# let's use our pipeline to tune the alpha param
param_dict = {'model__alpha': np.arange(1, 100, 5)}

search = model_selection.GridSearchCV(lasso_pipeline, param_dict, cv=10, scoring="neg_mean_absolute_error")

_ = search.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [33]:
print(f"Best score: {abs(search.best_score_)}")
print(f"Best alpha: {search.best_params_}")

Best score: 17835.994875673427
Best alpha: {'model__alpha': 66}


In [34]:
lasso = linear_model.Lasso(alpha=66)

lasso_pipeline = pipeline.Pipeline(
    steps=[ 
        ('preprocess', full_processor),
        ('model', lasso)  # estimator
    ]
)

In [35]:
_ = lasso_pipeline.fit(X_train, y_train)
preds = lasso_pipeline.predict(x_valid)
print(metrics.mean_absolute_error(y_valid, preds))

print(lasso_pipeline.score(x_valid, y_valid))

16934.97433072547
0.8905280588979911


In [37]:
preds_final = lasso_pipeline.predict(X_test)
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_final})
output.to_csv('./sk_learn_pipes_net_code/data/submission.csv', index=False)