In [1]:
import numpy as np 
import pandas as pd 

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

#import xgboost as xgb
#import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Baseline model: numeric columns only, passed through a default imputer,
# a robust scaler and an ordinary linear regression, scored by cross-validation.
# NOTE(review): only SalePrice is dropped here, so every other numeric column
# (including any identifier-like column) is used as a feature -- confirm intended.
df = pd.read_csv("train.csv")
y = df["SalePrice"]
X = df.select_dtypes("number").drop(columns="SalePrice")

pipe = make_pipeline(SimpleImputer(), RobustScaler(), LinearRegression())
baseline_scores = cross_val_score(pipe, X, y)
print(f"The R2 score is: {baseline_scores.mean():.4f}")

The R2 score is: 0.7855


In [8]:
# Target and feature matrix: SalePrice is the target; SalePrice and Id are
# both removed from the features.
y = df["SalePrice"]
X = df.drop(columns=["SalePrice", "Id"])

# Column groups routed to the two preprocessing branches below.
num_cols = X.select_dtypes("number").columns
cat_cols = df.select_dtypes("object").columns

# Numeric branch: fill in missing values, then scale.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer()),
    ("scaler", RobustScaler()),
])

# Categorical branch: fill in missing values, then one-hot encode.
# Categories not seen during fit are ignored at transform time.
# NOTE(review): per the scikit-learn docs, fill_value is only used with
# strategy='constant'; with 'most_frequent' it appears to have no effect --
# confirm which imputation behaviour was intended.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# The ColumnTransformer combines the two branches; each transformer receives
# the columns listed in num_cols or cat_cols respectively.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

# Full model: preprocessing followed by a linear regression.
# The final step is named 'classifier' even though the estimator is a
# regressor; the name is kept unchanged because other code may address it.
pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LinearRegression()),
])

full_scores = cross_val_score(pipe, X, y)
print(f"The R2 score is: {full_scores.mean():.4f}")

The R2 score is: 0.7944
