In [3]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder

In [4]:
# Read the two CSV splits from the working directory.
train = pd.read_csv('train (assignment_5).csv')
test = pd.read_csv('test (assignment_5).csv')

# Stack the predictors of both splits (train rows first, then test rows) so
# the cleaning steps below are applied to them together. The target column
# exists only in `train` and is removed before stacking.
train_features = train.drop(columns='SalePrice')
all_data = pd.concat([train_features, test], sort=False)

In [None]:
# Drop columns that are not used as predictors.
# NOTE(review): presumably 'Id' is a row identifier and 'Street'/'Utilities'
# carry almost no variation -- confirm against the data.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the columns are already gone is a no-op
# rather than a KeyError.
all_data = all_data.drop(columns=['Id', 'Street', 'Utilities'], errors='ignore')

In [7]:
# Impute missing values on the combined train+test frame.

# Numeric columns whose missing entries are replaced by 0.
# NOTE(review): presumably NaN here means "feature absent" -- confirm against
# the data dictionary.
zeros = ['BsmtFinSF1','BsmtFinSF2','BsmtFullBath','BsmtHalfBath',
         'TotalBsmtSF','MasVnrArea']
all_data[zeros] = all_data[zeros].fillna(0)

# LotFrontage: fill with the median of the rows sharing the same Neighborhood.
# A neighbourhood with no observed LotFrontage at all has a NaN median, so its
# rows would stay NaN; those fall back to the overall median instead.
lot_frontage = all_data.groupby('Neighborhood')['LotFrontage'].transform(
    lambda x: x.fillna(x.median()))
all_data['LotFrontage'] = lot_frontage.fillna(all_data['LotFrontage'].median())

# Object-dtype (categorical) columns: missing becomes the literal string 'None'.
object_cols = all_data.select_dtypes('object').columns
all_data[object_cols] = all_data[object_cols].fillna('None')

In [8]:
# Feature engineering

# 0/1 indicator columns: 1 where the source column is strictly positive.
presence_flags = {
    'HasGarage': 'GarageArea',
    'HasFireplace': 'Fireplaces',
    'HasPool': 'PoolArea',
    'Has2ndFlr': '2ndFlrSF',
}
for flag_name, source_col in presence_flags.items():
    all_data[flag_name] = all_data[source_col].gt(0).astype(int)

# TotalSF = GrLivArea + TotalBsmtSF
all_data['TotalSF'] = all_data['GrLivArea'] + all_data['TotalBsmtSF']

# TotalBaths = full baths + 0.5 * half baths, basement columns included.
full_baths = all_data['BsmtFullBath'] + all_data['FullBath']
half_baths = all_data['BsmtHalfBath'] + all_data['HalfBath']
all_data['TotalBaths'] = full_baths + 0.5*half_baths

In [9]:
# Neighborhood price tiers (0 = low, 1 = mid, 2 = high): neighbourhoods are
# ranked by their mean SalePrice in `train` and cut into three quantile bins.
neigh_mean = train.groupby('Neighborhood')['SalePrice'].mean()

# qcut with explicit labels returns a categorical. Mapping through it would
# give a category-dtype column, which is matched by neither
# select_dtypes(include=[np.number]) nor select_dtypes(include=['object']), so
# the ColumnTransformer built later would silently drop the feature. Casting
# the labels to int keeps the tier numeric.
tiers = pd.qcut(neigh_mean, 3, labels=[0,1,2]).astype(int)

neighborhood_tier = all_data['Neighborhood'].map(tiers)
# A neighbourhood that never occurs in `train` has no tier and maps to NaN;
# it is assigned the middle tier (1) so the column stays integer-valued.
# NOTE(review): middle-tier fallback is a policy choice -- confirm.
all_data['NeighborhoodTier'] = neighborhood_tier.fillna(1).astype(int)

In [None]:

# Log-transform skewed numeric features.
# Skewness is measured per numeric column (NaNs ignored); columns whose
# absolute skew exceeds 0.5 are replaced by log1p of their values.
numeric_feats = all_data.select_dtypes(include=[np.number]).columns
skewed = all_data[numeric_feats].apply(lambda x: stats.skew(x.dropna())).abs()
skewed_feats = skewed[skewed > 0.5].index

# log1p returns -inf at -1 and NaN below it, so only columns without negative
# values are transformed; any other skewed column is left untouched.
non_negative = all_data[skewed_feats].min() >= 0
skewed_feats = skewed_feats[non_negative.to_numpy()]

# NOTE: this cell is not idempotent -- running it twice applies log1p twice.
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

In [11]:
# Target on the log1p scale, taken from the original training frame.
y = train['SalePrice'].pipe(np.log1p)

# all_data holds the train rows first and the test rows after them, so a
# positional cut at the number of training rows recovers both parts.
n_train = len(train)
X = all_data.iloc[:n_train]
X_test = all_data.iloc[n_train:]

In [12]:
# Preprocessing pipeline: numeric columns go through RobustScaler, object
# columns are one-hot encoded. Columns of any other dtype are selected by
# neither list.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(include='object').columns

num_pipeline = Pipeline(steps=[('scaler', RobustScaler())])

# handle_unknown='ignore': categories not seen during fit do not raise an
# error at transform time.
cat_pipeline = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ]
)



In [13]:
# Fit the preprocessor on the training rows and transform them, then apply the
# already-fitted preprocessor to the test rows (evaluated left to right).
X_trans, X_test_trans = (
    preprocessor.fit_transform(X),
    preprocessor.transform(X_test),
)