## Import Dataset

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load the training data; the 'Id' column becomes the row index
df = pd.read_csv('train.csv', index_col='Id')

# A row without a SalePrice has no target to learn from, so discard it
df = df.dropna(subset=['SalePrice'])

# Predictors (everything except the target) and the target itself,
# both independent of `df`
X_full = df.drop(columns=['SalePrice'])
y = df['SalePrice'].copy()

## Data Preprocessing

In [8]:
def clean_data(df):
    """Return a cleaned copy of the raw predictor frame.

    Three steps are applied, in order:

    1. Missing values in 'PoolQC' and 'FireplaceQu' are replaced by the
       string "Nan".
    2. The columns 'MiscVal', 'MiscFeature', 'Alley' and 'Fence' are dropped.
    3. Eight count/year columns are cast to ``object`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    missing_labels = {'PoolQC': "Nan", 'FireplaceQu': "Nan"}
    unused_columns = ['MiscVal', 'MiscFeature', 'Alley', 'Fence']
    object_columns = [
        'YrSold', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
        'HalfBath', 'KitchenAbvGr', 'GarageCars', 'Fireplaces',
    ]

    return (
        df.fillna(missing_labels)
          .drop(columns=unused_columns)
          .astype({name: 'object' for name in object_columns})
    )

# Apply the cleaning step to the raw predictors and preview the result
X_preprocessed = X_full.pipe(clean_data)
X_preprocessed.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,Nan,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,Nan,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,Nan,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,Nan,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,Nan,12,2008,WD,Normal


## Feature Engineering

In [9]:
from sklearn.preprocessing import FunctionTransformer

def feature_engineering(df):
    """Return a copy of ``df`` with three derived features appended.

    Added columns (appended in this order):

    * ``HouseAge`` - ``YrSold`` minus ``YearBuilt``
    * ``RemodAge`` - ``YrSold`` minus ``YearRemodAdd``
    * ``TotalSF``  - ``TotalBsmtSF`` + ``1stFlrSF`` + ``2ndFlrSF``

    The input frame is never modified, so calling this function (directly or
    through a ``FunctionTransformer`` inside a pipeline) does not leak new
    columns into the caller's data, and re-running it gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'YrSold', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF',
        '1stFlrSF' and '2ndFlrSF'. 'YrSold' may be of ``object`` dtype.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns unchanged plus the three
        derived columns.

    Raises
    ------
    KeyError
        If any required column is missing.
    ValueError
        If 'YrSold' holds values that cannot be parsed as numbers.
    """
    # clean_data() casts 'YrSold' to object dtype. Subtracting from an object
    # column yields object-dtype results, which downstream code would treat
    # as categorical. Use a numeric view for the arithmetic only; the
    # 'YrSold' column itself is left as it was.
    year_sold = pd.to_numeric(df['YrSold'])

    # assign() builds a new DataFrame instead of writing into the argument.
    return df.assign(
        HouseAge=year_sold - df['YearBuilt'],
        RemodAge=year_sold - df['YearRemodAdd'],
        TotalSF=df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF'],
    )

# Wrap the function in a transformer object so it can be used as a Pipeline step
feature_eng = FunctionTransformer(func=feature_engineering)

## Model

## Define Pipelines

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

# Base Model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Base model: a 100-tree random forest with a fixed seed for repeatable runs.
model = RandomForestRegressor(n_estimators=100, random_state=0)

# The column lists must describe the frame as the ColumnTransformer will see
# it inside the pipeline, i.e. AFTER the feature-engineering step has run.
# ColumnTransformer drops every column it is not given (remainder='drop' is
# the default), so lists taken from X_preprocessed alone would silently
# discard 'HouseAge', 'RemodAge' and 'TotalSF'.
# The .copy() guarantees X_preprocessed itself is not altered by this call.
X_engineered = feature_engineering(X_preprocessed.copy())

num_cols = X_engineered.select_dtypes(exclude='object').columns
cat_cols = X_engineered.select_dtypes(include='object').columns

# Per-dtype transformers
# Numeric columns: fill missing values with 0, then standardise each column.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the literal 'NA', then one-hot
# encode. handle_unknown='ignore' lets transform() accept categories that were
# not present during fit instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer. sparse_threshold=0 makes the
# combined output a dense array, which the PCA step below consumes.
transform = ColumnTransformer(
    [
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ],
    sparse_threshold=0,
)

# Preprocessor: feature engineering -> column transforms -> PCA.
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share (here 95%) of the variance.
preprosessor = Pipeline([
    ('feature_eng', feature_eng),
    ('transform', transform),
    ('PCA', PCA(n_components=0.95)),
])

# Full pipeline: preprocessing followed by the regressor.
ModelPipeline = Pipeline([
    ('preprosessor', preprosessor),
    ('model', model),
])
