In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# import catboost as cat
from sklearn import linear_model as lm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import SplineTransformer, OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error

In [55]:
houses = pd.read_csv("data\\real_estate_train.csv")

In [56]:
print(houses.info())
houses.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [57]:
houses.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

---
# EDA

---
## Target vector

### Treating SalePrice

In [58]:
houses["log_SalePrice"] = np.log(houses["SalePrice"])
houses = houses.drop(columns=["SalePrice"])

---
## Categorical

### Treating MSSubClass

##### As per the documentation, MSSubClass identifies the type of dwelling involved in the sale.

In [59]:
houses["MSSubClass"] = houses["MSSubClass"].astype(str)
houses["MSSubClass"].head()

0    60
1    20
2    60
3    70
4    60
Name: MSSubClass, dtype: object

---
## Treating NAN

In [60]:
houses.select_dtypes("object").isna().sum().sort_values(ascending=False)

PoolQC           1453
MiscFeature      1406
Alley            1369
Fence            1179
FireplaceQu       690
GarageType         81
GarageFinish       81
GarageCond         81
GarageQual         81
BsmtFinType2       38
BsmtExposure       38
BsmtCond           37
BsmtFinType1       37
BsmtQual           37
MasVnrType          8
Electrical          1
PavedDrive          0
Functional          0
KitchenQual         0
CentralAir          0
HeatingQC           0
Heating             0
SaleType            0
MSSubClass          0
MSZoning            0
Foundation          0
Street              0
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
ExterQual           0
ExterCond           0
SaleCondition       0
dtype: int64

### Treating Pool

##### NAN values in the "PoolQC" column are due to lack of pool in certain houses.

In [61]:
houses[houses["PoolQC"].isna()][["PoolQC", "PoolArea"]].head(5)

Unnamed: 0,PoolQC,PoolArea
0,,0
1,,0
2,,0
3,,0
4,,0


In [82]:
houses["HasPool"] = houses["PoolArea"].apply(lambda area: "yes" if area == 0 else "no")
houses[houses["PoolQC"].isna()][["PoolQC", "PoolArea", "HasPool"]].nunique()

PoolQC      0
PoolArea    1
HasPool     1
dtype: int64

### Treating MiscFeature

##### NAN values in the "MiscFeature" column are due to lack of additional features in certain houses.

In [63]:
houses["MiscFeature"].value_counts(normalize=False)

Shed    49
Gar2     2
Othr     2
TenC     1
Name: MiscFeature, dtype: int64

In [64]:
houses["MinscFeature"] = houses["MiscFeature"].fillna("None")
houses["MinscFeature"].value_counts()

None    1406
Shed      49
Gar2       2
Othr       2
TenC       1
Name: MinscFeature, dtype: int64

### Treating Alley

##### NAN values in the "Alley" column are due to lack of alleys in certain houses.

In [65]:
houses["Alley"].value_counts()

Grvl    50
Pave    41
Name: Alley, dtype: int64

In [66]:
houses["Alley"] = houses["Alley"].fillna("None")
houses["Alley"].value_counts()

None    1369
Grvl      50
Pave      41
Name: Alley, dtype: int64

### Treating Fence

##### NAN values in the "Fence" column are due to lack of fences in certain houses.

In [67]:
houses["Fence"].value_counts()

MnPrv    157
GdPrv     59
GdWo      54
MnWw      11
Name: Fence, dtype: int64

In [68]:
houses["Fence"] = houses["Fence"].fillna("None")
houses["Fence"].value_counts()

None     1179
MnPrv     157
GdPrv      59
GdWo       54
MnWw       11
Name: Fence, dtype: int64

### Treating FireplaceQu

In [69]:
houses["FireplaceQu"].value_counts()

Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: FireplaceQu, dtype: int64

In [70]:
houses["FireplaceQu"] = houses["FireplaceQu"].fillna("None")

In [71]:
houses["FireplaceQu"].value_counts()

None    690
Gd      380
TA      313
Fa       33
Ex       24
Po       20
Name: FireplaceQu, dtype: int64

--------
## Pipeline Building

In [91]:
# Make Pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
from catboost import CatBoostRegressor
# Categorical variables Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# numerical variables Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [83]:
X = houses.drop(columns=["log_SalePrice"])
y = houses["log_SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

y = houses["log_SalePrice"]

In [84]:
# Cat and num variables
cat_vars = list(X.select_dtypes("object"))
num_vars = list(X.select_dtypes("number"))

In [88]:
# Categorical Pipeline
# Numerical Pipeline
num_imp = SimpleImputer(strategy="median")
sca = StandardScaler()
pca = PCA()
lista_etapas = [("NUM_IMPUTER", num_imp), ("STD_SCALER", sca), ("PCA", pca)]
num_pipeline = Pipeline(lista_etapas)

# Categorical Pipeline
cat_imp = SimpleImputer(strategy="most_frequent")
ohe = OneHotEncoder(drop='first', handle_unknown='ignore', min_frequency=0.1)
cat_pipeline = Pipeline([("CAT_IMPUTER", cat_imp), ('ONE_HOT', ohe)])

# Pipeline de Preprocessamento
pipeline_preprocessamento = [
    ("NUMERICOS", num_pipeline, num_vars),
    ("CATEGORICOS", cat_pipeline, cat_vars),
]
data_prep_pipeline = ColumnTransformer(pipeline_preprocessamento)

catboost = CatBoostRegressor(iterations = 5000, depth = 8, verbose = False)
cb_pipeline = Pipeline([("PREP", data_prep_pipeline), ("CATBOOST_TESTE", catboost)])

cb_pipeline.fit(X_train, y_train)

y_pred = cb_pipeline.predict(X_test)
print(np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred))))

In [98]:
param_grid = {
    "CATBOOST_TESTE__iterations": [1000, 2500, 5000]
}

# Categorical Pipeline
# Numerical Pipeline
num_imp = SimpleImputer(strategy="median")
sca = StandardScaler()
pca = PCA(n_components=5)
lista_etapas = [("NUM_IMPUTER", num_imp), ("STD_SCALER", sca), ("PCA", pca)]
num_pipeline = Pipeline(lista_etapas)

# Categorical Pipeline
cat_imp = SimpleImputer(strategy="most_frequent")
ohe = OneHotEncoder(drop='first', handle_unknown='ignore', min_frequency=0.1)
cat_pipeline = Pipeline([("CAT_IMPUTER", cat_imp), ('ONE_HOT', ohe)])

# Pipeline de Preprocessamento
pipeline_preprocessamento = [
    ("NUMERICOS", num_pipeline, num_vars),
    ("CATEGORICOS", cat_pipeline, cat_vars),
]
data_prep_pipeline = ColumnTransformer(pipeline_preprocessamento)

catboost = CatBoostRegressor(iterations = 5000, depth = 8, verbose = False)
cb_pipeline = Pipeline([("PREP", data_prep_pipeline), ("CATBOOST_TESTE", catboost)])

cb_pipeline.fit(X_train, y_train)

y_pred = cb_pipeline.predict(X_test)
print(np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred))))

39224.6856533055




In [99]:
# Categorical Pipeline
# Numerical Pipeline
num_imp = SimpleImputer(strategy="median")
sca = StandardScaler()
pca = PCA(n_components=5)
lista_etapas = [("NUM_IMPUTER", num_imp), ("STD_SCALER", sca)]
num_pipeline = Pipeline(lista_etapas)

# Categorical Pipeline
cat_imp = SimpleImputer(strategy="most_frequent")
ohe = OneHotEncoder(drop='first', handle_unknown='ignore', min_frequency=0.1)
cat_pipeline = Pipeline([("CAT_IMPUTER", cat_imp), ('ONE_HOT', ohe)])

# Pipeline de Preprocessamento
pipeline_preprocessamento = [
    ("NUMERICOS", num_pipeline, num_vars),
    ("CATEGORICOS", cat_pipeline, cat_vars),
]
data_prep_pipeline = ColumnTransformer(pipeline_preprocessamento)

catboost = CatBoostRegressor(iterations = 5000, depth = 8, verbose = False)
cb_pipeline = Pipeline([("PREP", data_prep_pipeline), ("CATBOOST_TESTE", catboost)])

cb_pipeline.fit(X_train, y_train)

y_pred = cb_pipeline.predict(X_test)
print(np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred))))

32612.736854060488


