# House Prices - Advanced Regression Techniques

## Importação de Bibliotecas e Dados

#### Importação de métricas para validação de modelo final

In [1]:
import pandas as pd
import numpy as np
from numpy import ravel
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from category_encoders import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyRegressor
from skopt.optimizer import gp_minimize
import plotly.express as px
import plotly.graph_objects as go
from lightgbm import LGBMRegressor

#### Criando dataset na memória

In [2]:
train = r'data/train.csv'
test = r'data/test.csv'
SEED = 42
train = pd.read_csv(train).drop(columns=['Id'])
test = pd.read_csv(test)

### Análise Exploratória

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

Dados Faltantes de Dataset de Treino

In [4]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(train.isnull().sum())

MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinSF1          0
BsmtFinType2       38
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
1stFlrSF            0
2ndFlrSF            0
LowQualFinSF        0
GrLivArea 

Dados Faltantes de Dataset de Teste

In [5]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(test.isnull().sum())

Id                  0
MSSubClass          0
MSZoning            4
LotFrontage       227
LotArea             0
Street              0
Alley            1352
LotShape            0
LandContour         0
Utilities           2
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         1
Exterior2nd         1
MasVnrType         16
MasVnrArea         15
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           44
BsmtCond           45
BsmtExposure       44
BsmtFinType1       42
BsmtFinSF1          1
BsmtFinType2       42
BsmtFinSF2          1
BsmtUnfSF           1
TotalBsmtSF         1
Heating             0
HeatingQC           0
CentralAir          0
Electrical          0
1stFlrSF            0
2ndFlrSF            0
LowQualFin

In [6]:
fig = px.box(train['SalePrice'], labels={'value': 'Preço', 'variable': 'Variável'})
fig.update_layout(width=350, title='Distribuição de Preço', hovermode="x")

In [7]:
fig = px.imshow(train.corr()[train.corr() > 0.50])
fig.update_layout(title = 'Correlação de Features maior de 0.50')

In [8]:
corr = train.corr()['SalePrice'].sort_values(ascending=False)
corr = corr[corr > 0.50].drop('SalePrice')

In [9]:
fig = px.bar(corr)
fig.update_layout(title = 'Correlação de Features ao SalePrice maior a 0,50')

In [10]:
fig = px.box(train[['OverallQual', 'SalePrice']], x='OverallQual', y='SalePrice')
fig.update_layout(title = 'Correlação de OverallQual ao SalePrice')

In [11]:
fig = px.scatter(train, trendline='ols', x='GrLivArea', y='SalePrice', trendline_color_override='red', hover_data=['OverallQual', 'SaleCondition'])
fig.update_layout(title = 'Correlação de GrLivArea ao SalePrice')

In [12]:
display(f"Dados de Treino: {len(train)}")
display(f"Dados de Teste: {len(test)}")

'Dados de Treino: 1460'

'Dados de Teste: 1459'

### Pré-processamento de Dados

#### Tratamento de Dados, Imputação e Remoção de Linhas com Dados Faltantes nos Dados de Treino

In [13]:
imputer = KNNImputer(missing_values=np.nan)

### PoolQC

In [14]:
#---------------------------------------------------------
train['PoolQC'] = train['PoolQC'].fillna('NA')
test['PoolQC'] = test['PoolQC'].fillna('NA')
#---------------------------------------------------------
train['MiscFeature'] = train['MiscFeature'].fillna('NA')
test['MiscFeature'] = test['MiscFeature'].fillna('NA')
#---------------------------------------------------------
train['Fence'] = train['Fence'].fillna('NA')
test['Fence'] = test['Fence'].fillna('NA')
#---------------------------------------------------------
train['Alley'] = train['Alley'].fillna('NA')
test['Alley'] = test['Alley'].fillna('NA')
#---------------------------------------------------------
train['FireplaceQu'] = train['FireplaceQu'].fillna('NA')
test['FireplaceQu'] = test['FireplaceQu'].fillna('NA')
#---------------------------------------------------------
train['MasVnrArea'] = train['MasVnrArea'].fillna(0)
test['MasVnrArea'] = test['MasVnrArea'].fillna(0)
#---------------------------------------------------------
train['MasVnrType'] = train['MasVnrType'].fillna('None')
test['MasVnrType'] = test['MasVnrType'].fillna('None')
#---------------------------------------------------------
train[['GarageType', 'GarageFinish', 'GarageQual','GarageCond']] = train[['GarageType', 'GarageFinish', 'GarageQual','GarageCond']].fillna('NA')
test[['GarageType', 'GarageFinish', 'GarageQual','GarageCond']] = test[['GarageType', 'GarageFinish', 'GarageQual','GarageCond']].fillna('NA')
#---------------------------------------------------------
test[['GarageCars', 'GarageArea']] = test[['GarageCars', 'GarageArea']].fillna(0)
#---------------------------------------------------------
train[['BsmtQual','BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2']] = train[['BsmtQual','BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2']].fillna('NA')
test[['BsmtQual','BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2']] = test[['BsmtQual','BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2']].fillna('NA')
#---------------------------------------------------------
train['Electrical'] = train['Electrical'].fillna(train['Electrical'].mode()[0])
#---------------------------------------------------------
test['MSZoning'] = test['MSZoning'].fillna(test['MSZoning'].mode()[0])
#---------------------------------------------------------
test['SaleType'] = test['SaleType'].fillna(test['SaleType'].mode()[0])
#---------------------------------------------------------
train['BsmtFinSF1'] = train['BsmtFinSF1'].fillna(0)
test['BsmtFinSF1'] = test['BsmtFinSF1'].fillna(0)
#---------------------------------------------------------
test['Exterior1st'] = test['Exterior1st'].fillna(test['Exterior1st'].mode()[0])
test['Exterior2nd'] = test['Exterior2nd'].fillna(test['Exterior2nd'].mode()[0])
#---------------------------------------------------------
test['BsmtFullBath'] = test['BsmtFullBath'].fillna(test['BsmtFullBath'].mode()[0])
test['BsmtHalfBath'] = test['BsmtHalfBath'].fillna(test['BsmtHalfBath'].mode()[0])
test['HalfBath'] = test['HalfBath'].fillna(test['HalfBath'].mode()[0])
#---------------------------------------------------------
test['BsmtFinSF2'] = test['BsmtFinSF2'].fillna(test['BsmtFinSF2'].mode()[0])
test['BsmtUnfSF'] = test['BsmtUnfSF'].fillna(test['BsmtUnfSF'].mode()[0]) 
test['TotalBsmtSF'] = test['TotalBsmtSF'].fillna(test['TotalBsmtSF'].mode()[0])
#---------------------------------------------------------
test['Functional'] = test['Functional'].fillna(test['Functional'].mode()[0])
test['KitchenQual'] = test['KitchenQual'].fillna(test['KitchenQual'].mode()[0])
#---------------------------------------------------------
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(train['YearBuilt'])
test['GarageYrBlt'] = test['GarageYrBlt'].fillna(test['YearBuilt'])


In [15]:
train = train.drop(columns=['Utilities'])
test = test.drop(columns=['Utilities'])

In [16]:
train[['LotFrontage']] = imputer.fit_transform(train[['LotFrontage']]).ravel()
test[['LotFrontage']] = imputer.fit_transform(test[['LotFrontage']]).ravel()

In [17]:
test.CentralAir = test.CentralAir.map({'N': 0, 'Y': 1})
train.CentralAir = train.CentralAir.map({'N': 0, 'Y': 1})

## Validação de Modelo

In [18]:
features = train.drop(columns=['SalePrice'], axis=1)
labels = train['SalePrice']

##### Adquirir Dummies de Colunas Categóricas

In [19]:
ohe = OneHotEncoder(use_cat_names=True)
ohe_features = ohe.fit_transform(features)


is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead



##### Escalando Dados

In [20]:
scaler = MinMaxScaler()

In [21]:
scaled_features = scaler.fit_transform(ohe_features)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features, labels, random_state=SEED, test_size=0.25)

In [23]:
dummy = DummyRegressor()
dummy.fit(X_train, y_train)
y_pred = dummy.predict(X_test)

In [24]:
display(f'RMSE: {mean_squared_error(y_test, y_pred, squared=False)}')

'RMSE: 83757.52054897288'

##### Tuning Modelo de Light GBM Regressor

In [25]:
reg = LGBMRegressor()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [26]:
display(f'RMSE: {mean_squared_error(y_test, y_pred, squared=False)}')

'RMSE: 27419.91932232619'

In [27]:
def treinar_modelo(params):
    SEED = 4

    learning_rate = params[0]
    num_leaves = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    n_estimators = params[5]

    print(params, '\n')
    
    mdl = LGBMRegressor(random_state=SEED,
    learning_rate = learning_rate, 
    num_leaves = num_leaves, 
    min_child_samples = min_child_samples, 
    subsample = subsample,
    colsample_bytree = colsample_bytree,
    subsample_freq=1,
    n_estimators=n_estimators)
    mdl.fit(X_train, y_train)
    y_pred = mdl.predict(X_test)

    return mean_squared_error(y_test, y_pred, squared=False)

space = [(1e-3, 1e-1, 'log-uniform'), #learning rate
(2, 128), #num_leaves
(1, 100), #min_child_samples
(0.05, 1.0), #subsample
(0.1, 1.0), #colsample_bytree
(100, 1000)] #n_estimators

result = gp_minimize(treinar_modelo, 
space,
random_state=SEED, 
verbose=1,
n_calls = 30)

Iteration No: 1 started. Evaluating function at random point.
[0.03918194347141743, 25, 78, 0.6170076500491628, 0.5012494775682321, 190] 

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.0568
Function value obtained: 34845.3529
Current minimum: 34845.3529
Iteration No: 2 started. Evaluating function at random point.
[0.008288916866885144, 44, 15, 0.6683440493014103, 0.15077042112439026, 750] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.5573
Function value obtained: 26735.6124
Current minimum: 26735.6124
Iteration No: 3 started. Evaluating function at random point.
[0.07535384509295551, 2, 99, 0.6366074341463308, 0.6504878444394528, 106] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.0220
Function value obtained: 38222.6698
Current minimum: 26735.6124
Iteration No: 4 started. Evaluating function at random point.
[0.0011120513715710632, 68, 41, 0.09433238005293466, 0.9763799669573134, 309] 

Iteration No: 4 ended. 

In [28]:
learning_rate,num_leaves, min_child_samples, subsample, colsample_bytree, n_estimators  = result.x

In [29]:
model = LGBMRegressor(random_state=SEED,
    learning_rate = learning_rate, 
    num_leaves = num_leaves, 
    min_child_samples = min_child_samples, 
    subsample = subsample,
    colsample_bytree = colsample_bytree,
    subsample_freq=1,
    n_estimators=n_estimators)

In [30]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [31]:
results = cross_val_score(model, X_train, y_train, cv = 10, scoring="neg_root_mean_squared_error")

In [32]:
display(f'RMSE: {results.mean():.2f}')

'RMSE: -27994.83'

In [33]:
fig = go.Figure()

fig.add_traces(go.Scatter(y=y_test, name='Valor Atual'))

fig.add_traces(go.Scatter(y=y_pred, name='Predição', mode='markers'))

fig.update_layout(hovermode='x', title = 'Relação Predição e Valores Atuais', width = 1600, height= 800)


#### Modelo Final

In [52]:
ohe = OneHotEncoder(use_cat_names=True)
ohe_test = ohe.fit_transform(test.drop(columns=['Id']))
diff = set(ohe_features.columns) - set(ohe_test.columns)
for col in diff:
    ohe_test[col] = 0 #Colocando categoria "nula" de data que não há no teste
ohe_test = ohe_test[ohe_features.columns]


is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead



In [55]:
test_scaled = scaler.fit_transform(ohe_test)
y_pred = model.predict(test_scaled)


In [57]:
len(y_pred)

1459

In [58]:
test['SalePrice'] = y_pred

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,137828.840588
1462,157246.918090
1463,196316.311648
1464,206128.255958
1465,218679.000396
...,...
2915,90171.071974
2916,86584.236248
2917,182968.667242
2918,127558.675842


In [61]:
test[['Id', 'SalePrice']].to_csv(path_or_buf=r'data/results.csv', index=False)