# House Prices - Advanced Regression Techniques

## Importação de Bibliotecas e Dados

#### Importação de biblioteca para relatório prévio sobre dataset de treino a fim de análise exploratória

In [1]:
from pandas_profiling import ProfileReport

#### Tratando e importação de dataset

In [2]:
import pandas as pd
import numpy as np
from numpy import ravel

#### Importação de modelos para pré-processamento

In [3]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from category_encoders import OneHotEncoder
from sklearn.model_selection import train_test_split

#### Importação de métricas para validação de modelo final

In [4]:
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.dummy import DummyRegressor
from skopt.optimizer import dummy_minimize

#### Importação de biblioteca para Data Visualization

In [5]:
import plotly.express as px
import plotly.graph_objects as go

#### Importação de algoritmo de Machine Learning para Regressão

In [6]:
from lightgbm import LGBMRegressor

#### Criando dataset na memória

In [7]:
TRAIN_PATH = r'..\dataset\train.csv'

In [8]:
TEST_PATH = r'..\dataset\test.csv'

In [9]:
SEED = 4

In [10]:
df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)
df_train.drop(columns='Id',inplace=True)

### Análise Exploratória

In [11]:
#profile = ProfileReport(df_train, minimal=True)
#profile.to_file(output_file='df_train_report.html')

In [12]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

Dados Faltantes de Dataset de Treino

In [13]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df_train.isnull().sum())

MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinSF1          0
BsmtFinType2       38
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
1stFlrSF            0
2ndFlrSF            0
LowQualFinSF        0
GrLivArea 

Dados Faltantes de Dataset de Teste

In [14]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df_test.isnull().sum())

Id                  0
MSSubClass          0
MSZoning            4
LotFrontage       227
LotArea             0
Street              0
Alley            1352
LotShape            0
LandContour         0
Utilities           2
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         1
Exterior2nd         1
MasVnrType         16
MasVnrArea         15
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           44
BsmtCond           45
BsmtExposure       44
BsmtFinType1       42
BsmtFinSF1          1
BsmtFinType2       42
BsmtFinSF2          1
BsmtUnfSF           1
TotalBsmtSF         1
Heating             0
HeatingQC           0
CentralAir          0
Electrical          0
1stFlrSF            0
2ndFlrSF            0
LowQualFin

In [15]:
fig = px.box(df_train['SalePrice'], labels={'value': 'Preço', 'variable': 'Variável'})
fig.update_layout(width=350, title='Distribuição de Preço', hovermode="x")

In [16]:
fig = px.imshow(df_train.corr()[df_train.corr() >= 0.90])
fig.update_layout(title = 'Correlação de Features com correlação maior igual a 0,90')

In [17]:
display(f"Dados de Treino: {len(df_train[['SalePrice']][df_train['SalePrice'] <= 340000])}")

'Dados de Treino: 1399'

In [18]:
display(f"Outliers: {len(df_train[['SalePrice']][df_train['SalePrice'] > 340000])}")

'Outliers: 61'

### Pré-processamento de Dados

#### Remoção de Outliers

In [19]:
df_train = df_train[:][df_train['SalePrice'] <= 340000]

In [20]:
df_train[:][df_train['SalePrice'] > 340000]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice


In [21]:
df_test.CentralAir = df_test.CentralAir.map({'N': 0, 'Y': 1})
df_train.CentralAir = df_train.CentralAir.map({'N': 0, 'Y': 1})

#### Tratamento de Dados, Imputação e Remoção de Linhas com Dados Faltantes nos Dados de Treino

In [22]:
imputer = KNNImputer(missing_values=np.nan)
df_train[['LotFrontage']] = imputer.fit_transform(df_train[['LotFrontage']]).ravel()

In [23]:
df_train.Alley.fillna('NA', inplace=True)
df_train['MasVnrArea'] = df_train['MasVnrArea'].fillna(0)
df_train['MasVnrType'] = df_train['MasVnrType'].fillna('None')
df_train[['BsmtQual','BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1','BsmtFinType2']] = df_train[['BsmtQual','BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1','BsmtFinType2']].fillna('NA')
df_train.FireplaceQu = df_train.FireplaceQu.fillna('NA')
df_train[['GarageType',
       'GarageYrBlt', 
       'GarageFinish', 
       'GarageCars', 
       'GarageArea', 
       'GarageQual',
       'GarageCond']] = df_train[['GarageType',
       'GarageYrBlt', 
       'GarageFinish', 
       'GarageCars', 
       'GarageArea', 
       'GarageQual',
       'GarageCond']].fillna('NA')
df_train[['PoolQC','Fence', 'MiscFeature']] = df_train[['PoolQC','Fence', 'MiscFeature']].fillna('NA')

In [24]:
df_train.dropna(inplace=True)

#### Tratamento de Dados, Imputação e Remoção de Linhas com Dados Faltantes nos Dados de Teste

In [25]:
imputer = KNNImputer(missing_values=np.nan)
df_test[['LotFrontage']] = imputer.fit_transform(df_test[['LotFrontage']]).ravel()

In [26]:
df_test.Alley.fillna('NA', inplace=True)
df_test['MasVnrArea'] = df_train['MasVnrArea'].fillna(0)
df_test['MasVnrType'] = df_train['MasVnrType'].fillna('None')
df_test['BsmtFinSF1'].fillna(0, inplace=True)
df_test[['BsmtQual','BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2']] = df_test[['BsmtQual','BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']].fillna('NA')
df_test.FireplaceQu = df_test.FireplaceQu.fillna('NA')
df_test[['GarageType',
       'GarageYrBlt', 
       'GarageFinish', 
       'GarageCars', 
       'GarageArea', 
       'GarageQual',
       'GarageCond']] = df_train[['GarageType',
       'GarageYrBlt', 
       'GarageFinish', 
       'GarageCars', 
       'GarageArea', 
       'GarageQual',
       'GarageCond']].fillna('NA')
df_test[['PoolQC','Fence', 'MiscFeature']] = df_test[['PoolQC','Fence', 'MiscFeature']].fillna('NA')
df_test[['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2']] = df_test[['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2']].fillna(0)

In [27]:
df_test.dropna(inplace=True)

## Validação de Modelo

In [28]:
features = df_train.drop(columns=['SalePrice'], axis=1)
labels = df_train['SalePrice']

##### Adquirir Dummies de Colunas Categóricas

In [29]:
ohe = OneHotEncoder(use_cat_names=True)
features_ohe = ohe.fit_transform(features)


is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead



##### Escalando Dados

In [30]:
from sklearn.preprocessing import MinMaxScaler

In [31]:
scaler = MinMaxScaler()

In [32]:
features_scaled = scaler.fit_transform(features_ohe)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(features_scaled, labels, random_state=SEED, test_size=0.25)

In [34]:
dummy = DummyRegressor()
dummy.fit(X_train, y_train)
y_pred = dummy.predict(X_test)

In [35]:
display(f'R² de Modelo Dummy: {round(r2_score(y_test, y_pred)*100,2)}%')

'R² de Modelo Dummy: -0.13%'

##### Tuning Modelo de Light GBM Regressor

In [36]:
def treinar_modelo(params):
    SEED = 4

    learning_rate = params[0]
    num_leaves = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    n_estimators = params[5]

    print(params, '\n')
    
    mdl = LGBMRegressor(random_state=SEED,
    learning_rate = learning_rate, 
    num_leaves = num_leaves, 
    min_child_samples = min_child_samples, 
    subsample = subsample,
    colsample_bytree = colsample_bytree,
    subsample_freq=1,
    n_estimators=n_estimators)
    mdl.fit(X_train, y_train)
    y_pred = mdl.predict(X_test)

    return -r2_score(y_test, y_pred)

space = [(1e-3, 1e-1, 'log-uniform'), #learning rate
(2, 128), #num_leaves
(1, 100), #min_child_samples
(0.05, 1.0), #subsample
(0.1, 1.0), #colsample_bytree
(100, 1000)] #n_estimators

result = dummy_minimize(treinar_modelo, 
space,
random_state=SEED, 
verbose=1,
n_calls = 30)

Iteration No: 1 started. Evaluating function at random point.
[0.06327656730105531, 71, 2, 0.8628399009188652, 0.6481320413049753, 493] 

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 1.1990
Function value obtained: -0.9084
Current minimum: -0.9084
Iteration No: 2 started. Evaluating function at random point.
[0.0027050730096210155, 105, 95, 0.7134992241139206, 0.9128606713660984, 776] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.1621
Function value obtained: -0.7942
Current minimum: -0.9084
Iteration No: 3 started. Evaluating function at random point.
[0.03620477422893411, 40, 53, 0.9842306433165472, 0.24745801726422886, 249] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.1272
Function value obtained: -0.9076
Current minimum: -0.9084
Iteration No: 4 started. Evaluating function at random point.
[0.042062540164342105, 127, 57, 0.0919520550349246, 0.9609876709428125, 748] 

Iteration No: 4 ended. Evaluation done a

In [37]:
learning_rate,num_leaves, min_child_samples, subsample, colsample_bytree, n_estimators  = result.x

In [38]:
model = LGBMRegressor(random_state=SEED,
    learning_rate = learning_rate, 
    num_leaves = num_leaves, 
    min_child_samples = min_child_samples, 
    subsample = subsample,
    colsample_bytree = colsample_bytree,
    subsample_freq=1,
    n_estimators=n_estimators)

In [39]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [40]:
kfold = KFold(n_splits = 10, random_state = SEED, shuffle=True)
results = cross_validate(model, 
                         X_train, 
                         y_train, 
                         cv = kfold, 
                         return_train_score=False,
                        scoring='r2')
cv = round(results['test_score'].mean(), 2)

In [41]:
display(f'Taxa Média de Acerto de Modelo Light GBM: {cv * 100:.2f} %')

'Taxa Média de Acerto de Modelo Light GBM: 89.00 %'

In [42]:
fig = go.Figure()

fig.add_traces(go.Scatter(y=y_test, name='Valor Atual'))

fig.add_traces(go.Scatter(y=y_pred, name='Predição', mode='markers'))

fig.update_layout(hovermode='x', title = 'Relação Predição e Valores Atuais', width = 1600, height= 800)


###### Modelo Final

In [43]:
X_train = df_train.drop('SalePrice', axis=1)
y_train = df_train['SalePrice']

In [44]:
ohe = OneHotEncoder(use_cat_names=True)

In [45]:
ohe_train = ohe.fit_transform(X_train)
ohe_test = ohe.fit_transform(df_test.drop(columns='Id'))


is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead


is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead



In [46]:
colunas_faltantes_test = set(ohe_train.columns) - set(ohe_test.columns)
ohe_train = ohe_train.drop(columns=list(colunas_faltantes_test))

In [47]:
train_scaled = scaler.fit_transform(ohe_train)
test_scaled = scaler.fit_transform(ohe_test)

In [48]:
model = LGBMRegressor(random_state=SEED,
    learning_rate = learning_rate, 
    num_leaves = num_leaves, 
    min_child_samples = min_child_samples, 
    subsample = subsample,
    colsample_bytree = colsample_bytree,
    subsample_freq=1,
    n_estimators=n_estimators)

In [49]:
model.fit(train_scaled, y_train)
y_pred = model.predict(test_scaled)

In [50]:
df_test['PredictedPrices'] = y_pred.round(2)

In [65]:
df_test.to_csv(path_or_buf=r'..\csv_results\test_com_previsões.csv', index=False)