In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

import plotly.graph_objects as go

# data

In [2]:
random_state = 42

data = pd.read_csv('housing.csv')
data.rename(columns={"Price":"target"}, inplace=True)
data

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,target,Address
0,79545.45857,5.682861,7.009188,4.09,23086.80050,1.059034e+06,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.64245,6.002900,6.730821,3.09,40173.07217,1.505891e+06,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.06718,5.865890,8.512727,5.13,36882.15940,1.058988e+06,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1.260617e+06,USS Barnett\nFPO AP 44820
4,59982.19723,5.040555,7.839388,4.23,26354.10947,6.309435e+05,USNS Raymond\nFPO AE 09386
...,...,...,...,...,...,...,...
4995,60567.94414,7.830362,6.137356,3.46,22837.36103,1.060194e+06,USNS Williams\nFPO AP 30153-7653
4996,78491.27543,6.999135,6.576763,4.02,25616.11549,1.482618e+06,"PSC 9258, Box 8489\nAPO AA 42991-3352"
4997,63390.68689,7.250591,4.805081,2.13,33266.14549,1.030730e+06,"4215 Tracy Garden Suite 076\nJoshualand, VA 01..."
4998,68001.33124,5.534388,7.130144,5.44,42625.62016,1.198657e+06,USS Wallace\nFPO AE 73316


## Address v1

In [3]:
def split_address(address):  # sourcery skip: use-named-expression
    # Padrao de endereco: '208 Michael Ferry Apt. 674\nLaurabury, NE 37010-5101'
    match = re.match(r'(\d+)\s(.*Apt\.\s\d+)\n(.*),\s(.*\s\d{5}-\d{4})', address)
    if match:
        return dict(zip(['house_number', 'street_name', 'city_name', 'state_zip'], match.groups()))
    
    # Padrao de endereco: '188 Johnson Views Suite 079\nLake Kathleen, CA...'
    match = re.match(r'(\d+)\s(.*Suite\s\d+)\n(.*),\s(.*)', address)
    if match:
        return dict(zip(['house_number', 'street_name', 'city_name', 'state_zip'], match.groups()))

    # Padrao de endereco: '9127 Elizabeth Stravenue\nDanieltown, WI 06482...'
    match = re.match(r'(\d+)\s(.*?)\n(.*),\s(.*\s\d{5}-\d{4})', address)
    if match:
        return dict(zip(['house_number', 'street_name', 'city_name', 'state_zip'], match.groups()))

    # Padrao de endereco: 'USS Barnett\nFPO AP 44820'
    match = re.match(r'(.*?)\n(.*?)', address)
    if match:
        return dict(zip(['street_name', 'state_zip'], match.groups()))

    return None

data['Address'].map(split_address).apply(pd.Series)

Unnamed: 0,house_number,street_name,city_name,state_zip
0,208,Michael Ferry Apt. 674,Laurabury,NE 37010-5101
1,188,Johnson Views Suite 079,Lake Kathleen,CA 48958
2,9127,Elizabeth Stravenue,Danieltown,WI 06482-3489
3,,USS Barnett,,
4,,USNS Raymond,,
...,...,...,...,...
4995,,USNS Williams,,
4996,,"PSC 9258, Box 8489",,
4997,4215,Tracy Garden Suite 076,Joshualand,VA 01707-9165
4998,,USS Wallace,,


Acho que vai dar muito trabalho separar direito esses endereços da forma correta, por conterem muitos padrões. Vou tentar outra abordagem.

## Address v2

In [4]:
def classify_address(address):
    # Padrao de endereco: '208 Michael Ferry Apt. 674\nLaurabury, NE 37010-5101'
    match = re.match(r'(\d+)\s(.*Apt\.\s\d+)\n(.*),\s(.*\s\d{5}-\d{4})', address)
    if match:
        return 'Residencial'
    
    # Padrao de endereco: '208 Michael Ferry Apt. 674\nLaurabury, NE 37010-5101'
    match = re.match(r'(\d+)\s(.*Apt\.\s\d+)\n(.*),\s(.*\s\d{5}-\d{4})', address)
    if match:
        return 'Residencial'
    
    # Padrao de endereco: '9127 Elizabeth Stravenue\nDanieltown, WI 06482...'
    match = re.match(r'(\d+)\s(.*?)\n(.*),\s(.*\s\d{5}-\d{4})', address)
    if match:
        return 'Residencial'
    
    # Padrao de endereco: '188 Johnson Views Suite 079\nLake Kathleen, CA...'
    match = re.match(r'(\d+)\s(.*Suite\s\d+)\n(.*),\s(.*)', address)
    if match:
        return 'Comercial'
    
    # Padrao de endereco: 'USS Barnett\nFPO AP 44820'
    match = re.match(r'(.*?)\n(.*?)', address)
    if match:
        return 'Militar'
    
    return 'Outros'

data['Address_Type'] = data['Address'].map(classify_address)
data

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,target,Address,Address_Type
0,79545.45857,5.682861,7.009188,4.09,23086.80050,1.059034e+06,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",Residencial
1,79248.64245,6.002900,6.730821,3.09,40173.07217,1.505891e+06,"188 Johnson Views Suite 079\nLake Kathleen, CA...",Comercial
2,61287.06718,5.865890,8.512727,5.13,36882.15940,1.058988e+06,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",Residencial
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1.260617e+06,USS Barnett\nFPO AP 44820,Militar
4,59982.19723,5.040555,7.839388,4.23,26354.10947,6.309435e+05,USNS Raymond\nFPO AE 09386,Militar
...,...,...,...,...,...,...,...,...
4995,60567.94414,7.830362,6.137356,3.46,22837.36103,1.060194e+06,USNS Williams\nFPO AP 30153-7653,Militar
4996,78491.27543,6.999135,6.576763,4.02,25616.11549,1.482618e+06,"PSC 9258, Box 8489\nAPO AA 42991-3352",Militar
4997,63390.68689,7.250591,4.805081,2.13,33266.14549,1.030730e+06,"4215 Tracy Garden Suite 076\nJoshualand, VA 01...",Residencial
4998,68001.33124,5.534388,7.130144,5.44,42625.62016,1.198657e+06,USS Wallace\nFPO AE 73316,Militar


In [5]:
data.drop(columns=["Address"], inplace=True)
data

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,target,Address_Type
0,79545.45857,5.682861,7.009188,4.09,23086.80050,1.059034e+06,Residencial
1,79248.64245,6.002900,6.730821,3.09,40173.07217,1.505891e+06,Comercial
2,61287.06718,5.865890,8.512727,5.13,36882.15940,1.058988e+06,Residencial
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1.260617e+06,Militar
4,59982.19723,5.040555,7.839388,4.23,26354.10947,6.309435e+05,Militar
...,...,...,...,...,...,...,...
4995,60567.94414,7.830362,6.137356,3.46,22837.36103,1.060194e+06,Militar
4996,78491.27543,6.999135,6.576763,4.02,25616.11549,1.482618e+06,Militar
4997,63390.68689,7.250591,4.805081,2.13,33266.14549,1.030730e+06,Residencial
4998,68001.33124,5.534388,7.130144,5.44,42625.62016,1.198657e+06,Militar


In [6]:
# one hot encoding
data = pd.get_dummies(data, columns=['Address_Type'])
data

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,target,Address_Type_Comercial,Address_Type_Militar,Address_Type_Residencial
0,79545.45857,5.682861,7.009188,4.09,23086.80050,1.059034e+06,0,0,1
1,79248.64245,6.002900,6.730821,3.09,40173.07217,1.505891e+06,1,0,0
2,61287.06718,5.865890,8.512727,5.13,36882.15940,1.058988e+06,0,0,1
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1.260617e+06,0,1,0
4,59982.19723,5.040555,7.839388,4.23,26354.10947,6.309435e+05,0,1,0
...,...,...,...,...,...,...,...,...,...
4995,60567.94414,7.830362,6.137356,3.46,22837.36103,1.060194e+06,0,1,0
4996,78491.27543,6.999135,6.576763,4.02,25616.11549,1.482618e+06,0,1,0
4997,63390.68689,7.250591,4.805081,2.13,33266.14549,1.030730e+06,0,0,1
4998,68001.33124,5.534388,7.130144,5.44,42625.62016,1.198657e+06,0,1,0


# previsão

In [7]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

X = data.drop('target', axis=1)
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Random Forest
rf_params = {'n_estimators': [100, 200, 500], 'max_depth': [5, 10,15]}

# XGBoost
xgb_params = {'n_estimators': [50, 100, 150], 'max_depth': [3, 4, 5]}

# LightGBM
lgb_params = {'n_estimators': [50, 100, 150], 'max_depth': [3, 4, 5]}

# # Random Forest
# rf_params = {'n_estimators': [100, 200, 500], 'max_depth': [10, 20, 30]}

# # XGBoost
# xgb_params = {'n_estimators': [100, 200, 500], 'max_depth': [5, 10, 15]}

# # LightGBM
# lgb_params = {'n_estimators': [100, 200, 500], 'max_depth': [5, 10, 15]}

rf = RandomForestRegressor(random_state=random_state)
xgb_model = xgb.XGBRegressor(random_state=random_state)
lgb_model = lgb.LGBMRegressor(random_state=random_state)

models = [
    {'estimator': rf, 'params': rf_params, 'name': 'Random Forest'},
    {'estimator': xgb_model, 'params': xgb_params, 'name': 'XGBoost'},
    {'estimator': lgb_model, 'params': lgb_params, 'name': 'LightGBM'}
]

# Definindo os scalers
scalers = [StandardScaler(), MinMaxScaler(), RobustScaler()]

best_model = None
best_score = float('inf')

for model in models:
    for scaler in scalers:
        pipeline = Pipeline([
            ('scaler', scaler),
            ('estimator', model['estimator'])
        ])

        param_grid = {f'estimator__{k}': v for k, v in model['params'].items()}

        grid_search = GridSearchCV(pipeline, param_grid, scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False), cv=5)
        grid_search.fit(X_train, y_train)

        if -grid_search.best_score_ < best_score:
            best_score = -grid_search.best_score_
            best_model_grid = grid_search
            
        print(50*'-')
        print(model['name'])
        print('Scaler:', scaler)
        print('RMSE:', -grid_search.best_score_)
        print(50*'-')

print(best_model_grid)
print(best_model_grid.best_params_)

--------------------------------------------------
Random Forest
Scaler: StandardScaler()
RMSE: 121196.61662369556
--------------------------------------------------
--------------------------------------------------
Random Forest
Scaler: MinMaxScaler()
RMSE: 121197.78161553736
--------------------------------------------------
--------------------------------------------------
Random Forest
Scaler: RobustScaler()
RMSE: 121197.24501149773
--------------------------------------------------
--------------------------------------------------
XGBoost
Scaler: StandardScaler()
RMSE: 115761.49653216828
--------------------------------------------------
--------------------------------------------------
XGBoost
Scaler: MinMaxScaler()
RMSE: 115761.49653216828
--------------------------------------------------
--------------------------------------------------
XGBoost
Scaler: RobustScaler()
RMSE: 115761.49653216828
--------------------------------------------------
------------------------------

# plot

In [8]:
# Fazendo previsões
y_train_pred = best_model_grid.predict(X_train)
y_test_pred = best_model_grid.predict(X_test)
y_all_pred = np.concatenate([y_train_pred, y_test_pred])

# Criando o gráfico para os dados de treino e teste
fig1 = go.Figure()

fig1.add_trace(go.Scatter(y=y_test, mode='lines', name='Test Data'))
fig1.add_trace(go.Scatter(y=best_model_grid.predict(X_test), mode='lines', name='Test Prediction'))

fig1.update_layout(title='Train and Test Data vs Predictions', xaxis_title='Index', yaxis_title='Value')

fig1.show()

# Criando o gráfico para os dados originais e previsões
fig2 = go.Figure()

fig2.add_trace(go.Scatter(y=y, mode='lines', name='Original Data'))
fig2.add_trace(go.Scatter(y=best_model_grid.predict(X), mode='lines', name='All Predictions'))

fig2.update_layout(title='Original Data vs All Predictions', xaxis_title='Index', yaxis_title='Value')

fig2.show()
