# 🤖 Modelo de Machine Learning - Predição de Preços

**Autor:** Marcos Paulo Roriz Lima Reis  
**RA:** 22007534  
**Email:** marcos.paulor@sempreceub.com  
**Curso:** Engenharia da Computação - UniCEUB  

## 🎯 Objetivos
- Implementar modelos de regressão para predição de preços
- Comparar performance de diferentes algoritmos
- Avaliar qualidade das predições
- Fazer predições para novos imóveis

In [None]:
# Importa√ß√£o das bibliotecas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Plot configuration: default matplotlib style, seaborn 'husl' palette and a
# (12, 8) default figure size.
plt.style.use('default')
sns.set_palette('husl')
plt.rcParams.update({'figure.figsize': (12, 8)})

print("‚úÖ Bibliotecas importadas com sucesso!")

In [None]:
# Load the already-treated dataset from a path relative to the working directory
data_path = Path('../data/imoveis_rurais_tratados.csv')
df = pd.read_csv(data_path)

# Report the number of rows and the (rows, columns) shape
print(f"üìä Dataset carregado com {len(df)} registros")
print(f"üìè Dimens√µes: {df.shape}")
# Last expression of the cell: the notebook renders the first rows as a preview
df.head()

In [None]:
# Prepare the data for modelling
print("üîß Preparando dados para modelagem...")

# Candidate features: every numeric column except the target ('preco')
numeric_features = [
    column
    for column in df.select_dtypes(include=[np.number]).columns
    if column != 'preco'
]

print(f"üìã Features num√©ricas dispon√≠veis: {numeric_features}")

# Columns the models work with: all features plus the target
model_columns = numeric_features + ['preco']

# Count missing values per column and show only the columns that have any
missing_data = df[model_columns].isna().sum()
print(f"\nüîç Dados ausentes:")
print(missing_data[missing_data > 0])

# Keep only rows that are complete in every modelling column
df_model = df[model_columns].dropna()
print(f"\nüìä Dataset para modelagem: {len(df_model)} registros")

In [None]:
# Split the modelling frame into the target vector y and the feature matrix X
y = df_model['preco']
X = df_model.loc[:, numeric_features]

print(f"üìã Features (X): {list(X.columns)}")
print(f"üéØ Target (y): preco")
print(f"üìä Shape X: {X.shape}")
print(f"üìä Shape y: {y.shape}")

# Descriptive statistics of the features
print("\nüìà Estat√≠sticas das Features:")
feature_stats = X.describe()
print(feature_stats)

In [None]:
# Train/test split: 20% of the rows are held out, with a fixed seed (42)
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each split and the train:test ratio
print(f"üìä Conjunto de treino: {X_train.shape[0]} registros")
print(f"üìä Conjunto de teste: {X_test.shape[0]} registros")
print(f"üìä Propor√ß√£o treino/teste: {X_train.shape[0]/X_test.shape[0]:.1f}:1")

In [None]:
# Feature standardisation. The scaler is fitted on the training split only and
# the same fitted transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: per-feature mean and standard deviation of the scaled training data
print("‚öñÔ∏è Dados normalizados com StandardScaler")
print(f"üìä M√©dia das features (treino): {np.mean(X_train_scaled, axis=0)}")
print(f"üìä Desvio padr√£o das features (treino): {np.std(X_train_scaled, axis=0)}")

In [None]:
# Model 1: linear regression, trained on the standardised features
print("ü§ñ Treinando Modelo de Regress√£o Linear...")

lr_model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for both splits
y_pred_lr_test = lr_model.predict(X_test_scaled)
y_pred_lr_train = lr_model.predict(X_train_scaled)

# Metrics: R² and RMSE on both splits, MAE on the test split only
lr_mse_train = mean_squared_error(y_train, y_pred_lr_train)
lr_mse_test = mean_squared_error(y_test, y_pred_lr_test)
lr_rmse_train = np.sqrt(lr_mse_train)
lr_rmse_test = np.sqrt(lr_mse_test)
lr_r2_train = r2_score(y_train, y_pred_lr_train)
lr_r2_test = r2_score(y_test, y_pred_lr_test)
lr_mae_test = mean_absolute_error(y_test, y_pred_lr_test)

print(f"\nüìà Regress√£o Linear - Resultados:")
print(f"   R¬≤ Treino: {lr_r2_train:.4f}")
print(f"   R¬≤ Teste: {lr_r2_test:.4f}")
print(f"   RMSE Treino: R$ {lr_rmse_train:,.2f}")
print(f"   RMSE Teste: R$ {lr_rmse_test:,.2f}")
print(f"   MAE Teste: R$ {lr_mae_test:,.2f}")

In [None]:
# Model 2: Random Forest
print("üå≤ Treinando Modelo Random Forest...")

# Trained on the unscaled features (Random Forest does not need normalisation)
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
).fit(X_train, y_train)

# Predictions for both splits
y_pred_rf_test = rf_model.predict(X_test)
y_pred_rf_train = rf_model.predict(X_train)

# Metrics: R² and RMSE on both splits, MAE on the test split only
rf_mse_train = mean_squared_error(y_train, y_pred_rf_train)
rf_mse_test = mean_squared_error(y_test, y_pred_rf_test)
rf_rmse_train = np.sqrt(rf_mse_train)
rf_rmse_test = np.sqrt(rf_mse_test)
rf_r2_train = r2_score(y_train, y_pred_rf_train)
rf_r2_test = r2_score(y_test, y_pred_rf_test)
rf_mae_test = mean_absolute_error(y_test, y_pred_rf_test)

print(f"\nüå≤ Random Forest - Resultados:")
print(f"   R¬≤ Treino: {rf_r2_train:.4f}")
print(f"   R¬≤ Teste: {rf_r2_test:.4f}")
print(f"   RMSE Treino: R$ {rf_rmse_train:,.2f}")
print(f"   RMSE Teste: R$ {rf_rmse_test:,.2f}")
print(f"   MAE Teste: R$ {rf_mae_test:,.2f}")

In [None]:
# Side-by-side comparison of the two models (one list entry per model,
# linear regression first)
comparison_data = {
    'Modelo': ['Regress√£o Linear', 'Random Forest'],
    'R¬≤ Treino': [lr_r2_train, rf_r2_train],
    'R¬≤ Teste': [lr_r2_test, rf_r2_test],
    'RMSE Teste': [lr_rmse_test, rf_rmse_test],
    'MAE Teste': [lr_mae_test, rf_mae_test],
}
comparison_df = pd.DataFrame.from_dict(comparison_data)

print("üèÜ Compara√ß√£o dos Modelos:")
print("=" * 60)
comparison_table = comparison_df.to_string(index=False, float_format='%.4f')
print(comparison_table)

# The best model is the row with the highest R² on the test split
best_model_idx = comparison_df['R¬≤ Teste'].idxmax()
best_model_name = comparison_df.at[best_model_idx, 'Modelo']
print(f"\nü•á Melhor modelo: {best_model_name}")

In [None]:
# Predicted vs. actual prices on the test split, one panel per model
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Endpoints of the dashed y = x reference line (where prediction equals actual)
price_range = [y_test.min(), y_test.max()]

# (axis, test predictions, marker colour, panel title) for each model
panels = [
    (axes[0], y_pred_lr_test, 'blue', f'üìà Regress√£o Linear\nR¬≤ = {lr_r2_test:.3f}'),
    (axes[1], y_pred_rf_test, 'green', f'üå≤ Random Forest\nR¬≤ = {rf_r2_test:.3f}'),
]

for ax, predictions, color, title in panels:
    ax.scatter(y_test, predictions, alpha=0.6, color=color)
    ax.plot(price_range, price_range, 'r--', lw=2)
    ax.set_xlabel('Pre√ßo Real (R$)')
    ax.set_ylabel('Pre√ßo Predito (R$)')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Feature importances reported by the Random Forest.
# With a single feature there is nothing to rank, so only a message is printed.
if len(X.columns) <= 1:
    print("üìä Apenas uma feature dispon√≠vel para modelagem")
else:
    importance_table = pd.DataFrame({
        'Feature': X.columns,
        'Import√¢ncia': rf_model.feature_importances_
    })
    # Most important feature first
    feature_importance = importance_table.sort_values('Import√¢ncia', ascending=False)

    # Horizontal bar chart of the ranking
    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_importance, x='Import√¢ncia', y='Feature', palette='viridis')
    plt.title('üå≤ Import√¢ncia das Features - Random Forest')
    plt.xlabel('Import√¢ncia')
    plt.tight_layout()
    plt.show()

    # Same ranking as text
    print("üìä Ranking de Import√¢ncia das Features:")
    for _, row in feature_importance.iterrows():
        print(f"   {row['Feature']}: {row['Import√¢ncia']:.4f}")

In [None]:
# Residual analysis: residual = actual - predicted, on the test split
residuals_lr = y_test - y_pred_lr_test
residuals_rf = y_test - y_pred_rf_test

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One grid row per model:
# (predictions, residuals, colour, scatter title, histogram title)
residual_rows = [
    (y_pred_lr_test, residuals_lr, 'blue',
     'üìà Res√≠duos - Regress√£o Linear', 'üìä Distribui√ß√£o dos Res√≠duos - Linear'),
    (y_pred_rf_test, residuals_rf, 'green',
     'üå≤ Res√≠duos - Random Forest', 'üìä Distribui√ß√£o dos Res√≠duos - RF'),
]

for (scatter_ax, hist_ax), row_spec in zip(axes, residual_rows):
    predictions, residuals, color, scatter_title, hist_title = row_spec

    # Left column: residuals vs. predictions, with a dashed zero line
    scatter_ax.scatter(predictions, residuals, alpha=0.6, color=color)
    scatter_ax.axhline(y=0, color='r', linestyle='--')
    scatter_ax.set_xlabel('Predi√ß√µes')
    scatter_ax.set_ylabel('Res√≠duos')
    scatter_ax.set_title(scatter_title)
    scatter_ax.grid(True, alpha=0.3)

    # Right column: histogram of the residuals
    hist_ax.hist(residuals, bins=20, alpha=0.7, color=color, edgecolor='black')
    hist_ax.set_xlabel('Res√≠duos')
    hist_ax.set_ylabel('Frequ√™ncia')
    hist_ax.set_title(hist_title)

plt.tight_layout()
plt.show()

In [None]:
# Helper for making price predictions
def predict_price(model, scaler, **kwargs):
    """Predict the price of a single property.

    Relies on the notebook-level names ``X`` (training feature frame) and
    ``rf_model`` (the fitted Random Forest).

    Args:
        model: Fitted estimator exposing ``predict``. When it is the
            ``rf_model`` object itself the raw feature values are used; any
            other model receives the features transformed by ``scaler``.
        scaler: Fitted transformer exposing ``transform``. Only used when
            ``model`` is not ``rf_model``.
        **kwargs: Feature values keyed by column name. Columns of ``X`` that
            are not supplied default to 0; keys that are not columns of ``X``
            are silently dropped.

    Returns:
        The first element of ``model.predict`` for the one-row input.
    """
    # One-row frame holding exactly the training columns, in training order.
    # reindex adds the missing columns (filled with 0) and drops unknown ones.
    input_data = pd.DataFrame([kwargs]).reindex(columns=X.columns, fill_value=0)

    # Identity check: the question is "is this the Random Forest object?",
    # not "does it compare equal to it".
    if model is rf_model:
        return model.predict(input_data)[0]

    # Any other model (the linear regression) was trained on scaled features
    input_scaled = scaler.transform(input_data)
    return model.predict(input_scaled)[0]

# Example prediction
print("üîÆ Exemplo de Predi√ß√£o:")
print("=" * 40)

# Use the mean of every feature as the example input
example_features = {col: X[col].mean() for col in X.columns}

print(f"üìã Features do exemplo:")
for feature, value in example_features.items():
    print(f"   {feature}: {value:.2f}")

# Predict with both models for the same input
pred_rf = predict_price(rf_model, scaler, **example_features)
pred_lr = predict_price(lr_model, scaler, **example_features)

# Show both predictions next to the mean of the actual prices
print(f"\nüí∞ Predi√ß√µes de Pre√ßo:")
print(f"   Regress√£o Linear: R$ {pred_lr:,.2f}")
print(f"   Random Forest: R$ {pred_rf:,.2f}")
print(f"   Pre√ßo m√©dio real: R$ {y.mean():,.2f}")

In [None]:
# Cross-validation on the training split
print("üîÑ Realizando Valida√ß√£o Cruzada...")

# Both models are scored the same way: 5 folds, R² metric
cv_options = {'cv': 5, 'scoring': 'r2'}

# Linear regression (scaled features).
# NOTE(review): X_train_scaled comes from a scaler fitted on the whole training
# split, so each validation fold was seen by the scaler. Presumably harmless for
# plain least squares; confirm before swapping in a regularised model.
cv_scores_lr = cross_val_score(lr_model, X_train_scaled, y_train, **cv_options)
print(f"\nüìà Regress√£o Linear - CV:")
print(f"   R¬≤ m√©dio: {cv_scores_lr.mean():.4f} (¬±{cv_scores_lr.std()*2:.4f})")
print(f"   R¬≤ por fold: {cv_scores_lr}")

# Random Forest (raw features)
cv_scores_rf = cross_val_score(rf_model, X_train, y_train, **cv_options)
print(f"\nüå≤ Random Forest - CV:")
print(f"   R¬≤ m√©dio: {cv_scores_rf.mean():.4f} (¬±{cv_scores_rf.std()*2:.4f})")
print(f"   R¬≤ por fold: {cv_scores_rf}")

In [None]:
# Final summary
print("\n" + "="*60)
print("üéØ RESUMO FINAL DOS MODELOS")
print("="*60)

# Dataset size, number of features and split sizes
print(f"\nüìä Dataset:")
print(f"   Total de registros: {len(df_model)}")
print(f"   Features utilizadas: {len(X.columns)}")
print(f"   Treino/Teste: {len(X_train)}/{len(X_test)}")

# Test metrics of whichever model was selected as best
print(f"\nüèÜ Melhor Modelo: {best_model_name}")
if best_model_name != 'Random Forest':
    print(f"   R¬≤ Teste: {lr_r2_test:.4f}")
    print(f"   RMSE: R$ {lr_rmse_test:,.2f}")
    print(f"   MAE: R$ {lr_mae_test:,.2f}")
else:
    print(f"   R¬≤ Teste: {rf_r2_test:.4f}")
    print(f"   RMSE: R$ {rf_rmse_test:,.2f}")
    print(f"   MAE: R$ {rf_mae_test:,.2f}")

# Qualitative reading of the higher of the two test R² values:
# above 0.7 good, above 0.4 moderate, otherwise low
print(f"\nüí° Interpreta√ß√£o:")
best_r2_test = max(rf_r2_test, lr_r2_test)
if best_r2_test > 0.7:
    print("   ‚úÖ Modelo com boa capacidade preditiva")
elif best_r2_test > 0.4:
    print("   ‚ö†Ô∏è Modelo com capacidade preditiva moderada")
else:
    print("   ‚ùå Modelo com baixa capacidade preditiva")

print("\n‚úÖ Modelagem conclu√≠da com sucesso!")