# 📊 Análise Exploratória de Dados - Imóveis Rurais

**Autor:** Marcos Paulo Roriz Lima Reis  
**RA:** 22007534  
**Email:** marcos.paulor@sempreceub.com  
**Curso:** Engenharia da Computação - UniCEUB  

## 🎯 Objetivos
- Analisar dados de imóveis rurais coletados
- Identificar padrões e correlações
- Detectar e tratar outliers
- Preparar dados para modelagem

In [None]:
# Import the data-handling and plotting libraries used by the cells below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Suppress all warnings from here on so cell output stays uncluttered.
warnings.filterwarnings('ignore')

# Plot configuration. Order matters: the base style is applied first, then
# the colour palette and the default figure size are set on top of it.
plt.style.use('default')
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = (12, 8)

print("‚úÖ Bibliotecas importadas com sucesso!")

In [None]:
# Load the dataset from a CSV file addressed by a relative path.
# `df` is the raw DataFrame that the cells below read from.
data_path = Path('../data/imoveis_rurais.csv')
df = pd.read_csv(data_path)

# Report the row and column counts and the overall shape.
print(f"üìä Dataset carregado com {len(df)} registros e {len(df.columns)} colunas")
print(f"üìè Dimens√µes: {df.shape}")
# Trailing expression: in a notebook its value (the first rows) is rendered
# as the cell output.
df.head()

In [None]:
# General overview of the dataset: structure first, then summary statistics.
print("üìã Informa√ß√µes do Dataset:")
print("=" * 40)
# DataFrame.info() prints its report itself, so it needs no print() wrapper.
df.info()

print("\nüìä Estat√≠sticas Descritivas:")
print("=" * 40)
# Trailing expression: in a notebook its value is rendered as the cell output.
df.describe()

In [None]:
# Missing-value audit: absolute count and percentage of nulls per column.
print("üîç Valores Ausentes:")
print("=" * 40)
missing_values = df.isna().sum()
missing_percent = missing_values.div(len(df)).mul(100)

missing_df = pd.DataFrame({
    'Valores Ausentes': missing_values,
    'Percentual (%)': missing_percent,
})

# Keep only the columns that actually have gaps, largest count first.
missing_df = (
    missing_df
    .loc[missing_df['Valores Ausentes'].gt(0)]
    .sort_values(by='Valores Ausentes', ascending=False)
)
print(missing_df)

In [None]:
# Distribution overview on a 2x2 grid: one row per variable, with the
# histogram in the left column and the boxplot in the right column.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# (column, histogram colour, histogram title, boxplot title, unit label)
panels = [
    ('preco', 'skyblue', 'üìä Distribui√ß√£o de Pre√ßos', 'üì¶ Boxplot de Pre√ßos', 'Pre√ßo (R$)'),
    ('area_ha', 'lightgreen', 'üìä Distribui√ß√£o de √Åreas', 'üì¶ Boxplot de √Åreas', '√Årea (hectares)'),
]

for (hist_ax, box_ax), (column, color, hist_title, box_title, unit_label) in zip(axes, panels):
    # Histogram of the variable
    hist_ax.hist(df[column], bins=30, alpha=0.7, color=color, edgecolor='black')
    hist_ax.set_title(hist_title)
    hist_ax.set_xlabel(unit_label)
    hist_ax.set_ylabel('Frequ√™ncia')

    # Boxplot of the same variable
    box_ax.boxplot(df[column])
    box_ax.set_title(box_title)
    box_ax.set_ylabel(unit_label)

plt.tight_layout()
plt.show()

In [None]:
# Scatter plot: price vs. area, with a linear trend line and the
# correlation coefficient between the two columns.
plt.figure(figsize=(12, 8))
plt.scatter(df['area_ha'], df['preco'], alpha=0.6, color='coral')
plt.title('üè† Rela√ß√£o entre √Årea e Pre√ßo dos Im√≥veis Rurais')
plt.xlabel('√Årea (hectares)')
plt.ylabel('Pre√ßo (R$)')
plt.grid(True, alpha=0.3)

# Trend line (degree-1 least-squares fit).
# The fit is computed only on rows where both columns are present: a least
# squares fit cannot handle NaN, whereas the correlation below already skips
# incomplete pairs, so both now use the same set of observations.
fit_data = df[['area_ha', 'preco']].dropna()
z = np.polyfit(fit_data['area_ha'], fit_data['preco'], 1)
p = np.poly1d(z)
plt.plot(fit_data['area_ha'], p(fit_data['area_ha']), "r--", alpha=0.8, label='Linha de Tend√™ncia')
plt.legend()
plt.show()

# Correlation between area and price
correlacao = df['area_ha'].corr(df['preco'])
print(f"üìà Correla√ß√£o √Årea vs Pre√ßo: {correlacao:.3f}")

In [None]:
# Correlation matrix, restricted to the numeric columns.
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numeric_cols].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, 
            annot=True, 
            cmap='coolwarm', 
            center=0,
            square=True,
            fmt='.2f')
plt.title('üî• Matriz de Correla√ß√£o - Vari√°veis Num√©ricas')
plt.tight_layout()
plt.show()

print("\nüîç Correla√ß√µes mais fortes com pre√ßo:")
# Rank by strength (absolute value), not by signed value: a strong negative
# correlation is as informative as a strong positive one and must not end up
# at the bottom of the list. The signed coefficient is still what is printed.
price_corr = correlation_matrix['preco']
price_corr = price_corr.loc[price_corr.abs().sort_values(ascending=False).index]
for var, corr in price_corr.items():
    # Skip the trivial self-correlation of the target column.
    if var != 'preco':
        print(f"  {var}: {corr:.3f}")

In [None]:
# An√°lise de outliers usando IQR
def detect_outliers_iqr(df, column, factor=1.5):
    """Return the rows whose value in ``column`` lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    ``Q1``/``Q3`` are the 25th/75th percentiles of the column and
    ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to inspect; it is not modified.
        column: Name of the column to test.
        factor: Multiplier applied to the IQR when building the fences.
            Defaults to 1.5; larger values flag only more extreme rows.

    Returns:
        A tuple ``(outliers, lower_bound, upper_bound)`` where ``outliers``
        is the subset of ``df`` strictly below ``lower_bound`` or strictly
        above ``upper_bound``.

    Raises:
        ValueError: If ``factor`` is negative.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Strict comparisons: values sitting exactly on a fence are kept, and
    # missing values compare False on both sides, so they are never flagged.
    outliers = df[(values < lower_bound) | (values > upper_bound)]
    return outliers, lower_bound, upper_bound

# Run the IQR detection for both variables up front, then report each one.
price_outliers, price_lower, price_upper = detect_outliers_iqr(df, 'preco')
area_outliers, area_lower, area_upper = detect_outliers_iqr(df, 'area_ha')

# Price: number of flagged rows and the fences used
print(f"üí∞ Outliers em Pre√ßo: {len(price_outliers)} registros")
print(f"   Limites: R$ {price_lower:,.2f} - R$ {price_upper:,.2f}")

# Area: number of flagged rows and the fences used
print(f"üèûÔ∏è Outliers em √Årea: {len(area_outliers)} registros")
print(f"   Limites: {area_lower:.2f} - {area_upper:.2f} hectares")

# Show a handful of flagged price rows, if there are any
if not price_outliers.empty:
    print("\nüí∏ Exemplos de outliers em pre√ßo:")
    print(price_outliers[['preco', 'area_ha', 'cidade']].head())

In [None]:
# Data cleaning: keep only the rows whose price and area both fall inside
# the 5th-95th percentile band of the original dataset.
print("üßπ Iniciando limpeza de dados...")
print(f"   Dataset original: {len(df)} registros")

# Percentile cut-offs (5% and 95%) for each variable
price_p5 = df['preco'].quantile(0.05)
price_p95 = df['preco'].quantile(0.95)
area_p5 = df['area_ha'].quantile(0.05)
area_p95 = df['area_ha'].quantile(0.95)

# Both ends of each band are inclusive; copy so later edits do not touch df
df_clean = df.loc[
    df['preco'].between(price_p5, price_p95)
    & df['area_ha'].between(area_p5, area_p95)
].copy()

print(f"   Dataset limpo: {len(df_clean)} registros")
print(f"   Registros removidos: {len(df) - len(df_clean)}")
print(f"   Percentual mantido: {len(df_clean)/len(df)*100:.1f}%")

In [None]:
# Before/after comparison on a 2x2 grid: one row per variable, with the raw
# data on the left (red) and the cleaned data on the right (green).
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (column, x-axis label, title before cleaning, title after cleaning)
comparison = [
    ('preco', 'Pre√ßo (R$)', 'üìä Pre√ßos - Antes da Limpeza', '‚úÖ Pre√ßos - Ap√≥s Limpeza'),
    ('area_ha', '√Årea (hectares)', 'üìä √Årea - Antes da Limpeza', '‚úÖ √Årea - Ap√≥s Limpeza'),
]

for axes_row, (column, x_label, before_title, after_title) in zip(axes, comparison):
    for ax, frame, color, title in (
        (axes_row[0], df, 'red', before_title),
        (axes_row[1], df_clean, 'green', after_title),
    ):
        ax.hist(frame[column], bins=30, alpha=0.7, color=color, edgecolor='black')
        ax.set_title(title)
        ax.set_xlabel(x_label)

plt.tight_layout()
plt.show()

In [None]:
# Final statistics: summarise the cleaned dataset (row count, mean/median of
# price and area, and the area-price correlation after cleaning).
print("üìà Estat√≠sticas Finais - Dataset Limpo:")
print("=" * 50)
print(f"üìä Registros: {len(df_clean)}")
print(f"üí∞ Pre√ßo m√©dio: R$ {df_clean['preco'].mean():,.2f}")
print(f"üí∞ Pre√ßo mediano: R$ {df_clean['preco'].median():,.2f}")
print(f"üèûÔ∏è √Årea m√©dia: {df_clean['area_ha'].mean():.2f} hectares")
print(f"üèûÔ∏è √Årea mediana: {df_clean['area_ha'].median():.2f} hectares")
print(f"üìà Correla√ß√£o √Årea vs Pre√ßo: {df_clean['area_ha'].corr(df_clean['preco']):.3f}")

# Save the cleaned dataset as CSV, without writing the DataFrame index.
output_path = Path('../data/imoveis_rurais_tratados.csv')
df_clean.to_csv(output_path, index=False)
print(f"\nüíæ Dataset limpo salvo em: {output_path}")
print("‚úÖ An√°lise explorat√≥ria conclu√≠da!")