# Regressão Linear

Neste notebook, vamos explorar a regressão linear em detalhes:
- Visualizações principais
- Dispersão para múltiplas variáveis
- Análise de resíduos
- Predição de preços

In [None]:
# Importar bibliotecas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
# Apply seaborn's white-grid theme to every figure in the notebook
# (set_theme is the preferred spelling of the older sns.set alias).
sns.set_theme(style="whitegrid")

# Print NumPy floats with 4 decimals and without scientific notation
np.set_printoptions(precision=4, suppress=True)

# Load the housing data set.
# The upstream file has no header row and separates its fields with runs of
# spaces, so it is parsed with a whitespace regex. A literal ", " separator
# would leave every row in a single column and make the column assignment
# below fail.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv"
column_names = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV"]
data = pd.read_csv(url, header=None, sep=r"\s+")

# Raises ValueError if the parse did not yield exactly len(column_names) columns
data.columns = column_names

# Show the first 5 rows
data.head()

## 1. Estatísticas Descritivas

Vamos começar explorando estatísticas descritivas dos dados.

In [None]:
# Descriptive statistics, one row per numeric variable
stats = data.describe().T
stats["IQR"] = stats["75%"] - stats["25%"]  # interquartile range

# Tukey fences: values more than 1.5 * IQR outside the quartiles count as outliers
lower_fence = stats["25%"] - 1.5 * stats["IQR"]
upper_fence = stats["75%"] + 1.5 * stats["IQR"]

# Compare each column against its own fences (the Series are aligned on column
# names), then reduce to ONE flag per variable. Without the reduction the mask
# has one row per observation and cannot be stored in this per-variable table.
numeric_data = data[stats.index]
outlier_mask = numeric_data.lt(lower_fence, axis="columns") | numeric_data.gt(upper_fence, axis="columns")
stats["Outliers"] = np.where(outlier_mask.any(axis=0), "Sim", "Não")

# Display the summary table
stats

## 2. Visualização da Distribuição das Variáveis

Vamos visualizar a distribuição das variáveis numéricas.

In [None]:
# Histogram + KDE for every column, laid out on a grid of 4 columns.
# The number of rows is derived from the number of variables, so the grid
# always has enough panels.
n_cols = 4
n_vars = len(data.columns)
n_rows = int(np.ceil(n_vars / n_cols))

fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(16, 3 * n_rows))
fig.suptitle('Distribuição das Variáveis', fontsize=16, fontweight='bold')

flat_axes = axes.flatten()
for ax, column in zip(flat_axes, data.columns):
    sns.histplot(data[column], kde=True, ax=ax, color='skyblue', bins=30)
    ax.set_title(column, fontsize=12, fontweight='bold')
    # The panel title already names the variable; axis labels would be redundant
    ax.set_xlabel('')
    ax.set_ylabel('')

# Hide the panels left over when the variable count does not fill the grid
for ax in flat_axes[n_vars:]:
    ax.set_visible(False)

# Leave room at the top for the suptitle
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

## 3. Correlações entre Variáveis

Analisando correlações para entender relações entre variáveis.

In [None]:
# Pairwise correlation matrix of all columns
correlation_matrix = data.corr()

# Heatmap of the correlation matrix.
# vmin/vmax/center pin the diverging colormap to the full [-1, 1] range with
# its neutral colour at 0, so colour intensity is comparable for positive and
# negative correlations.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    linewidths=.5,
    ax=ax,
)
ax.set_title('Mapa de Calor da Correlação entre Variáveis', fontsize=16, fontweight='bold')

plt.show()

## 4. Preparação dos Dados para Regressão

Nesta etapa, vamos preparar os dados, treinar o modelo de regressão linear e avaliá-lo nos conjuntos de treino e teste.

In [None]:
# Split into explanatory variables (X) and the target (y)
X = data.drop("MEDV", axis=1)
y = data["MEDV"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features. The scaler is fitted on the training split only and
# then applied to the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit an ordinary least-squares linear regression on the scaled training data
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predictions for both splits
y_pred_train = model.predict(X_train_scaled)
y_pred_test = model.predict(X_test_scaled)

# Evaluation metrics: mean squared error and coefficient of determination
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

# Report the results
print(f"Conjunto de Treino: MSE = {mse_train:.4f}, R² = {r2_train:.4f}")
print(f"Conjunto de Teste: MSE = {mse_test:.4f}, R² = {r2_test:.4f}")

## 5. Visualizações das Predições

Vamos visualizar as predições do modelo.

In [None]:
# Scatter of actual vs predicted target values on the test split
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_test, alpha=0.7, color='blue')

# Dashed red diagonal: where the points would fall if predictions were perfect
target_range = [y_test.min(), y_test.max()]
ax.plot(target_range, target_range, 'r--', lw=2)

ax.set_xlabel('Valores Reais')
ax.set_ylabel('Valores Preditos')
ax.set_title('Valores Reais vs Preditos')
ax.grid(True)
plt.show()

## 6. Visualizações Principais

Vamos criar visualizações para entender melhor o modelo:
1. Valores Reais vs Preditos
2. Dispersão da Feature Mais Importante
3. Importância das Features (Top 10)

In [None]:
# Recompute the predictions so this cell uses values from the current model
y_pred_train = model.predict(X_train_scaled)
y_pred_test = model.predict(X_test_scaled)

# Features were standardised before fitting, so the absolute coefficient is
# used here as the importance score of each feature.
feature_importance = np.abs(model.coef_)

# One figure with three side-by-side panels
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))

# Panel 1: actual vs predicted values (dashed diagonal = perfect prediction)
target_range = [y_test.min(), y_test.max()]
ax1.scatter(y_test, y_pred_test, alpha=0.5, edgecolors='k', linewidth=0.5)
ax1.plot(target_range, target_range, 'r--', lw=2)
ax1.set_xlabel('Valores Reais', fontsize=12, fontweight='bold')
ax1.set_ylabel('Valores Preditos', fontsize=12, fontweight='bold')
ax1.set_title('Valores Reais vs Preditos\n(Linha vermelha = predição perfeita)', fontsize=13, fontweight='bold')
ax1.grid(True, alpha=0.3)

# Panel 2: actual and predicted target against the most important feature
# (x axis is in standardised units because the scaled matrix is plotted)
most_important_idx = np.argmax(feature_importance)
most_important_feature = X.columns[most_important_idx]
feature_values = X_test_scaled[:, most_important_idx]

ax2.scatter(feature_values, y_test, alpha=0.5, label='Real', color='blue', s=30)
ax2.scatter(feature_values, y_pred_test, alpha=0.5, label='Predito', color='red', s=30)
ax2.set_xlabel(most_important_feature, fontsize=12, fontweight='bold')
ax2.set_ylabel('Preço (MEDV)', fontsize=12, fontweight='bold')
ax2.set_title(
    f'Feature Mais Importante: {most_important_feature}\nCoeficiente: {model.coef_[most_important_idx]:.4f}',
    fontsize=13, fontweight='bold',
)
ax2.legend()
ax2.grid(True, alpha=0.3)

# Panel 3: ranking of the most important features.
# top_n is capped at the number of features so the bar positions and the bar
# widths always have the same length.
top_n = min(10, len(feature_importance))
top_indices = np.argsort(feature_importance)[-top_n:][::-1]
top_features = [X.columns[i] for i in top_indices]
top_weights = feature_importance[top_indices]

colors = plt.cm.viridis(np.linspace(0, 1, top_n))
bars = ax3.barh(range(top_n), top_weights, color=colors, edgecolor='black')
ax3.set_yticks(range(top_n))
ax3.set_yticklabels(top_features)
ax3.set_xlabel('Importância (|Coeficiente|)', fontsize=12, fontweight='bold')
ax3.set_title(f'Top {top_n} Features Mais Importantes', fontsize=13, fontweight='bold')
ax3.grid(True, alpha=0.3, axis='x')
ax3.invert_yaxis()  # largest bar on top

plt.tight_layout()
plt.show()

## 7. Dispersão para Múltiplas Variáveis

Análise detalhada das 4 features mais importantes.

In [None]:
# Pick the (up to) 4 features with the largest absolute coefficient
feature_importance = np.abs(model.coef_)
n_top = min(4, len(feature_importance))
top_features_idx = np.argsort(feature_importance)[-n_top:][::-1]
top_features = [X.columns[i] for i in top_features_idx]

# One panel per selected feature
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Análise de Dispersão: Features Principais vs Preço', fontsize=16, fontweight='bold')

# Train-split correlation of each selected feature with the target, computed
# once here and reused by the printed summary below.
correlations = {}

for ax, feature_idx, feature_name in zip(axes.flat, top_features_idx, top_features):
    # Original (unscaled) feature values for both splits
    x_train_feat = X_train.iloc[:, feature_idx].values
    x_test_feat = X_test.iloc[:, feature_idx].values

    # Scatter of feature vs target
    ax.scatter(x_train_feat, y_train, alpha=0.3, s=20, label='Treino', color='blue', edgecolors='none')
    ax.scatter(x_test_feat, y_test, alpha=0.3, s=20, label='Teste', color='green', edgecolors='none')

    # Univariate least-squares trend line fitted on the training split
    trend = np.poly1d(np.polyfit(x_train_feat, y_train, 1))
    x_line = np.linspace(x_train_feat.min(), x_train_feat.max(), 100)
    ax.plot(x_line, trend(x_line), "r--", linewidth=2, label='Tendência Linear')

    # Pearson correlation between the feature and the target (training split)
    correlation = np.corrcoef(x_train_feat, y_train)[0, 1]
    correlations[feature_name] = correlation

    ax.set_xlabel(feature_name, fontsize=11, fontweight='bold')
    ax.set_ylabel('Preço (MEDV)', fontsize=11, fontweight='bold')
    ax.set_title(
        f'{feature_name}\nCorrelação: {correlation:.3f} | Coef: {model.coef_[feature_idx]:.3f}',
        fontsize=12, fontweight='bold',
    )
    ax.legend(loc='best')
    ax.grid(True, alpha=0.3)

# Hide panels that received no feature (only when the model has fewer than 4)
for ax in axes.flat[n_top:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

# Text summary: coefficient, correlation and direction of effect per feature
print("\n📊 Interpretação das Features Principais:")
for rank, (feature_idx, feature_name) in enumerate(zip(top_features_idx, top_features), start=1):
    coef = model.coef_[feature_idx]
    correlation = correlations[feature_name]
    print(f"{rank}. {feature_name}:")
    print(f"   Coeficiente: {coef:.4f} | Correlação: {correlation:.4f}")
    print(f"   {'Impacto positivo' if coef > 0 else 'Impacto negativo'} no preço\n")

## 8. 🎁 BÔNUS: Análise de Resíduos

A análise de resíduos é fundamental para validar as premissas da regressão linear:
1. **Linearidade**: Relação linear entre X e y
2. **Homocedasticidade**: Variância constante dos erros
3. **Normalidade**: Resíduos seguem distribuição normal
4. **Independência**: Ausência de padrões nos resíduos

### O que são Resíduos?
Resíduos são as diferenças entre os valores reais e preditos: $e_i = y_i - \hat{y}_i$

In [None]:
# Imported in this cell on purpose: an earlier cell binds the name `stats` to a
# DataFrame, and this import rebinds it to the scipy.stats module from here on.
from scipy import stats
from scipy.interpolate import make_interp_spline

# Tunables used throughout this cell
zero_tolerance = 0.1        # |mean| / |median| below this counts as "close to 0"
max_shapiro_samples = 5000  # Shapiro-Wilk is only run on at most this many points
shapiro_seed = 42           # seed for the (optional) subsample, for reproducibility
rule = "=" * 70

# Residuals on the test split: actual minus predicted
residuals = y_test - y_pred_test

# Plain NumPy views. `residuals` keeps the row labels of y_test, so positional
# indexing (e.g. with argsort output) must go through arrays, not the Series.
resid_values = np.asarray(residuals, dtype=float)
pred_values = np.asarray(y_pred_test, dtype=float)

# One mean / sample standard deviation (ddof=1) reused by every plot and table
mu = resid_values.mean()
sigma = resid_values.std(ddof=1)
standardized_residuals = (residuals - mu) / sigma
abs_std_values = np.abs(np.asarray(standardized_residuals, dtype=float))

# Figure with 4 diagnostic panels.
# The title carries no emoji because matplotlib's default font has no glyph for it.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('ANÁLISE DE RESÍDUOS - Validação das Premissas da Regressão Linear',
             fontsize=16, fontweight='bold', y=1.00)

# 1. Residuals vs predicted values
ax = axes[0, 0]
ax.scatter(pred_values, resid_values, alpha=0.5, edgecolors='k', linewidth=0.5, s=30)
ax.axhline(y=0, color='r', linestyle='--', linewidth=2, label='Resíduo = 0')
ax.set_xlabel('Valores Preditos', fontsize=12, fontweight='bold')
ax.set_ylabel('Resíduos', fontsize=12, fontweight='bold')
ax.set_title('1. Resíduos vs Predições\n✓ Padrão aleatório indica homocedasticidade',
             fontsize=12, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# 2. Histogram of the residuals with a fitted normal density on top
ax = axes[0, 1]
ax.hist(resid_values, bins=50, density=True, alpha=0.7, color='skyblue', edgecolor='black', label='Resíduos')
ax.axvline(x=0, color='r', linestyle='--', linewidth=2, label='Média = 0')
x = np.linspace(resid_values.min(), resid_values.max(), 100)
ax.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', linewidth=2, label=f'Normal(μ={mu:.2f}, σ={sigma:.2f})')
ax.set_xlabel('Resíduos', fontsize=12, fontweight='bold')
ax.set_ylabel('Densidade', fontsize=12, fontweight='bold')
ax.set_title('2. Distribuição dos Resíduos\n✓ Deve seguir distribuição normal',
             fontsize=12, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

# 3. Q-Q plot against the normal distribution
ax = axes[1, 0]
stats.probplot(resid_values, dist="norm", plot=ax)
qq_points, qq_line = ax.get_lines()[:2]  # probplot draws the points first, then the fit line
qq_points.set_markerfacecolor('blue')
qq_points.set_markeredgecolor('black')
qq_points.set_markersize(5)
qq_line.set_color('red')
qq_line.set_linewidth(2)
ax.set_title('3. Q-Q Plot\n✓ Pontos na linha diagonal indicam normalidade',
             fontsize=12, fontweight='bold')
ax.grid(True, alpha=0.3)

# 4. Scale-location plot: |standardised residual| vs predicted value
ax = axes[1, 1]
ax.scatter(pred_values, abs_std_values, alpha=0.5, edgecolors='k', linewidth=0.5, s=30)
ax.axhline(y=0, color='gray', linestyle='-', linewidth=1)
ax.axhline(y=2, color='orange', linestyle=':', linewidth=2, label='±2σ (95%)')
ax.axhline(y=3, color='red', linestyle=':', linewidth=2, label='±3σ (99.7%)')

# Trend line: spline through every 10th point, ordered by predicted value.
# Both arrays are indexed positionally; indexing the labelled Series with
# argsort positions raises KeyError, which previously was silently swallowed.
indices = np.argsort(pred_values)
knots_x = pred_values[indices][::10]
knots_y = abs_std_values[indices][::10]
if len(knots_x) >= 2:
    try:
        # The spline degree cannot exceed (number of knots - 1)
        spl = make_interp_spline(knots_x, knots_y, k=min(3, len(knots_x) - 1))
    except ValueError as err:
        # e.g. duplicated predicted values make x not strictly increasing
        print(f"Linha de tendência omitida: {err}")
    else:
        # Evaluate only between the first and last knot to avoid extrapolation
        x_smooth = np.linspace(knots_x[0], knots_x[-1], 100)
        y_smooth = spl(x_smooth)
        ax.plot(x_smooth, y_smooth, 'b-', linewidth=2, alpha=0.5, label='Tendência')

ax.set_xlabel('Valores Preditos', fontsize=12, fontweight='bold')
ax.set_ylabel('|Resíduos Padronizados|', fontsize=12, fontweight='bold')
ax.set_title('4. Scale-Location Plot\n✓ Linha horizontal indica variância constante',
             fontsize=12, fontweight='bold')
ax.legend(loc='upper right')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics of the residuals
median = np.median(resid_values)
near_zero = '✓ Próximo de 0'
far_from_zero = '✗ Distante de 0'

print("\n" + rule)
print("📊 ESTATÍSTICAS DOS RESÍDUOS")
print(rule)
print(f"{'Métrica':<30} {'Valor':>15} {'Interpretação'}")
print("-" * 70)
print(f"{'Média':<30} {mu:>15.6f} {near_zero if abs(mu) < zero_tolerance else far_from_zero}")
print(f"{'Desvio Padrão':<30} {sigma:>15.4f}")
print(f"{'Mínimo':<30} {resid_values.min():>15.4f}")
print(f"{'Máximo':<30} {resid_values.max():>15.4f}")
print(f"{'Mediana':<30} {median:>15.4f} {near_zero if abs(median) < zero_tolerance else far_from_zero}")

# Normality test (Shapiro-Wilk). Subsample only when there are more residuals
# than the cap, using a seeded generator so reruns give the same p-value.
if len(resid_values) > max_shapiro_samples:
    rng = np.random.default_rng(shapiro_seed)
    sample_residuals = rng.choice(resid_values, max_shapiro_samples, replace=False)
else:
    sample_residuals = resid_values
sample_size = len(sample_residuals)
statistic, p_value = stats.shapiro(sample_residuals)

print("\n" + rule)
print("🔬 TESTE DE NORMALIDADE (Shapiro-Wilk)")
print(rule)
print(f"Estatística W: {statistic:.6f}")
print(f"p-valor: {p_value:.6f}")
print(f"Conclusão (α=0.05): {'✓ Resíduos são normais' if p_value > 0.05 else '✗ Resíduos não são perfeitamente normais'}")
print(f"\nInterpretação: {'Os resíduos seguem distribuição normal' if p_value > 0.05 else 'Há desvios da normalidade'}")

# Count observations beyond 2 and 3 standard deviations
outliers_2sigma = int(np.sum(abs_std_values > 2))
outliers_3sigma = int(np.sum(abs_std_values > 3))
total = len(resid_values)

print("\n" + rule)
print("⚠️ DETECÇÃO DE OUTLIERS")
print(rule)
print(f"Total de observações: {total}")
print(f"Além de ±2σ (esperado ~5%): {outliers_2sigma} ({outliers_2sigma/total*100:.2f}%)")
print(f"Além de ±3σ (esperado ~0.3%): {outliers_3sigma} ({outliers_3sigma/total*100:.2f}%)")

# Overall assessment: each entry is (description, passed?)
print("\n" + rule)
print("✅ AVALIAÇÃO GERAL DAS PREMISSAS")
print(rule)

checks = [
    ("Média dos resíduos ≈ 0", abs(mu) < zero_tolerance),
    ("Normalidade (Shapiro-Wilk)", p_value > 0.05),
    ("Outliers dentro do esperado", outliers_2sigma / total < 0.10),
]

for check, passed in checks:
    status = "✓" if passed else "✗"
    print(f"{status} {check}")

passed_checks = sum(passed for _, passed in checks)
print(f"\n{rule}")
print(f"Resultado: {passed_checks}/{len(checks)} premissas atendidas")
if passed_checks == len(checks):
    print("🎉 Modelo atende bem às premissas da regressão linear!")
elif passed_checks >= len(checks) * 0.6:
    print("⚠️ Modelo atende parcialmente às premissas. Considere transformações nos dados.")
else:
    print("❌ Modelo não atende às premissas. Considere outros métodos de regressão.")
print(rule)