# Day 8: 회귀분석 (2) - 진단

**날짜**: 2025-07-11

**목표**:
- 잔차 분석
- Q-Q Plot (정규성)
- 잔차 vs 예측값 (등분산성)
- Cook's Distance (영향력)

In [None]:
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats

from utils import *

# Font setup helper, presumably provided by the star import from utils;
# expected to make the Hangul plot labels below render - confirm in utils.
set_korean_font()

# Load the integrated dataset. 'utf-8-sig' transparently strips a leading BOM.
integrated_csv = os.path.join(DATA_PATHS['processed'], 'integrated_data.csv')
df = pd.read_csv(integrated_csv, encoding='utf-8-sig')
print("✅ Day 8: 회귀 진단")

In [None]:
# Rebuild the OLS model so the diagnostics below operate on a fresh fit.
y_col = '인구당_CCTV효과범죄율'
X_cols = ['인구당_방범용', '인구밀도']

y = df[y_col]
X = df[X_cols]

# Add an intercept column to the predictors before fitting.
X_with_const = sm.add_constant(X)
model = sm.OLS(endog=y, exog=X_with_const).fit()

# Residuals and fitted values are reused by the diagnostic plots in later cells.
residuals, fitted = model.resid, model.fittedvalues

## 1. Q-Q Plot (정규성 검사)

In [None]:
# Q-Q plot: ordered residuals against theoretical normal quantiles.
qq_plot_path = os.path.join(DATA_PATHS['figures'], 'day8_qq_plot.png')

fig, ax = plt.subplots(figsize=(8, 6))
stats.probplot(residuals, dist="norm", plot=ax)
# Set the title after probplot has drawn on the axes so this title is the one shown.
ax.set_title('Q-Q Plot (잔차 정규성 검사)', fontsize=14, fontweight='bold')

fig.tight_layout()
fig.savefig(qq_plot_path, dpi=300)
plt.show()

print("해석: 점들이 대각선에 가까우면 정규성 가정 만족")

## 2. 잔차 vs 예측값 (등분산성)

In [None]:
# Residuals vs fitted values: a structureless cloud around zero supports
# the constant-variance assumption.
resid_plot_path = os.path.join(DATA_PATHS['figures'], 'day8_residuals_fitted.png')

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(fitted, residuals, alpha=0.6, edgecolors='black')
# Zero-residual reference line.
ax.axhline(y=0, color='red', linestyle='--', linewidth=2)
ax.set(xlabel='예측값', ylabel='잔차')
ax.set_title('잔차 vs 예측값 (등분산성 검사)', fontsize=14, fontweight='bold')
ax.grid(alpha=0.3)

fig.tight_layout()
fig.savefig(resid_plot_path, dpi=300)
plt.show()

print("해석: 패턴 없이 무작위로 분포하면 등분산성 가정 만족")

## 3. Cook's Distance (영향력 큰 관측치)

In [None]:
# Cook's distance for every observation used in the fit.
influence = model.get_influence()
cooks_d = influence.cooks_distance[0]  # element [1] holds p-values, unused here

# Rule-of-thumb cutoff 4/n. n comes from the diagnostics themselves (one value
# per fitted observation) and is computed once, so the plotted line and the
# filter below always agree even if df is modified between cells.
n_obs = len(cooks_d)
threshold = 4 / n_obs

fig, ax = plt.subplots(figsize=(10, 6))
ax.stem(range(n_obs), cooks_d, markerfmt=',')
ax.set_xlabel('관측치 인덱스')
ax.set_ylabel("Cook's Distance")
ax.set_title("Cook's Distance (영향력 큰 관측치 탐지)", fontsize=14, fontweight='bold')
ax.axhline(threshold, color='red', linestyle='--', label='임계값 (4/n)')
ax.legend()
plt.tight_layout()
plt.savefig(os.path.join(DATA_PATHS['figures'], 'day8_cooks_distance.png'), dpi=300)
plt.show()

# Map flagged observations back to districts through the model's own row
# labels rather than a positional mask on df, and pick the column in a single
# .loc call instead of copying the filtered frame first.
# NOTE(review): assumes model.resid is a pandas Series carrying df's index,
# which holds when the model is fitted on pandas inputs - confirm.
flagged_mask = np.asarray(cooks_d) > threshold
flagged_rows = model.resid.index[flagged_mask]
influential = df.loc[flagged_rows, '자치구'].tolist()
print(f"영향력 큰 자치구: {influential if influential else '없음'}")

In [None]:
# Final cell: signals that all diagnostic cells above have run.
print('✅ Day 8 완료')