<a href="https://colab.research.google.com/github/kimyeonseo666/hypothesis_1/blob/main/hypothesis_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ================================
# 가설 1 (약화된 형태) + anomaly_score 완전 통합
# "시세 대비 과도하게 벗어난 가격 매물은
# 가격 분포 상에서 통계적으로 분리되는 집단을 형성한다"
# ================================

import os
import numpy as np
import pandas as pd
from scipy import stats

# -----------------------------
# 0. Set the working directory
# -----------------------------
# NOTE(review): the path looks like a mounted Google Drive inside Colab —
# confirm the drive is mounted before this cell runs.
os.chdir('/content/drive/MyDrive/Colab Notebooks/hondong')

# -----------------------------
# 1. Load the data
# -----------------------------
# read_csv already returns a fresh DataFrame, so no defensive copy is needed.
df = pd.read_csv("daangn_list_detail_with_missing_anomaly.csv", encoding="utf-8-sig")

# -----------------------------
# 2. Build the price metric
# -----------------------------
# Non-numeric values become NaN instead of raising.
df["price"] = pd.to_numeric(df["price"], errors="coerce")
df["exclusive_area"] = pd.to_numeric(df["exclusive_area"], errors="coerce")

# A zero (or negative) area would turn the ratio into +/-inf (or a meaningless
# negative price), and inf values later break model fitting. Treat such areas
# as missing for the ratio only; the stored "exclusive_area" column is untouched.
positive_area = df["exclusive_area"].where(df["exclusive_area"] > 0)
df["price_per_area"] = df["price"] / positive_area

# -----------------------------
# 3. Regional reference price (district, "구", level)
# -----------------------------
# Capture everything up to and including the first "구" in the address.
# expand=False makes the single capture group come back as a Series.
# NOTE(review): the lazy match stops at the FIRST "구", so a district whose name
# starts with that character (e.g. "구로구") is truncated — confirm this is
# acceptable for the addresses in this data set.
df["region"] = df["address"].str.extract(r"(.*?구)", expand=False)

region_median = df.groupby("region")["price_per_area"].median()
df["region_median_price"] = df["region"].map(region_median)

# -----------------------------
# 4. Relative deviation from the regional median
# -----------------------------
# Guard the denominator: a zero median would yield inf/NaN deviations.
positive_median = df["region_median_price"].where(df["region_median_price"] > 0)
df["price_deviation"] = (df["price_per_area"] - df["region_median_price"]) / positive_median

# -----------------------------
# 5. IQR-based outlier flag
# -----------------------------
Q1 = df["price_deviation"].quantile(0.25)
Q3 = df["price_deviation"].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5*IQR
upper = Q3 + 1.5*IQR

# Rows with a NaN deviation compare False on both sides, so they are not flagged.
df["price_outlier"] = (df["price_deviation"] < lower) | (df["price_deviation"] > upper)

# -----------------------------
# 6. Distribution separation test (Mann–Whitney U)
# -----------------------------
normal = df.loc[~df["price_outlier"], "price_deviation"].dropna()
outlier = df.loc[df["price_outlier"], "price_deviation"].dropna()

# The test needs more than one observation in each group; otherwise report NaN.
if len(normal) > 1 and len(outlier) > 1:
    stat, p = stats.mannwhitneyu(normal, outlier, alternative="two-sided")
else:
    stat, p = np.nan, np.nan

# -----------------------------
# 7. 효과 크기 (Cliff's delta)
# -----------------------------
def cliffs_delta(x, y):
    """Return Cliff's delta effect size between two 1-D samples.

    Cliff's delta is ``(#(x_i > y_j) - #(x_i < y_j)) / (len(x) * len(y))``
    over all pairs, so it lies in ``[-1, 1]``: ``1`` means every value of
    ``x`` exceeds every value of ``y``, ``-1`` the opposite, ``0`` balanced.

    Args:
        x: First sample (1-D sequence or array of numbers).
        y: Second sample (1-D sequence or array of numbers).

    Returns:
        The effect size as a float, or ``nan`` when either sample is empty
        (no pairs exist, so the statistic is undefined).
    """
    x_arr = np.asarray(x)
    y_arr = np.asarray(y)
    n_x, n_y = len(x_arr), len(y_arr)
    if n_x == 0 or n_y == 0:
        # Undefined without pairs; NaN instead of a ZeroDivisionError.
        return float("nan")

    # Broadcast to an (n_x, n_y) comparison grid instead of looping in Python.
    # NaN entries compare False both ways, so they count for neither side.
    greater = np.count_nonzero(x_arr[:, None] > y_arr[None, :])
    less = np.count_nonzero(x_arr[:, None] < y_arr[None, :])
    return float((greater - less) / (n_x * n_y))

# Effect size of the outlier group relative to the normal group.
delta = cliffs_delta(outlier.values, normal.values)

# -----------------------------
# 8. Print the results
# -----------------------------
# Assemble the report first, then emit it in one call; the output is the
# same five lines, each terminated by a newline.
report_lines = [
    "===== 가격 이상치 분포 분리 검증 =====",
    f"정상 매물 수: {len(normal)}",
    f"가격 이상치 매물 수: {len(outlier)}",
    f"Mann–Whitney U p-value: {p}",
    f"Cliff’s delta: {delta:.3f}",
]
print("\n".join(report_lines))



===== 가격 이상치 분포 분리 검증 =====
정상 매물 수: 1025
가격 이상치 매물 수: 26
Mann–Whitney U p-value: 2.8460517249133e-18
Cliff’s delta: 1.000


In [None]:
from sklearn.ensemble import IsolationForest
import numpy as np

# -----------------------------
# Valid rows: listings whose price_deviation could be computed
# -----------------------------
valid_idx = df["price_deviation"].notna()

# Single-feature training matrix for the Isolation Forest
X = df.loc[valid_idx, ["price_deviation"]].values

# -----------------------------
# Build the Isolation Forest model
# -----------------------------
iso = IsolationForest(
    n_estimators=200,      # number of trees
    max_samples='auto',    # 'auto' = min(256, n_samples) rows per tree (not the full data set)
    contamination=0.05,    # expected share of outliers
    random_state=42
)

# -----------------------------
# Anomaly score (scaled to 0~1, larger = more anomalous)
# -----------------------------
# Fitting on an empty matrix raises, so skip the model when no row is valid.
if valid_idx.any():
    iso.fit(X)

    # decision_function: the lower the value, the more abnormal the sample;
    # negative values are flagged as outliers. It is not bounded to a fixed
    # range, hence the min-max rescaling below.
    score = iso.decision_function(X)
    score_span = score.max() - score.min()
    if score_span > 0:
        score_scaled = (score - score.min()) / score_span
    else:
        # All samples scored identically: nothing stands out, and dividing
        # by a zero span would produce NaN. Map everything to score 0.
        score_scaled = np.ones_like(score)
    df.loc[valid_idx, "price_anomaly_score"] = 1 - score_scaled  # outliers approach 1

# -----------------------------
# Listings with a missing price_deviation (NaN)
# -----------------------------
df.loc[~valid_idx, "price_anomaly_score"] = 1  # treated as fully anomalous

# -----------------------------
# Inspect the result
# -----------------------------
print(df[["price_per_area", "price_deviation", "price_anomaly_score"]].head(10))


   price_per_area  price_deviation  price_anomaly_score
0    3.846154e+06        -0.434096             0.186574
1    6.555724e+06         0.216906             0.061478
2    5.042864e+06        -0.063918             0.062992
3    8.260670e+06         0.215435             0.057949
4    1.209921e+04        -0.997754             0.271187
5             NaN              NaN             1.000000
6    6.654567e+06         0.466223             0.132456
7    1.399795e+07         1.598370             0.649470
8    2.791996e+04        -0.993848             0.312129
9             NaN              NaN             1.000000


In [None]:
# Persist the scored data set

# Helper columns created during the price analysis; they are left out of the
# saved file. errors="ignore" skips any that are not present.
drop_cols = [
    "price_per_area",
    "region",
    "region_median_price",
    "price_deviation",
    "price_outlier",
]
df_original = df.drop(labels=drop_cols, axis=1, errors="ignore")

# Write the result as CSV (UTF-8 with BOM, without the index column)
output_path = "daangn_list_detail_with_price_anomaly_score.csv"
df_original.to_csv(output_path, encoding="utf-8-sig", index=False)

# Notebook display of the first rows
df_original.head(10)

Unnamed: 0,area,identifier,description,image_count,image,building_name,building_usage,exclusive_area,floor,direction,...,built_year,total_floor,price,address,register_date,image_anomaly_score,text_anomaly_score,build_anomaly_score,missing_anomaly_score,price_anomaly_score
0,남현,https://www.daangn.com/kr/realty/%ED%88%AC%EB%...,방배동 신축급 투룸 전세 (1억 5천)관리비 5만 원(수도 포함) / 인덕션 1구 ...,5,https://img.kr.gcp-karroter.net/realty/article...,,단독주택,39.0,2.0,남향,...,1983-07-08,,150000000.0,서울특별시 서초구 청두곶13길 14-3 (방배동),2025-11-05,0.222264,0.332791,0.426201,0.021265,0.186574
1,남현,https://www.daangn.com/kr/realty/%EC%98%A4%ED%...,안전한 매물만 보여드립니다(●&#39;◡&#39;●)“전입신고 및 확정일자” 가능ㅡ...,6,https://img.kr.gcp-karroter.net/realty/article...,,단독주택,19.83,3.0,남서향,...,2020-03-19,4.0,130000000.0,서울특별시 동작구 사당로28길 55 (사당동),2025-11-09,0.14222,0.161512,0.108086,0.00696,0.061478
2,남현,https://www.daangn.com/kr/realty/%EC%98%A4%ED%...,"⭕전입신고, 확정일자 , 임대차신고 가능! ⭕ 안되는 곳은 거래하지않으니, 안심하...",4,https://img.kr.gcp-karroter.net/realty/article...,,단독주택,19.83,1.0,동향,...,2017-11-22,3.0,100000000.0,서울특별시 동작구 사당로20길 85 (사당동),2025-11-09,0.169314,0.215661,0.018252,0.00696,0.062992
3,남현,https://www.daangn.com/kr/realty/%EC%98%A4%ED%...,안전하고 깨끗하고 따듯한 집을 찾으신다면 여기입니다!사는동안 굉장히 만족스러웠습니다...,12,https://img.kr.gcp-karroter.net/realty/article...,,공동주택,21.79,2.0,북서향,...,2016-12-08,5.0,180000000.0,서울특별시 서초구 서초대로33길 55 (방배동),2025-10-23,0.035081,0.262578,0.273456,0.00696,0.057949
4,남현,https://www.daangn.com/kr/realty/%EB%B6%84%EB%...,안전한 매물만 보여드립니다(●&#39;◡&#39;●)“전입신고 및 확정일자” 가능ㅡ...,8,https://img.kr.gcp-karroter.net/realty/article...,,제2종 근린생활시설,33.06,2.0,남서향,...,2017-05-15,5.0,400000.0,서울특별시 동작구 사당로22나길 6 (사당동),2025-11-09,0.296171,0.178461,0.451611,0.00696,0.271187
5,남현,https://www.daangn.com/kr/realty/%EC%98%A4%ED%...,개인 테라스 및 복도공간이 있어 짐이 많으신 분들도 걱정없습니다! 지정주차 사용중이...,14,https://img.kr.gcp-karroter.net/realty/article...,THE AGIT,단독주택,,4.0,서향,...,2016-04-27,4.0,600000.0,서울특별시 동작구 동작대로9길 68-7 (사당동),2025-11-09,0.222698,0.159346,0.084473,0.456434,1.0
6,남현,https://www.daangn.com/kr/realty/%EB%B6%84%EB%...,"⭕전입신고, 확정일자 , 임대차신고 가능! ⭕ 안되는 곳은 거래하지않으니, 안심하...",6,https://img.kr.gcp-karroter.net/realty/article...,,단독주택,33.06,1.0,남향,...,2016-10-13,3.0,220000000.0,서울특별시 관악구 남현길 24-1 (남현동),2025-11-05,0.3349,0.211654,0.165837,0.00696,0.132456
7,남현,https://www.daangn.com/kr/realty/%ED%88%AC%EB%...,"MBC-구해줘! 홈즈 출연 부동산 중개 업무 외에도 부동산 연구, 개발 등의 전문경...",20,https://img.kr.gcp-karroter.net/realty/article...,남성역 Central View,공동주택,29.29,5.0,남동향,...,2024-02-05,,410000000.0,"서울특별시 동작구 사당로16나길 55 (사당동, 남성역 Central View)",2025-10-26,0.123933,0.124432,0.751655,0.139379,0.64947
8,남현,https://www.daangn.com/kr/realty/%EC%98%A4%ED%...,안전한 매물만 보여드립니다(●&#39;◡&#39;●)“전입신고 및 확정일자” 가능ㅡ...,7,https://img.kr.gcp-karroter.net/realty/article...,우리빌남현,단독주택,21.49,4.0,남동향,...,2020-08-05,4.0,600000.0,서울특별시 관악구 남부순환로256길 13-11 (남현동),2025-11-08,0.128567,0.094813,0.07623,0.0,0.312129
9,남현,https://www.daangn.com/kr/realty/%ED%88%AC%EB%...,"1년 전 부분 리모델링한 깨끗한 원거실 투룸입니다.(싱크대, 인덕션, 벽지, 보일러...",8,https://img.kr.gcp-karroter.net/realty/article...,,공동주택,,5.0,남향,...,2011-03-18,,340000000.0,서울특별시 서초구 서초대로23길 96-4 (방배동),2025-10-20,0.067636,0.307865,0.384175,0.477699,1.0
