In [1]:
# !pip install shap
# !pip install PyMuPDF
# !pip show pandas scikit-learn PyMuPDF
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import shap

Name: pandas
Version: 2.3.1
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: 
Author-email: The Pandas Development Team <pandas-dev@python.org>
License: BSD 3-Clause License

 Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
 All rights reserved.

 Copyright (c) 2011-2023, Open source contributors.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

 * Neither the name of the copyright holder nor the names of its
   contribut

## 샘플 데이터 만들기 - Feature Engineering

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# --- 1. 데이터 생성을 위한 기본 설정 ---
num_samples = 5000
np.random.seed(42) # 재현성을 위해 시드값 고정

# --- 2. 각 특성별 데이터 생성 ---
# 실거래가 정보 (단위: 만 원) - 분포를 넓히고 일부 고가 매물 추가
prices = np.random.normal(loc=50000, scale=25000, size=num_samples)
high_prices = np.random.normal(loc=150000, scale=50000, size=int(num_samples * 0.05))
prices[:len(high_prices)] = high_prices
real_prices = np.abs(prices).astype(int)
real_prices = np.where(real_prices < 10000, 10000, real_prices)

# 건축물 유형 (다세대, 연립주택 추가)
building_types = np.random.choice(
    ['아파트', '빌라', '오피스텔', '다세대주택', '단독주택', '연립주택', '다가구주택'],
    num_samples,
    p=[0.4, 0.2, 0.15, 0.1, 0.05, 0.05, 0.05]
)

# 불법 건축 여부 (위험 데이터 비중 약간 증가)
is_illegal_build = np.random.choice([0, 1], num_samples, p=[0.85, 0.15])

# 건축물 용도 (Categorical)
building_uses = np.random.choice(['주거용', '근린생활시설', '업무시설'], num_samples, p=[0.8, 0.15, 0.05])

# 용도 위반 여부 (Boolean)
is_use_violation = np.where(building_uses != '주거용', np.random.choice([0, 1], num_samples, p=[0.6, 0.4]), 0)

# 근저당권 개수, 소유자변경 횟수 (Count)
num_mortgages = np.random.choice([0, 1, 2, 3, 4, 5], num_samples, p=[0.4, 0.3, 0.15, 0.1, 0.03, 0.02])
owner_change_dist = [0.5, 0.2, 0.1, 0.1, 0.05, 0.05]
num_owner_changes = np.random.choice([1, 2, 3, 4, 5, 6], num_samples, p=owner_change_dist)

# 채권최고액 (단위: 만 원) - 근저당권 개수에 따라 다르게 설정
max_claim_amount = (num_mortgages * real_prices * np.random.uniform(0.2, 0.5, num_samples) * 1.2).astype(int)
max_claim_amount = np.where(num_mortgages == 0, 0, max_claim_amount)

# 신탁 등기여부, 압류/가압류 등기 여부 (Boolean)
is_trust_registered = np.random.choice([0, 1], num_samples, p=[0.92, 0.08])
is_seizure_registered = np.random.choice([0, 1], num_samples, p=[0.88, 0.12])

# 선순위 채권 존재여부 (Boolean) - 근저당권이나 압류가 있을 경우 존재
has_senior_debt = np.where((num_mortgages > 0) | (is_seizure_registered == 1), 1, 0)

# 전입 가능여부 (Boolean) - 신탁 등기가 있으면 전입 불가능한 경우가 많음
movein_prob = np.ones(num_samples) * 0.95
movein_prob[is_trust_registered == 1] = 0.1
movein_prob[building_uses != '주거용'] = 0.3
is_movein_possible = (np.random.rand(num_samples) < movein_prob).astype(int)

# 우선변제권 여부 (Boolean) - 전입 가능여부가 True이고 다른 선순위 채권이 적을 때 높은 확률
has_priority_right = np.where((is_movein_possible == 1) & (has_senior_debt == 0), 1, 0)

# 등기부 갑구/을구 횟수
num_gapgu = num_owner_changes + np.random.randint(0, 3, num_samples)
num_eulgu = num_mortgages + np.random.randint(0, 2, num_samples)

# 전세가율 (%) - 위험/안전 그룹을 나누어 생성하여 데이터 다양성 확보
safe_ratios = np.random.uniform(low=50, high=80, size=num_samples)
risky_ratios = np.random.uniform(low=80, high=110, size=num_samples)
jeonse_ratio = np.where(np.random.rand(num_samples) > 0.4, safe_ratios, risky_ratios)
jeonse_prices = (real_prices * jeonse_ratio / 100).astype(int)

# 근저당권 설정일 중 최근날짜 (기간 확장)
days_back = np.random.randint(1, 365*5, num_samples)
recent_mortgage_date = pd.to_datetime('today').normalize() - pd.to_timedelta(days_back, unit='D')
recent_mortgage_date = np.where(num_mortgages > 0, recent_mortgage_date.strftime('%Y-%m-%d'), pd.NaT)


# --- 3. 생성된 특성들을 DataFrame으로 합치기 ---
df = pd.DataFrame({
    '실거래가': real_prices,
    '전세가': jeonse_prices,
    '전세가율': jeonse_ratio,
    '건축물_유형': building_types,
    '건축물_용도': building_uses,
    '용도_위반_여부': is_use_violation,
    '불법_건축_여부': is_illegal_build,
    '근저당권_개수': num_mortgages,
    '채권최고액': max_claim_amount,
    '근저당권_설정일_최근': recent_mortgage_date,
    '소유자변경_횟수': num_owner_changes,
    '등기부_갑구_횟수': num_gapgu,
    '등기부_을구_횟수': num_eulgu,
    '신탁_등기여부': is_trust_registered,
    '압류_가압류_등기여부': is_seizure_registered,
    '선순위_채권_존재여부': has_senior_debt,
    '전입_가능여부': is_movein_possible,
    '우선변제권_여부': has_priority_right,
})


# --- 4. '위험여부' 타겟 변수 생성 (위험 점수 기반) ---
risk_score = (
    (df['전세가율'] > 85).astype(int) * 3 +
    ((df['전세가'] + df['채권최고액']) / df['실거래가'] > 1.0).astype(int) * 4 +
    df['압류_가압류_등기여부'] * 5 +
    df['신탁_등기여부'] * 5 +
    (df['소유자변경_횟수'] >= 4).astype(int) * 2 +
    (df['등기부_갑구_횟수'] >= 6).astype(int) * 1 +
    df['불법_건축_여부'] * 2 +
    (df['전입_가능여부'] == 0).astype(int) * 2
)
# 위험 점수가 4점 이상일 경우 위험으로 분류
risk_threshold = 4
df['위험여부'] = (risk_score >= risk_threshold).astype(int)


# --- 5. 최종 데이터프레임 확인 및 CSV 파일로 저장 ---
# '전세가'와 '총부채비율'은 최종 파일에서 제외 (전세가율과 채권최고액으로 충분)
df = df.drop(columns=['전세가'])
df = df.sample(frac=1).reset_index(drop=True) # 데이터 순서 섞기

print("샘플 데이터 생성 완료!")
print("데이터셋 미리보기:")
display(df.head())
print("\n위험/안전 데이터 분포:")
print(df['위험여부'].value_counts())

# CSV 파일로 저장
df.to_csv('real_estate_samples.csv', index=False, encoding='utf-8-sig')
print("\\n'real_estate_samples.csv' 파일이 성공적으로 생성되었습니다.")

샘플 데이터 생성 완료!
데이터셋 미리보기:


Unnamed: 0,실거래가,전세가율,건축물_유형,건축물_용도,용도_위반_여부,불법_건축_여부,근저당권_개수,채권최고액,근저당권_설정일_최근,소유자변경_횟수,등기부_갑구_횟수,등기부_을구_횟수,신탁_등기여부,압류_가압류_등기여부,선순위_채권_존재여부,전입_가능여부,우선변제권_여부,위험여부
0,67037,91.960626,아파트,주거용,0,1,0,0,NaT,1,1,1,0,1,1,1,0,1
1,73154,80.373585,다세대주택,주거용,0,0,1,26872,2022-03-08,1,3,2,0,0,1,1,0,1
2,36433,52.566287,빌라,근린생활시설,1,0,5,81714,2023-03-20,1,2,6,0,0,1,0,0,1
3,84392,86.680282,아파트,주거용,0,0,3,115936,2022-06-04,1,2,4,0,0,1,1,0,1
4,47317,72.205723,빌라,주거용,0,0,1,20478,2024-06-23,1,3,2,0,0,1,1,0,1



위험/안전 데이터 분포:
위험여부
1    3603
0    1397
Name: count, dtype: int64
\n'real_estate_samples.csv' 파일이 성공적으로 생성되었습니다.
