In [1]:
# !pip install shap
# !pip install PyMuPDF
# !pip show pandas scikit-learn PyMuPDF
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import shap

Name: pandas
Version: 2.3.1
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: 
Author-email: The Pandas Development Team <pandas-dev@python.org>
License: BSD 3-Clause License

 Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
 All rights reserved.

 Copyright (c) 2011-2023, Open source contributors.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

 * Neither the name of the copyright holder nor the names of its
   contribut

## 샘플 데이터 만들기 - Feature Engineering

In [1]:
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

# --- 1. Basic settings for data generation ---
num_samples = 5000
np.random.seed(42)  # fix the seed so the random draws below are repeatable

# NOTE: every feature below draws from the global NumPy RNG in sequence, so
# reordering or inserting draws changes the generated dataset.

# --- 2. Generate the data for the final 16 features ---

# '과거_매매가' (past sale price, KRW): a wide normal distribution in which the
# first 5% of the samples are replaced by high-priced properties.
prices = np.random.normal(loc=500000000, scale=250000000, size=num_samples)
high_prices = np.random.normal(loc=1500000000, scale=500000000, size=int(num_samples * 0.05))
prices[:len(high_prices)] = high_prices
# Use an explicit 64-bit integer: `astype(int)` is 32-bit on Windows with
# NumPy < 2, and high-end prices here can exceed 2**31 - 1.
past_sales_price = np.abs(prices).astype(np.int64)
# Floor every price at 100,000,000 KRW.
past_sales_price = np.maximum(past_sales_price, 100000000)

# '건축물_유형' (building type), drawn with fixed category probabilities.
building_types = np.random.choice(
    ['아파트', '빌라', '오피스텔', '다세대주택', '단독주택', '연립주택', '다가구주택'],
    num_samples,
    p=[0.4, 0.2, 0.15, 0.1, 0.05, 0.05, 0.05]
)

# '근저당권_개수' (number of mortgages), 0 to 5.
num_mortgages = np.random.choice([0, 1, 2, 3, 4, 5], num_samples, p=[0.4, 0.3, 0.15, 0.1, 0.03, 0.02])

# '채권최고액' (maximum claim amount, KRW):
#   number of mortgages x past sale price x U(0.2, 0.5) x 1.2
# i.e. 24%-60% of the sale price *per mortgage*, not 20%-50% in total.
# NOTE(review): the 1.2 factor presumably models registering the claim at
# ~120% of the loan principal - confirm the intended meaning.
loan_to_price_ratio = np.random.uniform(0.2, 0.5, num_samples)
max_claim_amount = (num_mortgages * past_sales_price * loan_to_price_ratio * 1.2).astype(np.int64)
max_claim_amount = np.where(num_mortgages == 0, 0, max_claim_amount)

# '근저당권_설정일_최근' (most recent mortgage registration date): a date
# 1 to 1824 days before the day the cell is run, formatted 'YYYY-MM-DD';
# NaT when the property has no mortgage.
# The reference date is "today", so these dates shift from run to run even
# though the seed is fixed; pin `reference_date` for byte-identical output.
days_back = np.random.randint(1, 365*5, num_samples)
reference_date = pd.to_datetime('today').normalize()
recent_mortgage_date = reference_date - pd.to_timedelta(days_back, unit='D')
recent_mortgage_date = np.where(num_mortgages > 0, recent_mortgage_date.strftime('%Y-%m-%d'), pd.NaT)

# '신탁_등기여부' (trust registration flag), True for about 8% of samples.
is_trust_registered = np.random.choice([False, True], num_samples, p=[0.92, 0.08])

# '압류_가압류_개수' (number of seizures / provisional seizures),
# simplified to 0 or 1.
num_seizures = np.random.choice([0, 1], num_samples, p=[0.88, 0.12])

# '선순위_채권_존재여부' (senior debt exists): True when there is at least
# one mortgage or seizure.
has_senior_debt = (num_mortgages > 0) | (num_seizures > 0)

# '전입_가능여부' (move-in registration possible): 98% by default, but only
# 10% when the property is trust-registered.
movein_prob = np.where(is_trust_registered, 0.1, 0.98)
is_movein_possible = (np.random.rand(num_samples) < movein_prob)

# '우선변제권_여부' (priority repayment right): True only when move-in is
# possible and there is no senior debt.
has_priority_right = is_movein_possible & ~has_senior_debt

# '과거_전세가율' (past jeonse-to-price ratio, %) and '과거_전세가' (past jeonse
# deposit, KRW): about 60% of samples get a "safe" ratio in [50, 80) and the
# rest a "risky" ratio in [80, 110).
safe_ratios = np.random.uniform(low=50, high=80, size=num_samples)
risky_ratios = np.random.uniform(low=80, high=110, size=num_samples)
past_jeonse_ratio = np.where(np.random.rand(num_samples) > 0.4, safe_ratios, risky_ratios)
past_jeonse_price = (past_sales_price * past_jeonse_ratio / 100).astype(np.int64)


# --- 3. Combine the generated features into a DataFrame ---

# Format the ratio as text such as '66.99%'. np.char.add is used instead of
# the `+` operator because `+` on a NumPy string array is only supported on
# NumPy >= 2.0 (it raises a ufunc "no loop" error on 1.x), whereas
# np.char.add yields the same strings on both.
past_jeonse_ratio_text = np.char.add(past_jeonse_ratio.round(2).astype(str), '%')

df = pd.DataFrame({
    # The 12 features extracted from the PDF
    '건축물_유형': building_types,
    '근저당권_개수': num_mortgages,
    '채권최고액': max_claim_amount,
    '근저당권_설정일_최근': recent_mortgage_date,
    '신탁_등기여부': is_trust_registered,
    '압류_가압류_개수': num_seizures,
    '선순위_채권_존재여부': has_senior_debt,
    '전입_가능여부': is_movein_possible,
    '우선변제권_여부': has_priority_right,
    '과거_매매가': past_sales_price,
    '과거_전세가': past_jeonse_price,
    '과거_전세가율': past_jeonse_ratio_text,

    # The 4 features to be filled in later by crawling (empty for now)
    '주소': None,
    '전세가': None,
    '매매가': None,
    '전세가율': None,
})


# --- 4. Build the '위험도' (risk) target from the generated features ---

# Individual risk indicators. `past_jeonse_ratio` is the numeric array from
# the generation step (the DataFrame column holds it as '%'-suffixed text);
# it lines up with `df` row-for-row because `df` has not been shuffled yet.
jeonse_ratio_over_85 = past_jeonse_ratio > 85
deposit_plus_claims = df['과거_전세가'] + df['채권최고액']
exceeds_sale_price = deposit_plus_claims / df['과거_매매가'] > 1.0
has_seizure = df['압류_가압류_개수'] > 0
in_trust = df['신탁_등기여부']
movein_blocked = ~df['전입_가능여부']

# Weighted sum of the indicators (each indicator contributes 0 or its weight).
risk_score = (
    3 * jeonse_ratio_over_85.astype(int)
    + 4 * exceeds_sale_price.astype(int)
    + 5 * has_seizure.astype(int)
    + 5 * in_trust.astype(int)
    + 2 * movein_blocked.astype(int)
)

# A sample is labelled risky (1) when its score reaches the threshold of 4,
# otherwise safe (0).
risk_threshold = 4
is_risky = risk_score >= risk_threshold
df['위험도'] = is_risky.astype(int)


# --- 5. Inspect the final DataFrame and save it as a CSV file ---
df = df.sample(frac=1).reset_index(drop=True)  # shuffle the row order

print("샘플 데이터 생성 완료!")
print("데이터셋 미리보기:")
print(df.head())
print("\n최종 컬럼 목록:")
print(df.columns.tolist())
print("\n위험/안전 데이터 분포:")
print(df['위험도'].value_counts())

# Save as CSV. DataFrame.to_csv does not create missing directories, so make
# sure 'data/' exists first; otherwise a fresh checkout fails with OSError.
output_path = Path('data/real_estate_samples.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
# 'utf-8-sig' writes a BOM at the start of the file.
df.to_csv(output_path, index=False, encoding='utf-8-sig')
print("\n'data/real_estate_samples.csv' 파일이 성공적으로 생성되었습니다.")

샘플 데이터 생성 완료!
데이터셋 미리보기:
  건축물_유형  근저당권_개수      채권최고액 근저당권_설정일_최근  신탁_등기여부  압류_가압류_개수  선순위_채권_존재여부  \
0  다세대주택        2  285606629  2023-01-28    False          0         True   
1     빌라        2  423423282  2023-12-05    False          0         True   
2   단독주택        1  400283617  2025-05-09    False          0         True   
3   연립주택        0          0         NaT    False          0        False   
4    아파트        0          0         NaT    False          0        False   

   전입_가능여부  우선변제권_여부      과거_매매가     과거_전세가 과거_전세가율    주소   전세가   매매가  전세가율  \
0     True     False   422879302  283291651  66.99%  None  None  None  None   
1     True     False   493173865  319503239  64.79%  None  None  None  None   
2     True     False  1155198272  897859772  77.72%  None  None  None  None   
3     True      True   897645518  458541572  51.08%  None  None  None  None   
4     True      True   731158420  701231375  95.91%  None  None  None  None   

   위험도  
0    1  
1    1  
2    1  
3