In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import font_manager, rc
import warnings
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.ensemble import RandomForestRegressor
import math

# 경고 메시지 무시
warnings.filterwarnings('ignore')

# 한글 폰트 설정
font_path = "/usr/share/fonts/truetype/nanum/NanumGothic.ttf"
fontprop = font_manager.FontProperties(fname=font_path)
rc('font', family=fontprop.get_name())


In [3]:

def load_data():
    """
    데이터를 로드하고 ID 매핑을 처리하는 함수
    """
    # 데이터 로드
    train = pd.read_csv('/apps/study_promptengineerings/dacon/real_estate_fraud/train.csv')
    test = pd.read_csv('/apps/study_promptengineerings/dacon/real_estate_fraud/test.csv')
    
    # ID 매핑 저장
    id_mapping = pd.Series(test['ID'].values, index=test.index)
    
    # ID 컬럼 제거
    train = train.drop('ID', axis=1)
    test = test.drop('ID', axis=1)
    
    return train, test, id_mapping


In [None]:

def handle_missing_values(df):
    """
    결측치를 처리하는 함수
    """
    # 1. 총층 처리
    if 'train' in locals():  # 학습 데이터가 있는 경우에만 실행
        floor_model = RandomForestRegressor(random_state=42)
        floor_data = df[df['총층'].notnull()]
        floor_missing = df[df['총층'].isnull()]
        
        floor_model.fit(floor_data[['총주차대수']], floor_data['총층'])
        df.loc[df['총층'].isnull(), '총층'] = floor_model.predict(floor_missing[['총주차대수']])
    
    # 2. 욕실수 처리 - 방수 기준 최빈값
    df['욕실수'] = df.apply(lambda row: df[df['방수'] == row['방수']]['욕실수'].mode()[0] 
                        if pd.isnull(row['욕실수']) else row['욕실수'], axis=1)
    
    # 3. 해당층 처리 - 총층 중간값(소숫점 올림)
    median_floor = math.ceil(df['총층'].median())
    df['해당층'] = df['해당층'].fillna(median_floor)
    
    # 4. 전용면적 처리 - 방수, 욕실수 그룹 평균
    df['전용면적'] = df['전용면적'].fillna(
        df.groupby(['방수', '욕실수'])['전용면적'].transform('mean')
    )
    
    # 5. 총주차대수 처리
    # 5.1 주차불가능 케이스
    df.loc[df['총주차대수'].isnull() & (df['주차가능여부'] == '불가능'), '총주차대수'] = 0
    
    # 5.2 주차가능 케이스
    parking_condition = df['총주차대수'].isnull() & (df['주차가능여부'] == '가능')
    df['층수_대비_주차대수'] = df['총주차대수'] / df['총층']
    mean_parking_ratio = df['층수_대비_주차대수'].mean()
    df.loc[parking_condition, '총주차대수'] = df.loc[parking_condition, '총층'] * mean_parking_ratio
    df = df.drop('층수_대비_주차대수', axis=1)
    
    return df


In [6]:

def encode_features(train, test):
    """
    범주형 변수 인코딩 함수
    """
    # 레이블 인코딩
    label_cols = ['중개사무소', '게재일', '제공플랫폼', '방향']
    for col in label_cols:
        le = LabelEncoder()
        combined_data = pd.concat([train[col], test[col]], axis=0).astype(str)
        le.fit(combined_data)
        train[col] = le.transform(train[col].astype(str))
        test[col] = le.transform(test[col].astype(str))
    
    # 원-핫 인코딩
    onehot_cols = ['매물확인방식', '주차가능여부']
    onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')
    
    # 피처 이름 생성
    feature_names = []
    for col in onehot_cols:
        unique_vals = train[col].unique()
        feature_names.extend([f"{col}_{val}" for val in unique_vals])
    
    # 인코딩 적용
    train_encoded = onehot.fit_transform(train[onehot_cols])
    test_encoded = onehot.transform(test[onehot_cols])
    
    # 데이터프레임 결합
    train = pd.concat([
        train.drop(columns=onehot_cols),
        pd.DataFrame(train_encoded, index=train.index, columns=feature_names)
    ], axis=1)
    
    test = pd.concat([
        test.drop(columns=onehot_cols),
        pd.DataFrame(test_encoded, index=test.index, columns=feature_names)
    ], axis=1)
    
    return train, test

def train_model(X, y, test):
    """
    모델 학습 및 예측 함수
    """
    # SMOTE로 데이터 균형 맞추기
    smote = SMOTE(random_state=42)
    X_sm, y_sm = smote.fit_resample(X, y)
    
    # 모델 파라미터 설정
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'class_weight': 'balanced',
        'seed': 42
    }
    
    # 교차 검증 설정
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(X_sm))
    test_preds = np.zeros(len(test))
    
    # 모델 학습 및 예측
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_sm, y_sm)):
        print(f"Training fold {fold + 1}/5")
        
        X_train, X_val = X_sm.iloc[train_idx], X_sm.iloc[val_idx]
        y_train, y_val = y_sm.iloc[train_idx], y_sm.iloc[val_idx]
        
        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val)
        
        model = lgb.train(
            params,
            train_data,
            valid_sets=[val_data],
            num_boost_round=1000,
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(period=100)
            ]
        )
        
        oof_preds[val_idx] = model.predict(X_val)
        test_preds += model.predict(test) / skf.n_splits
    
    return oof_preds, test_preds, y_sm


In [9]:

def evaluate_model(oof_preds, y_sm):
    """
    모델 평가 및 시각화 함수
    """
    # 예측값 이진화
    oof_preds_binary = (oof_preds > 0.5).astype(int)
    
    # F1 점수 계산
    f1 = f1_score(y_sm, oof_preds_binary, average='macro')
    print(f"OOF F1 Score: {f1:.4f}")
    
    # 혼동 행렬 시각화
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_sm, oof_preds_binary)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()
    
    return f1

def main():
    # 데이터 로드
    train, test, id_mapping = load_data()
    
    # 결측치 처리
    train = handle_missing_values(train, is_train=True)
    test = handle_missing_values(test, is_train=False)
    
    # 피처 인코딩
    train_encoded, test_encoded = encode_features(train, test)
    
    # 학습 데이터 준비
    X = train_encoded.drop('허위매물여부', axis=1)
    y = train_encoded['허위매물여부']
    
    # 모델 학습 및 예측
    oof_preds, test_preds, y_sm = train_model(X, y, test_encoded)
    
    # 모델 평가
    f1 = evaluate_model(oof_preds, y_sm)
    
    # 제출 파일 생성
    test_preds_binary = (test_preds > 0.5).astype(int)
    submission = pd.DataFrame({
        'ID': id_mapping,
        '허위매물여부': test_preds_binary
    })
    submission.to_csv('submission.csv', index=False)
    print("Submission file has been created successfully!")

if __name__ == "__main__":
    main()

ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values