In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import font_manager, rc
import warnings
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from sklearn.metrics import f1_score, confusion_matrix, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import math

# 경고 메시지 무시
warnings.filterwarnings('ignore')

# 폰트 경로 설정
font_path = "/usr/share/fonts/truetype/nanum/NanumGothic.ttf"
fontprop = font_manager.FontProperties(fname=font_path)

# matplotlib에 폰트 적용
rc('font', family=fontprop.get_name())



In [22]:
# 데이터 로드
trainDF = pd.read_csv('/apps/study_promptengineerings/dacon/real_estate_fraud/train.csv')
testDF = pd.read_csv('/apps/study_promptengineerings/dacon/real_estate_fraud/test.csv')

# ID와 인덱스 매핑을 저장
id_mapping = pd.Series(testDF['ID'].values, index=testDF.index)

# ID 컬럼 제거  
trainDF = trainDF.drop('ID', axis=1)
testDF = testDF.drop('ID', axis=1)

In [28]:
# 결측치 처리
def handle_missing_values(trainDF):
    # 1. 총층
    def fill_total_floor_with_model(trainDF):
        # 학습 데이터와 예측 대상 데이터 분리
        total_floor_train_data = trainDF[trainDF['총층'].notnull()]
        total_floor_apredict_data = trainDF[trainDF['총층'].isnull()]

        # 입력 변수와 타겟 변수 설정
        features = ['총주차대수']  # 추가적인 관련 변수 포함
        target = '총층'

        # 학습 데이터 준비
        X = total_floor_train_data[features]
        y = total_floor_train_data[target]

        # 결측치 데이터 준비
        X_predict = total_floor_apredict_data[features]

        # 모델 학습
        model = RandomForestRegressor(random_state=42, n_estimators=100)
        model.fit(X, y)

        # 예측
        total_floor_apredict_data['총층'] = model.predict(X_predict)

        # 예측값 반영
        trainDF.loc[trainDF['총층'].isnull(), '총층'] = total_floor_apredict_data['총층']
        return trainDF

    # 2. 욕실수
    def fill_bathroom_mode(row):
        if pd.isnull(row['욕실수']):
            mode_value = df[df['방수'] == row['방수']]['욕실수'].mode()
            if len(mode_value) > 0:  # 최빈값이 존재하는 경우
                return mode_value[0]
            else:
                return row['욕실수']  # 최빈값이 없으면 그대로 둠
        return row['욕실수']

    # 3. 해당층 - 총층 중간값(소숫점 올림)
    median_total_floor = trainDF['총층'].median()
    rounded_median = math.ceil(median_total_floor)
    trainDF['해당층'] = trainDF['해당층'].fillna(rounded_median)

    # 4. 전용면적 - 방수, 욕실수 모두 같은 값의 평균값으로 채우기
    trainDF['전용면적'] = trainDF['전용면적'].fillna(
        trainDF.groupby(['방수', '욕실수'])['전용면적'].transform('mean')
    )

    # 5.1 총주차대수 - 주차불가능인 경우 0으로 채우기
    trainDF.loc[trainDF['총주차대수'].isnull() & (trainDF['주차가능여부'] == '불가능'), '총주차대수'] = 0

    # 5.2 총주차대수 - 주차가능인 경우 처리
    condition = trainDF['총주차대수'].isnull() & (trainDF['주차가능여부'] == '가능')
    trainDF['층수_대비_주차대수'] = trainDF.apply(
        lambda row: row['총주차대수'] / row['총층'] if pd.notnull(row['총주차대수']) else None,
        axis=1
    )
    mean_parking_per_floor = trainDF['층수_대비_주차대수'].mean()
    trainDF.loc[condition, '총주차대수'] = trainDF.loc[condition, '총층'] * mean_parking_per_floor
    trainDF = trainDF.drop(columns=['층수_대비_주차대수'])

    return trainDF

# 결측치 처리 적용
trainDF = handle_missing_values(trainDF)

missing_rows = trainDF[trainDF.isnull().any(axis=1)]
print(missing_rows)

missing_ratio = trainDF.isnull().sum() / len(trainDF) * 100
print(missing_ratio)




     매물확인방식          보증금      월세   전용면적  해당층   총층  방향   방수  욕실수 주차가능여부  총주차대수  \
37     현장확인   25000000.0  520000    NaN  6.0  NaN   3  NaN  NaN    불가능    0.0   
76     전화확인   81000000.0   70000    NaN  6.0  NaN   2  NaN  NaN    불가능    0.0   
88     현장확인  151000000.0   50000    NaN  6.0  NaN   7  NaN  NaN    불가능    0.0   
153    현장확인  167000000.0  380000    NaN  6.0  NaN   3  NaN  NaN     가능    1.0   
163    현장확인  128000000.0  360000    NaN  2.0  5.0   4  1.0  NaN     가능    1.0   
339    서류확인  194000000.0  360000    NaN  6.0  NaN   2  NaN  NaN    불가능    0.0   
456    전화확인   65500000.0  100000    NaN  6.0  NaN   5  NaN  NaN     가능    4.0   
515    전화확인  189500000.0  500000    NaN  6.0  NaN   5  NaN  NaN    불가능    0.0   
546    서류확인  116000000.0  180000    NaN  6.0  NaN   0  NaN  NaN    불가능    0.0   
686    현장확인  181500000.0  520000    NaN  6.0  NaN   7  NaN  NaN    불가능    0.0   
952    서류확인  350000000.0  700000    NaN  6.0  NaN   2  NaN  NaN    불가능    0.0   
1664   서류확인  367500000.0   9

In [24]:
# 레이블 인코딩
label_encode_cols = ['중개사무소', '게재일', '제공플랫폼', '방향']
for col in label_encode_cols:
    le = LabelEncoder()
    combined_data = pd.concat([trainDF[col], testDF[col]], axis=0).astype(str)
    le.fit(combined_data)
    trainDF[col] = le.transform(trainDF[col].astype(str))
    testDF[col] = le.transform(testDF[col].astype(str))

# 원-핫 인코딩
one_hot_cols = ['매물확인방식', '주차가능여부']
one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
train_encoded = one_hot_encoder.fit_transform(trainDF[one_hot_cols])
test_encoded = one_hot_encoder.transform(testDF[one_hot_cols])

# 원-핫 인코딩된 열 이름 생성
encoded_feature_names = []
for col in one_hot_cols:
    unique_values = trainDF[col].unique()
    encoded_feature_names.extend([f"{col}_{val}" for val in unique_values])

train = pd.concat([trainDF.drop(columns=one_hot_cols), 
                  pd.DataFrame(train_encoded, index=trainDF.index, columns=encoded_feature_names)], axis=1)
test = pd.concat([testDF.drop(columns=one_hot_cols), 
                 pd.DataFrame(test_encoded, index=testDF.index, columns=encoded_feature_names)], axis=1)

In [25]:
# 데이터 분리
X = train.drop(columns=['허위매물여부'])
y = train['허위매물여부']

# SMOTE로 데이터 증강
smote = SMOTE(random_state=42)
X_sm, y_sm = smote.fit_resample(X, y)

# F1 평가 지표 정의
def f1_metric(y_pred, data):
    y_true = data.get_label()
    y_pred_binary = (y_pred > 0.5).astype(int)
    f1 = f1_score(y_true, y_pred_binary)
    return 'f1', f1, True


ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:

# LightGBM 파라미터 설정
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'class_weight': 'balanced',
    'seed': 42
}

# 교차 검증 설정
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X_sm))
test_preds = np.zeros(len(test))

# 모델 학습 및 예측
for fold, (train_idx, val_idx) in enumerate(skf.split(X_sm, y_sm)):
    print(f"Fold {fold + 1}")
    X_train, X_val = X_sm.iloc[train_idx], X_sm.iloc[val_idx]
    y_train, y_val = y_sm.iloc[train_idx], y_sm.iloc[val_idx]
    
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)
    
    model = lgb.train(
        params,
        train_data,
        valid_sets=[val_data],
        feval=f1_metric,
        num_boost_round=1000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100)
        ]
    )
    
    oof_preds[val_idx] = model.predict(X_val)
    test_preds += model.predict(test) / skf.n_splits


In [None]:

# 결과 평가
oof_preds_binary = (oof_preds > 0.5).astype(int)
print("OOF F1 Score:", f1_score(y_sm, oof_preds_binary, average='macro'))

# 혼동 행렬 시각화
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_sm, oof_preds_binary)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# 제출 파일 생성
test_preds_binary = (test_preds > 0.5).astype(int)
submission = pd.DataFrame({
    'ID': id_mapping[test.index],  # 처리된 데이터의 인덱스에 해당하는 ID 매핑
    '허위매물여부': test_preds_binary
})
submission.to_csv('submission_2.csv', index=False)
print("Submission file saved to 'submission_2.csv'")