In [106]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-real-estate2/sample_submission.csv
/kaggle/input/fake-real-estate2/train.csv
/kaggle/input/fake-real-estate2/test.csv
/kaggle/input/fake-real-estate7/sample_submission.csv
/kaggle/input/fake-real-estate7/train.csv
/kaggle/input/fake-real-estate7/test.csv


In [None]:
import pandas as pd
import numpy as np
# import joblib
from datetime import datetime
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
# from imblearn.over_sampling import SMOTE
# from collections import Counter
# from imblearn.under_sampling import RandomUnderSampler

In [107]:
import kagglehub

# Download latest version
# path = kagglehub.dataset_download("kaiyoo88/fake-real-estate")

# The kagglehub download above is disabled; the dataset directory is set
# directly to a location under /kaggle/input instead.
path = "/kaggle/input/fake-real-estate7"
print("Path to dataset files:", path)


Path to dataset files: /kaggle/input/fake-real-estate7


In [108]:
# Load the training data from the dataset directory
train = pd.read_csv(f'{path}/train.csv')

In [109]:
# Feature & target setup: the label column becomes y; x keeps every column
# except the row identifier and the label.
y = train['허위매물여부']
x = train.drop(columns=['ID', '허위매물여부'])

In [110]:
# 1) Missing-value handling: fill the numeric columns listed below with a KNN imputer.
# The imputer is fitted on these training columns here; the fitted instance is
# reused with transform() on the test set later in the notebook.
knn_imputer = KNNImputer(n_neighbors=5)  # use K=5 neighbours to estimate missing values
columns_fill_knn = ['해당층', '총층', '전용면적', '방수', '욕실수', '총주차대수']
x[columns_fill_knn] = knn_imputer.fit_transform(x[columns_fill_knn])

In [111]:
## 2) Feature Engineering 추가

def create_new_features(x, reference_date=None):
    """Return a copy of ``x`` with engineered listing features added.

    The input frame is not modified. The raw '게재일' column is replaced by
    year / month / weekday / elapsed-days columns, and '주차가능여부' is
    re-encoded to 1/0; every other input column is kept.

    Parameters
    ----------
    x : pandas.DataFrame
        Listing data containing the columns read below: '보증금', '월세',
        '관리비', '전용면적', '해당층', '총층', '방수', '욕실수', '게재일', '방향',
        '주차가능여부' and '제공플랫폼'.
    reference_date : datetime-like, optional
        Date that '게재일_경과일' is measured against. Defaults to the current
        date and time, which makes that feature depend on when the cell is
        run; pass a fixed date for reproducible output.

    Returns
    -------
    pandas.DataFrame
        New frame with the additional feature columns.

    Notes
    -----
    The medians, the unit-price mean/std and the per-platform means are all
    computed from the frame passed in, so calling this separately on train
    and test data uses different reference statistics for each.
    """
    x = x.copy()

    if reference_date is None:
        reference_date = datetime.today()

    def _fill_with_median(series):
        # Division by zero yields +/-inf; treat it as missing so it cannot
        # distort the median or the z-score below. The result is returned
        # (and assigned by the caller) -- a bare ``fillna`` call whose result
        # is discarded leaves the column unchanged.
        series = series.replace([np.inf, -np.inf], np.nan)
        return series.fillna(series.median())

    # 1. Deposit per unit of floor area
    x['단위면적당가격'] = _fill_with_median(x['보증금'] / x['전용면적'])

    # 2. Deposit relative to monthly rent (+1 avoids dividing by a zero rent)
    x['보증금_월세비율'] = _fill_with_median(x['보증금'] / (x['월세'] + 1))

    # 3. Floor position within the building (floor / total floors)
    x['층수_비율'] = _fill_with_median(x['해당층'] / x['총층'])

    # 4. Posting-date features
    x['게재일'] = pd.to_datetime(x['게재일'])
    x['게재일_연도'] = x['게재일'].dt.year
    x['게재일_월'] = x['게재일'].dt.month
    x['게재일_요일'] = x['게재일'].dt.weekday  # 0 = Monday, 6 = Sunday
    x['게재일_경과일'] = (pd.Timestamp(reference_date) - x['게재일']).dt.days
    # The raw date column is no longer needed once the parts are extracted.
    x = x.drop(columns=['게재일'])

    # 5. Coarse direction group; directions missing from the map become NaN
    direction_map = {'동향': '동', '서향': '서', '남향': '남', '북향': '북', '남동향': '남', '북동향': '북'}
    x['방향_그룹'] = x['방향'].map(direction_map)

    # 6. Price-outlier score: absolute z-score of the unit price
    unit_price_mean = x['단위면적당가격'].mean()
    unit_price_std = x['단위면적당가격'].std()
    x['가격_이상치'] = ((x['단위면적당가격'] - unit_price_mean) / unit_price_std).abs()

    # 7. Parking availability as 1/0 (any other value becomes NaN)
    x['주차가능여부'] = x['주차가능여부'].map({'가능': 1, '불가능': 0})

    # 8. Total monthly cost (rent + maintenance fee)
    x['월세_총비용'] = _fill_with_median(x['월세'] + x['관리비'])

    # 9. Maintenance fee relative to rent
    x['관리비_비율'] = _fill_with_median(x['관리비'] / (x['월세'] + 1))

    # 10. Room and bathroom density per unit of floor area
    x['방수_밀집도'] = x['방수'] / (x['전용면적'] + 1)
    x['욕실_밀집도'] = x['욕실수'] / (x['전용면적'] + 1)

    # 11. Deviation of deposit / rent from the mean of the listing's platform
    platform_deposit_mean = x.groupby('제공플랫폼')['보증금'].transform('mean')
    platform_rent_mean = x.groupby('제공플랫폼')['월세'].transform('mean')

    x['제공플랫폼_보증금차이'] = x['보증금'] - platform_deposit_mean
    x['제공플랫폼_월세차이'] = x['월세'] - platform_rent_mean

    return x

# Replace x with its feature-engineered copy. This cell is not re-runnable on
# its own: the function drops '게재일', so calling it again on the result
# fails at the date-parsing step.
x = create_new_features(x)

In [87]:
# ## 3) Label Encoding (문자열 데이터를 숫자로 변환)
# label_encode_cols = ['중개사무소', '제공플랫폼', '방향'] # '게재일', '방향_그룹'
# label_encoders = {}
# for col in label_encode_cols:
#     le = LabelEncoder()
#     x[col] = le.fit_transform(x[col].astype(str))
#     label_encoders[col] = le  # 나중에 변환을 위해 저장

In [88]:
# # 4) One-Hot Encoding 적용
# one_hot_cols = ['매물확인방식' ] # '주차가능여부'
# one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# x_encoded = one_hot_encoder.fit_transform(x[one_hot_cols])
# x_encoded_df = pd.DataFrame(x_encoded, columns=one_hot_encoder.get_feature_names_out(one_hot_cols), index=x.index)

# # 기존 데이터와 병합 후 기존 열 삭제
# x = pd.concat([x.drop(columns=one_hot_cols), x_encoded_df], axis=1)

In [89]:
# 5) Train / validation split: hold out 20% of the rows, stratified on the target
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [112]:
def add_noise(df, noise_level=0.02, random_state=None):  # previously tried: 0.05
    """Return a copy of ``df`` with multiplicative Gaussian noise on numeric columns.

    Every numeric column is multiplied element-wise by
    ``1 + noise_level * N(0, 1)``; non-numeric columns are left as they are.
    The input frame is not modified, so it is safe to pass a slice of another
    frame (for example the output of ``train_test_split``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to perturb.
    noise_level : float, default 0.02
        Standard deviation of the relative noise.
    random_state : int, optional
        Seed for a dedicated random generator. When omitted, NumPy's global
        random state is used, as before.

    Returns
    -------
    pandas.DataFrame
        Noisy copy of ``df`` with the same index and columns.
    """
    noisy = df.copy()
    numeric_cols = noisy.select_dtypes(include=[np.number]).columns  # numeric columns only
    # Both np.random (global state) and RandomState expose randn().
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    factors = 1 + noise_level * rng.randn(*noisy[numeric_cols].shape)
    noisy[numeric_cols] = noisy[numeric_cols] * factors
    return noisy

# Perturb the numeric training features. The row count is printed before and
# after to show that noise injection changes values only, not the number of rows.
print("Before Noise Injection:", len(x_train))
x_train = add_noise(x_train)
print("After Noise Injection:", len(x_train))

Before Noise Injection: 1961
After Noise Injection: 1961


In [91]:
# def augment_minority_class(x_train, y_train, num_augments=2, noise_level=0.05):
#     numeric_cols = x_train.select_dtypes(include=[np.number]).columns
#     x_train_1 = x_train[y_train == 1].copy()  # 소수 클래스(1)만 선택
#     augmented_data = []

#     for _ in range(num_augments):
#         x_aug = x_train_1.copy()
#         x_aug[numeric_cols] = x_aug[numeric_cols] * (1 + noise_level * np.random.randn(*x_aug[numeric_cols].shape))
#         augmented_data.append(x_aug)

#     x_train = pd.concat([x_train] + augmented_data, axis=0).reset_index(drop=True)
#     y_train = np.concatenate([y_train] + [np.ones(len(x_train_1))] * num_augments)  # 레이블 추가

#     return x_train, y_train

# print("Before Minority Augmentation:", len(x_train))
# x_train, y_train = augment_minority_class(x_train, y_train, num_augments=2)
# print("After Minority Augmentation:", len(x_train))

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

# Categorical feature columns passed to CatBoost by name
cat_features = ['매물확인방식', '중개사무소', '제공플랫폼', '방향', '방향_그룹']

# x_train / x_val come out of train_test_split; work on explicit copies so the
# column assignments below do not write into slices of x.
x_train = x_train.copy()
x_val = x_val.copy()

# Make sure every categorical column exists and is string-typed in each frame
# (missing values become the string 'nan'). A column that is absent is added
# with the placeholder '미확인'.
for frame_name, frame in (('x', x), ('x_train', x_train), ('x_val', x_val)):
    for col in cat_features:
        if col not in frame.columns:
            print(f"'{col}' 컬럼이 {frame_name}에 존재하지 않습니다. 기본값 '미확인'으로 추가합니다.")
            frame[col] = '미확인'
        frame[col] = frame[col].astype(str)

# Base CatBoost model; GridSearchCV fits clones of it, so this instance itself
# stays unfitted.
cat_model = CatBoostClassifier(
    cat_features=cat_features,
    auto_class_weights="Balanced",  # balance class weights automatically
    verbose=0
)

# Hyper-parameter grid (2 * 3 * 2 * 3 = 36 candidates)
param_grid = {
    'iterations': [500, 1000],     # number of boosting iterations
    'depth': [6, 8, 10],           # tree depth
    'learning_rate': [0.03, 0.07], # learning rate
    'l2_leaf_reg': [3, 5, 7],      # L2 regularisation
}

# Stratified 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=cat_model,
    param_grid=param_grid,
    cv=skf,                   # stratified K-fold
    scoring='f1_macro',       # select by macro F1
    n_jobs=-1,                # run candidates in parallel
    verbose=1                 # report progress
)

# Fit on the training split only. Fitting on the full (x, y) would include the
# rows of x_val, making the validation score computed afterwards optimistic,
# and would bypass the noise-augmented x_train entirely.
grid_search.fit(x_train, y_train)

# Report the best hyper-parameters and their cross-validated score
print("Best parameters:", grid_search.best_params_)
print("Best Macro F1-score:", grid_search.best_score_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [None]:
# Sanity check: show which columns are treated as categorical
print(cat_features)

In [None]:
# Take the best estimator found by the grid search and score it on the
# validation split with macro-averaged F1.
# NOTE(review): this score is only an honest hold-out estimate if the rows of
# x_val were not part of the data the grid search was fitted on.
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(x_val)

macro_f1 = f1_score(y_val, y_val_pred, average='macro')
print(f"Validation Macro F1-score: {macro_f1:.4f}")

In [None]:
# Load the test data from the dataset directory
test = pd.read_csv(f'{path}/test.csv')

In [None]:
# Fill missing values in the test set with the imputer already fitted on the
# training columns (transform only, no re-fitting)
test[columns_fill_knn] = knn_imputer.transform(test[columns_fill_knn])

In [None]:
# def create_new_features(x): 
#     # 1 단위면적당 가격 (㎡당 가격)
#     x['단위면적당가격'] = x['보증금'] / x['전용면적']
#     x['단위면적당가격'].fillna(x['단위면적당가격'].median(), inplace=True)
    
#     # 2 보증금 대비 월세 비율
#     x['보증금_월세비율'] = x['보증금'] / (x['월세'] + 1)
#     x['보증금_월세비율'].fillna(x['보증금_월세비율'].median(), inplace=True)
    
#     # 3 층수 비율 (해당층 / 총층)
#     x['층수_비율'] = x['해당층'] / x['총층']
#     x['층수_비율'].fillna(x['층수_비율'].median(), inplace=True)
    
#     # 4 게재일 관련 Feature
#     x['게재일'] = pd.to_datetime(x['게재일'])
#     x['게재요일'] = x['게재일'].dt.weekday
#     x['게재일_경과일'] = (datetime(2025, 1, 20) - x['게재일']).dt.days
    
#     # 5 방향 그룹화
#     direction_map = {'동향': '동', '서향': '서', '남향': '남', '북향': '북', '남동향': '남', '북동향': '북'}
#     x['방향_그룹'] = x['방향'].map(direction_map)
    
#     # 6 이상 가격 탐지 Feature
#     unit_price_mean = x['단위면적당가격'].mean()
#     unit_price_std = x['단위면적당가격'].std()
#     x['가격_이상치'] = ((x['단위면적당가격'] - unit_price_mean) / unit_price_std).abs()
    
#     # 7 주차 가능 여부 수치 변환
#     x['주차가능여부'] = x['주차가능여부'].map({'가능': 1, '불가능': 0})
    
#     # 8 월세 + 관리비 총 비용
#     x['월세_총비용'] = x['월세'] + x['관리비']
#     x['월세_총비용'].fillna(x['월세_총비용'].median(), inplace=True)
    
#     # 9 관리비 비율 (관리비 / 월세)
#     x['관리비_비율'] = x['관리비'] / (x['월세'] + 1)
#     x['관리비_비율'].fillna(x['관리비_비율'].median(), inplace=True)
    
#     # 10 방수 밀집도 (방수 / 전용면적) & 욕실 밀집도 (욕실수 / 전용면적)
#     x['방수_밀집도'] = x['방수'] / (x['전용면적'] + 1)
#     x['욕실_밀집도'] = x['욕실수'] / (x['전용면적'] + 1)
    
#     # 11 플랫폼별 평균 보증금 / 월세 차이
#     플랫폼_보증금평균 = train.groupby('제공플랫폼')['보증금'].mean()
#     플랫폼_월세평균 = train.groupby('제공플랫폼')['월세'].mean()
    
#     x['제공플랫폼_보증금차이'] = x['보증금'] - x['제공플랫폼'].map(플랫폼_보증금평균)
#     x['제공플랫폼_월세차이'] = x['월세'] - x['제공플랫폼'].map(플랫폼_월세평균)
#     return x

# test = create_new_features(test)
# test.head()

# Apply the same feature engineering to the test set. The statistics used
# inside the function (unit-price mean/std, per-platform means) are computed
# from the frame passed in, i.e. from the test rows here.
test = create_new_features(test)

In [None]:
# # Label Encoding 
# for col in label_encode_cols:
#     if col in test.columns:
#         le = label_encoders[col] 
#         test[col] = test[col].astype(str)
#         unseen = set(test[col].unique()) - set(le.classes_) 
#         # unseen = []

#         if unseen: # 뜬금포가 있다
#             le.classes_ = np.append(le.classes_, list(unseen))
#         test[col] = le.transform(test[col].astype(str))

In [None]:
# # One-Hot Encoding
# test_encoded = one_hot_encoder.transform(test[one_hot_cols])
# test_encoded_df = pd.DataFrame(test_encoded, columns=one_hot_encoder.get_feature_names_out(one_hot_cols), index=test.index)

# test = pd.concat([test.drop(columns=one_hot_cols), test_encoded_df], axis=1)

In [None]:
# Cast all categorical columns of the test set to strings in one step
test[cat_features] = test[cat_features].astype(str)

In [None]:
# Remove the row identifier so the test columns match the training features.
# Rebinding with errors='ignore' keeps the cell re-runnable: an in-place drop
# raises KeyError the second time because 'ID' is already gone.
test = test.drop(columns=['ID'], errors='ignore')

In [None]:
# Predict with the fitted best estimator. `cat_model` is only the template
# handed to GridSearchCV, which fits clones of it, so `cat_model` itself is
# never fitted and cannot predict. np.ravel flattens the result in case the
# model returns a column vector.
pred = pd.Series(np.ravel(best_model.predict(test)))

In [None]:
# Print the sum of the predictions (the number of positives when labels are
# 0/1) and that sum as a percentage of all predictions
print('1:', pred.sum(), '| ratio:', (pred.sum()/len(pred)*100))

In [None]:
# Load the sample submission file to use as the output template
submit = pd.read_csv(f'{path}/sample_submission.csv')

In [None]:
submit['허위매물여부'] = pred # insert our predictions (assigned by index alignment)
submit.head()

In [None]:
# Write the submission to the current directory without the index column
submit.to_csv('./baseline_submission_220908.csv',index=False)