In [20]:
import pandas as pd

# 1. 데이터 로드
merged = pd.read_csv("analysis_results/1시간 단위 데이터 분석/merged_hourly_long_utf8sig.csv")
metrics = pd.read_csv("analysis_results/1시간 단위 데이터 분석/metrics_train.csv")
submission_sample = pd.read_csv("analysis_results/1시간 단위 데이터 분석/submission_sample.csv")

In [21]:
# 2. 날짜 정리
merged['date'] = pd.to_datetime(merged['date']).dt.date
metrics['lifelog_date'] = pd.to_datetime(metrics['lifelog_date']).dt.date
submission_sample['lifelog_date'] = pd.to_datetime(submission_sample['lifelog_date']).dt.date

# 3. 학습 데이터 병합
train_df = pd.merge(merged, metrics, left_on=['subject_id', 'date'], right_on=['subject_id', 'lifelog_date'])

In [22]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
# 4. 피처/타겟 분리
target = 'Q1'
drop_cols = ['subject_id', 'date', 'lifelog_date', 'sleep_date', 'Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']

X = train_df.drop(columns=drop_cols)
y = train_df[target]

# 5. 결측치 처리
X.fillna(0, inplace=True)

# 6. 문자열 및 한글 피처 제거
X = X.select_dtypes(include=['number'])
X = X.loc[:, ~X.columns.str.contains('[가-힣]', regex=True)]

# 7. 특수문자 세척 함수
def sanitize_column_names(df):
    df.columns = (
        df.columns
        .str.replace(r"[^\w]", "_", regex=True)
        .str.replace(r"__+", "_", regex=True)
        .str.strip("_")
    )
    return df

X = sanitize_column_names(X)

# 8. 학습/검증 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train = X.loc[X_train.index]
X_val = X.loc[X_val.index]

# 9. 모델 학습
model = LGBMClassifier(n_estimators=500, learning_rate=0.03, random_state=42)
model.fit(X_train, y_train)

print("📊 평가 결과")
print(classification_report(y_val, model.predict(X_val)))


[LightGBM] [Info] Number of positive: 4282, number of negative: 4358
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001671 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10653
[LightGBM] [Info] Number of data points in the train set: 8640, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495602 -> initscore=-0.017593
[LightGBM] [Info] Start training from score -0.017593
📊 평가 결과
              precision    recall  f1-score   support

           0       0.57      0.81      0.67      1090
           1       0.67      0.38      0.48      1070

    accuracy                           0.60      2160
   macro avg       0.62      0.60      0.58      2160
weighted avg       0.62      0.60      0.58      2160



In [23]:
merged_dates = set(merged['date'].astype(str).unique())
submission_dates = set(submission_sample['lifelog_date'].astype(str).unique())
common_dates = merged_dates & submission_dates

print("📅 병합 가능한 날짜 수:", len(common_dates))
print("예시:", list(common_dates)[:5])


📅 병합 가능한 날짜 수: 104
예시: ['2024-07-23', '2024-08-16', '2024-09-21', '2024-08-31', '2024-08-29']


In [27]:
# 문자열로 변환
merged['date_str'] = pd.to_datetime(merged['date']).astype(str)
submission_sample['lifelog_date_str'] = pd.to_datetime(submission_sample['lifelog_date']).astype(str)

# 병합 가능한 key 직접 추출
merged_keys = set(zip(merged['subject_id'], merged['date_str']))
submission_keys = set(zip(submission_sample['subject_id'], submission_sample['lifelog_date_str']))

common_keys = merged_keys & submission_keys

print("merged subject_ids:", merged['subject_id'].unique()[:5])
print("submission_sample subject_ids:", submission_sample['subject_id'].unique()[:5])

print("🔍 병합 가능한 (subject_id, date) 쌍 개수:", len(common_keys))
print("예시:", list(common_keys)[:5])

print("✅ test_df shape after fixed merge:", test_df.shape)
print("✅ test_X shape:", test_X.shape)
print("✅ test_X columns same as X:", list(set(X.columns) - set(test_X.columns)))


# 1. test_df에서 숫자형 피처만 선택
test_df = test_df.select_dtypes(include=['number'])

# 2. 한글 컬럼 제거
test_df = test_df.loc[:, ~test_df.columns.str.contains('[가-힣]', regex=True)]

# 3. 특수문자 제거
test_df = sanitize_column_names(test_df)

# 4. 누락된 컬럼 추가
missing_cols = set(X.columns) - set(test_df.columns)
for col in missing_cols:
    test_df[col] = 0

# 5. 순서 정렬
test_df = test_df[X.columns]
test_X = test_df.copy()

# 예측 수행
test_df[target] = model.predict(test_X)

# 12. 제출 파일 생성
submission = submission_sample[['subject_id', 'lifelog_date']].copy()
submission[target] = test_df[target].values

# 13. 저장
submission.to_csv(f"submission_{target}.csv", index=False, encoding='utf-8-sig')
print(f"✅ 저장 완료: submission_{target}.csv")

merged subject_ids: ['id01' 'id02' 'id03' 'id04' 'id05']
submission_sample subject_ids: ['id01' 'id02' 'id03' 'id04' 'id05']
🔍 병합 가능한 (subject_id, date) 쌍 개수: 0
예시: []
✅ test_df shape after fixed merge: (0, 63)
✅ test_X shape: (0, 63)
✅ test_X columns same as X: []


ValueError: Input data must be 2 dimensional and non empty.