In [2]:
!pip install pyarrow
!pip install fastparquet



In [3]:
import numpy as np
import pandas as pd
import glob
import random
import os
import matplotlib.pyplot as plt
import seaborn as sns
import ast

import warnings
warnings.filterwarnings('ignore')

from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

In [4]:
# Fix the random seeds so repeated runs draw the same random numbers.
SD = 42
random.seed(SD)
np.random.seed(SD)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(SD)

In [5]:
# Mount Google Drive; every data path below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Directory that holds the per-sensor parquet files
data_dir = '/content/drive/MyDrive/ETRI/ETRI_lifelog_dataset/ch2025_data_items'

# Full paths of every ch2025_*.parquet file in that directory
# (glob does not guarantee any particular ordering)
parquet_files = glob.glob(os.path.join(data_dir, 'ch2025_*.parquet'))

In [7]:
# List the dataset directory to see which sensor files are available.
!ls /content/drive/MyDrive/ETRI/ETRI_lifelog_dataset/ch2025_data_items

ch2025_mACStatus.parquet  ch2025_mGps.parquet		ch2025_mWifi.parquet
ch2025_mActivity.parquet  ch2025_mLight.parquet		ch2025_wHr.parquet
ch2025_mAmbience.parquet  ch2025_mScreenStatus.parquet	ch2025_wLight.parquet
ch2025_mBle.parquet	  ch2025_mUsageStats.parquet	ch2025_wPedo.parquet


In [8]:
# Peek at a single sensor file (charging status) before loading everything.
# Reuse data_dir instead of repeating the absolute dataset path.
file_path = os.path.join(data_dir, 'ch2025_mACStatus.parquet')
df = pd.read_parquet(file_path)
# Basic information: shape, dtypes, null ratio per column, first rows
print("📄 Shape:", df.shape)
print("\n📌 Columns and types:")
print(df.dtypes)

print("\n🧼 Null 비율:")
print(df.isnull().mean())

print("\n🔍 샘플 데이터:")
display(df.head())

📄 Shape: (939896, 3)

📌 Columns and types:
subject_id            object
timestamp     datetime64[ns]
m_charging             int64
dtype: object

🧼 Null 비율:
subject_id    0.0
timestamp     0.0
m_charging    0.0
dtype: float64

🔍 샘플 데이터:


Unnamed: 0,subject_id,timestamp,m_charging
0,id01,2024-06-26 12:03:00,0
1,id01,2024-06-26 12:04:00,0
2,id01,2024-06-26 12:05:00,0
3,id01,2024-06-26 12:06:00,0
4,id01,2024-06-26 12:07:00,0


In [9]:
# Dictionary mapping sensor name -> DataFrame
lifelog_data = {}

# Read every parquet file; the key is the file name without the
# 'ch2025_' prefix and '.parquet' extension (e.g. 'mACStatus').
for file_path in parquet_files:
    name = os.path.basename(file_path).replace('.parquet', '').replace('ch2025_', '')
    lifelog_data[name] = pd.read_parquet(file_path)
    print(f"✅ Loaded: {name}, shape = {lifelog_data[name].shape}")

✅ Loaded: mACStatus, shape = (939896, 3)
✅ Loaded: mActivity, shape = (961062, 3)
✅ Loaded: mAmbience, shape = (476577, 3)
✅ Loaded: mBle, shape = (21830, 3)
✅ Loaded: mGps, shape = (800611, 3)
✅ Loaded: mScreenStatus, shape = (939653, 3)
✅ Loaded: mWifi, shape = (76336, 3)
✅ Loaded: wHr, shape = (382918, 3)
✅ Loaded: wLight, shape = (633741, 3)
✅ Loaded: wPedo, shape = (748100, 9)
✅ Loaded: mLight, shape = (96258, 3)
✅ Loaded: mUsageStats, shape = (45197, 3)


In [10]:
# Expose every loaded frame as a notebook-level variable named "<sensor>_df"
# (e.g. mACStatus_df). Note the loop variable rebinds the name `df`.
for key, df in lifelog_data.items():
    globals()[f"{key}_df"] = df

In [11]:
# lifelog_date is the day the sensor timestamps are matched against.
metrics_train = pd.read_csv('/content/drive/MyDrive/ETRI/ETRI_lifelog_dataset/ch2025_metrics_train.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/ETRI/ETRI_lifelog_dataset/ch2025_submission_sample.csv')

# Only the last expression of a cell is rendered automatically, so both
# previews are displayed explicitly (the first one used to be discarded).
display(metrics_train.head())
print('-----------------')
display(sample_submission.head())

-----------------


Unnamed: 0,subject_id,sleep_date,lifelog_date,Q1,Q2,Q3,S1,S2,S3
0,id01,2024-07-31,2024-07-30,0,0,0,0,0,0
1,id01,2024-08-01,2024-07-31,0,0,0,0,0,0
2,id01,2024-08-02,2024-08-01,0,0,0,0,0,0
3,id01,2024-08-03,2024-08-02,0,0,0,0,0,0
4,id01,2024-08-04,2024-08-03,0,0,0,0,0,0


In [12]:
# ✅ Reference pairs (subject_id, lifelog_date) taken from the submission sample.
# Note: this converts sample_submission['lifelog_date'] to datetime in place.
sample_submission['lifelog_date'] = pd.to_datetime(sample_submission['lifelog_date'])
test_keys = set(zip(sample_submission['subject_id'], sample_submission['lifelog_date'].dt.date))

# ✅ Registry of sensor frames with a manually specified timestamp column each
dataframes = {
    'mACStatus': (mACStatus_df, 'timestamp'),
    'mActivity': (mActivity_df, 'timestamp'),
    'mAmbience': (mAmbience_df, 'timestamp'),
    'mBle': (mBle_df, 'timestamp'),
    'mGps': (mGps_df, 'timestamp'),
    'mLight': (mLight_df, 'timestamp'),
    'mScreenStatus': (mScreenStatus_df, 'timestamp'),
    'mUsageStats': (mUsageStats_df, 'timestamp'),
    'mWifi': (mWifi_df, 'timestamp'),
    'wHr': (wHr_df, 'timestamp'),
    'wLight': (wLight_df, 'timestamp'),
    'wPedo': (wPedo_df, 'timestamp'),
}

# ✅ 분리 함수
# test_df = 제출 대상 + 날짜와 일치하는 데이터만 포함
# train_df = 나머지 모든 데이터 ( 모델학습에 사용될 것 )
def split_test_train_hourly(df, subject_col='subject_id', timestamp_col='timestamp', keys=None):
    """Split a sensor frame into rows that belong to test days and all other rows.

    A row belongs to the test split when its (subject, calendar date) pair is
    contained in ``keys``.

    Parameters
    ----------
    df : pd.DataFrame
        Sensor readings; it is not modified.
    subject_col : str
        Column holding the subject identifier.
    timestamp_col : str
        Column holding the reading time; values that cannot be parsed are dropped.
    keys : iterable of (subject, datetime.date), optional
        Pairs that define the test split. Defaults to the notebook-level
        ``test_keys`` set.

    Returns
    -------
    (test_df, train_df) : tuple of pd.DataFrame
        Both carry the parsed timestamp plus added ``date`` and ``hour`` columns.
    """
    if keys is None:
        keys = test_keys
    keys = set(keys)

    stamps = pd.to_datetime(df[timestamp_col], errors='coerce')
    valid = stamps.notna()
    stamps = stamps[valid]

    # assign() returns a new frame, so neither the caller's frame nor a
    # filtered view of it is written to.
    frame = df.loc[valid].assign(**{
        timestamp_col: stamps,
        'date': stamps.dt.date,
        'hour': stamps.dt.hour,
    })

    # Evaluate membership once and reuse the mask for both halves.
    is_test = np.fromiter(
        (pair in keys for pair in zip(frame[subject_col], frame['date'])),
        dtype=bool,
        count=len(frame),
    )
    return frame[is_test], frame[~is_test]

# ✅ Store the results
# Split every registered sensor frame and publish the halves as notebook-level
# variables named "<sensor>_test_hourly" and "<sensor>_train_hourly".
for name, (df, ts_col) in dataframes.items():
    print(f"⏳ {name} 시간 단위 분리 중...")
    # A copy is passed so the original sensor frame is left untouched.
    test_df, train_df = split_test_train_hourly(df.copy(), subject_col='subject_id', timestamp_col=ts_col)
    globals()[f"{name}_test_hourly"] = test_df
    globals()[f"{name}_train_hourly"] = train_df
    print(f"✅ {name}_test_hourly → {test_df.shape}, {name}_train_hourly → {train_df.shape}")


⏳ mACStatus 시간 단위 분리 중...
✅ mACStatus_test_hourly → (335849, 5), mACStatus_train_hourly → (604047, 5)
⏳ mActivity 시간 단위 분리 중...
✅ mActivity_test_hourly → (343579, 5), mActivity_train_hourly → (617483, 5)
⏳ mAmbience 시간 단위 분리 중...
✅ mAmbience_test_hourly → (170453, 5), mAmbience_train_hourly → (306124, 5)
⏳ mBle 시간 단위 분리 중...
✅ mBle_test_hourly → (8140, 5), mBle_train_hourly → (13690, 5)
⏳ mGps 시간 단위 분리 중...
✅ mGps_test_hourly → (287386, 5), mGps_train_hourly → (513225, 5)
⏳ mLight 시간 단위 분리 중...
✅ mLight_test_hourly → (34439, 5), mLight_train_hourly → (61819, 5)
⏳ mScreenStatus 시간 단위 분리 중...
✅ mScreenStatus_test_hourly → (336160, 5), mScreenStatus_train_hourly → (603493, 5)
⏳ mUsageStats 시간 단위 분리 중...
✅ mUsageStats_test_hourly → (16499, 5), mUsageStats_train_hourly → (28698, 5)
⏳ mWifi 시간 단위 분리 중...
✅ mWifi_test_hourly → (27467, 5), mWifi_train_hourly → (48869, 5)
⏳ wHr 시간 단위 분리 중...
✅ wHr_test_hourly → (143311, 5), wHr_train_hourly → (239607, 5)
⏳ wLight 시간 단위 분리 중...
✅ wLight_test_hou

1분 단위 데이터 → 1시간 단위로 집계

In [13]:
def process_mACStatus_hourly_wide(df):
    """Summarise charging status per hour and return one wide row per subject-day.

    For every (subject_id, date, hour) group of ``m_charging`` readings, taken
    in timestamp order, the following are computed and stored in columns
    suffixed with the zero-padded hour (e.g. ``charging_ratio_09``):

    * ``charging_ratio``        - mean of the status values
    * ``charging_transitions``  - number of changes between consecutive readings
    * ``avg_charging_duration`` - mean length of runs of consecutive 1s (0 if none)
    * ``max_charging_duration`` - longest run of consecutive 1s (0 if none)

    The input frame is not modified. Hours without readings produce no columns
    for that row (NaN after the frame is assembled).
    """
    # Work on a copy: the caller's frame must not gain columns or change dtype.
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date
    df['hour'] = df['timestamp'].dt.hour
    # Transitions and run lengths are only meaningful in time order.
    df = df.sort_values(['subject_id', 'timestamp'])

    result_dict = {}

    for (subj, date, hour), group in df.groupby(['subject_id', 'date', 'hour']):
        status = group['m_charging'].values

        ratio_charging = status.mean()
        transitions = (status[1:] != status[:-1]).sum()

        # Lengths of consecutive runs where the status equals 1.
        lengths = []
        current_len = 0
        for val in status:
            if val == 1:
                current_len += 1
            elif current_len > 0:
                lengths.append(current_len)
                current_len = 0
        if current_len > 0:
            lengths.append(current_len)

        avg_charging_duration = np.mean(lengths) if lengths else 0
        max_charging_duration = np.max(lengths) if lengths else 0

        feats = result_dict.setdefault((subj, date), {})
        feats[f"charging_ratio_{hour:02d}"] = ratio_charging
        feats[f"charging_transitions_{hour:02d}"] = transitions
        feats[f"avg_charging_duration_{hour:02d}"] = avg_charging_duration
        feats[f"max_charging_duration_{hour:02d}"] = max_charging_duration

    # One record per (subject, date); build the frame in a single call.
    rows = [
        {'subject_id': subj, 'date': date, **feats}
        for (subj, date), feats in result_dict.items()
    ]
    return pd.DataFrame(rows)
mACStatus_hourly_wide_df = process_mACStatus_hourly_wide(mACStatus_train_hourly)



In [14]:
def process_mActivity_hourly_wide(df):
    """Return, per subject-day, the hourly share of each activity code 0-8.

    For every (subject_id, date, hour) group the relative frequency of each
    ``m_activity`` code is stored in ``activity_<code>_ratio_<HH>``; codes that
    do not occur in that hour get 0. The input frame is not modified.
    """
    # Work on a copy so the caller's frame keeps its columns and dtypes.
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date
    df['hour'] = df['timestamp'].dt.hour

    result_dict = {}

    for (subj, date, hour), group in df.groupby(['subject_id', 'date', 'hour']):
        counts = group['m_activity'].value_counts(normalize=True)
        feats = result_dict.setdefault((subj, date), {})

        # e.g. activity_3_ratio_14 = share of code 3 among the readings at 14h
        for i in range(9):
            feats[f'activity_{i}_ratio_{hour:02d}'] = counts.get(i, 0)

    rows = [
        {'subject_id': subj, 'date': date, **feats}
        for (subj, date), feats in result_dict.items()
    ]
    return pd.DataFrame(rows)
mActivity_hourly_wide_df = process_mActivity_hourly_wide(mActivity_train_hourly)



In [15]:
import ast

top_10_labels = [
    "Inside, small room", "Speech", "Silence", "Music",
    "Narration, monologue", "Child speech, kid speaking",
    "Conversation", "Speech synthesizer", "Shout", "Babbling"
]

def process_mAmbience_hourly_wide(df):
    """Average ambient-sound label probabilities per hour, one wide row per subject-day.

    Each ``m_ambience`` value is an iterable of (label, probability) pairs, or
    the string form of one. Probabilities of labels listed in ``top_10_labels``
    are kept per label; all remaining labels are summed into ``others``. The
    per-reading values are averaged per (subject_id, date, hour) and pivoted to
    columns named ``<label>_<HH>``.
    """
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date
    df['hour'] = df['timestamp'].dt.hour

    prob_cols = top_10_labels + ['others']
    tracked = set(top_10_labels)

    # Collect one record per reading and build the frame once. Writing single
    # cells with df.at inside iterrows is far slower and mis-assigns values
    # when index labels repeat.
    records = []
    for raw in df['m_ambience']:
        parsed = ast.literal_eval(raw) if isinstance(raw, str) else raw
        probs = dict.fromkeys(prob_cols, 0.0)
        others_prob = 0.0
        for label, prob in parsed:
            prob = float(prob)
            if label in tracked:
                probs[label] = prob
            else:
                others_prob += prob
        probs['others'] = others_prob
        records.append(probs)

    # Align positionally: both sides get a fresh 0..n-1 index.
    features = pd.concat(
        [
            df[['subject_id', 'date', 'hour']].reset_index(drop=True),
            pd.DataFrame(records, columns=prob_cols),
        ],
        axis=1,
    )

    # Hourly mean per subject and day
    summary = features.groupby(['subject_id', 'date', 'hour'])[prob_cols].mean().reset_index()

    # Pivot to wide format: one column per (label, hour)
    wide_df = summary.pivot(index=['subject_id', 'date'], columns='hour', values=prob_cols)
    wide_df.columns = [f"{col}_{hour:02d}" for col, hour in wide_df.columns]
    wide_df = wide_df.reset_index()

    return wide_df
mAmbience_hourly_wide_df = process_mAmbience_hourly_wide(mAmbience_train_hourly)


In [16]:
import numpy as np
import pandas as pd
import ast

def process_mBle_hourly_wide(df):
    """Summarise Bluetooth scans per hour and return one wide row per subject-day.

    Each ``m_ble`` value is an iterable of device dicts (or its string form)
    with ``rssi`` and ``device_class`` entries. Per reading, the RSSI
    mean/min/max and the number of devices with class ``'0'`` versus any other
    class are computed. Readings are then aggregated per
    (subject_id, date, hour) and pivoted to ``<metric>_<HH>`` columns:
    ``rssi_mean``, ``rssi_min``, ``rssi_max``, ``device_class_0_ratio`` and
    ``device_class_others_ratio``.

    Devices whose fields are missing or cannot be converted are skipped.
    """
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date
    df['hour'] = df['timestamp'].dt.hour

    # Per-reading BLE features
    features = []

    for subj, date, hour, raw in zip(df['subject_id'], df['date'], df['hour'], df['m_ble']):
        entry = ast.literal_eval(raw) if isinstance(raw, str) else raw
        if entry is None:
            entry = []

        rssi_list = []
        class_0_cnt = 0
        class_other_cnt = 0

        for device in entry:
            # Skip malformed devices only; any other error should surface
            # instead of being swallowed by a bare except.
            try:
                rssi = int(device['rssi'])
                rssi_list.append(rssi)
                if str(device['device_class']) == '0':
                    class_0_cnt += 1
                else:
                    class_other_cnt += 1
            except (KeyError, IndexError, TypeError, ValueError, OverflowError):
                continue

        features.append({
            'subject_id': subj,
            'date': date,
            'hour': hour,
            'device_class_0_cnt': class_0_cnt,
            'device_class_others_cnt': class_other_cnt,
            'rssi_mean': np.mean(rssi_list) if rssi_list else np.nan,
            'rssi_min': np.min(rssi_list) if rssi_list else np.nan,
            'rssi_max': np.max(rssi_list) if rssi_list else np.nan,
        })

    df_feat = pd.DataFrame(features)

    # Aggregate the readings of each hour
    agg = df_feat.groupby(['subject_id', 'date', 'hour']).agg({
        'device_class_0_cnt': 'sum',
        'device_class_others_cnt': 'sum',
        'rssi_mean': 'mean',
        'rssi_min': 'min',
        'rssi_max': 'max',
    }).reset_index()

    # Class shares; an hour without any counted device yields NaN, not a division error
    total = (agg['device_class_0_cnt'] + agg['device_class_others_cnt']).replace(0, np.nan)
    agg['device_class_0_ratio'] = agg['device_class_0_cnt'] / total
    agg['device_class_others_ratio'] = agg['device_class_others_cnt'] / total

    agg = agg.drop(columns=['device_class_0_cnt', 'device_class_others_cnt'])

    # Pivot to wide format
    wide = agg.pivot(index=['subject_id', 'date'], columns='hour')
    wide.columns = [f'{metric}_{hour:02d}' for metric, hour in wide.columns]
    wide = wide.reset_index()

    return wide
mBle_hourly_wide_df = process_mBle_hourly_wide(mBle_train_hourly)


In [17]:
import numpy as np
import pandas as pd
import ast

def process_mGps_hourly_wide(df):
    """Summarise GPS samples per hour and return one wide row per subject-day.

    Each ``m_gps`` value is an iterable of dicts (or its string form) with
    ``altitude``, ``latitude``, ``longitude`` and ``speed`` entries. Per
    reading the altitude mean, latitude/longitude standard deviation and speed
    mean/max/std are computed (``np.std``, i.e. population std). Readings are
    then aggregated per (subject_id, date, hour) - max for ``speed_max``, mean
    for everything else - and pivoted to ``<metric>_<HH>`` columns.

    Samples whose fields are missing or not numeric are skipped.
    """
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date
    df['hour'] = df['timestamp'].dt.hour

    features = []

    for subj, date, hour, raw in zip(df['subject_id'], df['date'], df['hour'], df['m_gps']):
        gps_list = ast.literal_eval(raw) if isinstance(raw, str) else raw
        if gps_list is None:
            gps_list = []

        altitudes = []
        latitudes = []
        longitudes = []
        speeds = []

        for entry in gps_list:
            # Skip malformed samples only; unrelated errors must not be hidden.
            try:
                altitudes.append(float(entry['altitude']))
                latitudes.append(float(entry['latitude']))
                longitudes.append(float(entry['longitude']))
                speeds.append(float(entry['speed']))
            except (KeyError, IndexError, TypeError, ValueError):
                continue

        features.append({
            'subject_id': subj,
            'date': date,
            'hour': hour,
            'altitude_mean': np.mean(altitudes) if altitudes else np.nan,
            'latitude_std': np.std(latitudes) if latitudes else np.nan,
            'longitude_std': np.std(longitudes) if longitudes else np.nan,
            'speed_mean': np.mean(speeds) if speeds else np.nan,
            'speed_max': np.max(speeds) if speeds else np.nan,
            'speed_std': np.std(speeds) if speeds else np.nan,
        })

    df_feat = pd.DataFrame(features)

    # Aggregate the readings of each hour
    agg = df_feat.groupby(['subject_id', 'date', 'hour']).agg({
        'altitude_mean': 'mean',
        'latitude_std': 'mean',
        'longitude_std': 'mean',
        'speed_mean': 'mean',
        'speed_max': 'max',
        'speed_std': 'mean',
    }).reset_index()

    # Pivot to wide format
    wide = agg.pivot(index=['subject_id', 'date'], columns='hour')
    wide.columns = [f'{metric}_{hour:02d}' for metric, hour in wide.columns]
    wide = wide.reset_index()

    return wide
mGps_hourly_wide_df = process_mGps_hourly_wide(mGps_train_hourly)


주의: 기존 light_day_mean, light_night_mean과 같은 시간대 분할은 제외됨
왜냐하면 시간 단위로 나누면 이미 밤/낮 여부는 hour별 컬럼으로 표현되기 때문입니다.

필요하면 이후에 22~05시만 모아서 light_night_total_mean을 별도로 만들 수도 있습니다.

In [18]:
def process_mLight_hourly_wide(df):
    """Hourly phone-light statistics, one wide row per subject-day.

    Computes mean, std, max and min of ``m_light`` per
    (subject_id, date, hour) and spreads them into ``m_light_<stat>_<HH>``
    columns. The input frame is left untouched.
    """
    stamps = pd.to_datetime(df['timestamp'])
    frame = df.assign(timestamp=stamps, date=stamps.dt.date, hour=stamps.dt.hour)

    # One row per subject / day / hour with the four statistics
    hourly = (
        frame.groupby(['subject_id', 'date', 'hour'])['m_light']
        .agg(['mean', 'std', 'max', 'min'])
        .add_prefix('m_light_')
        .reset_index()
    )

    # Spread the hours into columns and flatten the (metric, hour) labels
    pivoted = hourly.pivot(index=['subject_id', 'date'], columns='hour')
    pivoted.columns = [f'{metric}_{hour:02d}' for metric, hour in pivoted.columns]
    return pivoted.reset_index()
mLight_hourly_wide_df = process_mLight_hourly_wide(mLight_train_hourly)


In [19]:
def process_mScreenStatus_hourly_wide(df):
    """Summarise screen usage per hour and return one wide row per subject-day.

    For every (subject_id, date, hour) group of ``m_screen_use`` readings,
    taken in timestamp order, the following are computed and pivoted into
    ``<metric>_<HH>`` columns:

    * ``screen_on_ratio``        - mean of the status values
    * ``screen_on_transitions``  - number of changes between consecutive readings
    * ``screen_on_duration_avg`` - mean length of runs of consecutive 1s (0 if none)
    * ``screen_on_duration_max`` - longest run of consecutive 1s (0 if none)
    """
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date
    df['hour'] = df['timestamp'].dt.hour
    # Transitions and run lengths compare neighbouring readings, so the rows
    # must be in time order; a stable sort keeps the order of equal timestamps.
    df = df.sort_values(['subject_id', 'timestamp'], kind='stable')

    results = []

    for (subj, date, hour), group in df.groupby(['subject_id', 'date', 'hour']):
        status = group['m_screen_use'].values

        ratio_on = status.mean()
        transitions = (status[1:] != status[:-1]).sum()

        # Lengths of consecutive runs where the screen status equals 1.
        durations = []
        current = 0
        for val in status:
            if val == 1:
                current += 1
            elif current > 0:
                durations.append(current)
                current = 0
        if current > 0:
            durations.append(current)

        results.append({
            'subject_id': subj,
            'date': date,
            'hour': hour,
            'screen_on_ratio': ratio_on,
            'screen_on_transitions': transitions,
            'screen_on_duration_avg': np.mean(durations) if durations else 0,
            'screen_on_duration_max': np.max(durations) if durations else 0,
        })

    df_hourly = pd.DataFrame(results)

    # Pivot to wide format
    wide = df_hourly.pivot(index=['subject_id', 'date'], columns='hour')
    wide.columns = [f'{metric}_{hour:02d}' for metric, hour in wide.columns]
    wide = wide.reset_index()

    return wide
mScreenStatus_hourly_wide_df = process_mScreenStatus_hourly_wide(mScreenStatus_train_hourly)


In [20]:
import ast
import numpy as np
import pandas as pd

top_apps = [
    'One UI 홈', '카카오톡', '시스템 UI', 'NAVER', '캐시워크', '성경일독Q',
    'YouTube', '통화', '메시지', '타임스프레드', 'Instagram'
]

def process_mUsageStats_hourly_wide(df):
    """Sum app usage time per hour and return one wide row per subject-day.

    Each ``m_usage_stats`` value is an iterable of dicts (or its string form)
    with ``app_name`` and ``total_time`` entries. Per (subject_id, date, hour)
    the times are summed per app in ``top_apps`` (``<app>_time``) and for all
    other apps together (``others_time``), then pivoted into
    ``<metric>_<HH>`` columns.

    A missing, null or non-numeric ``total_time`` counts as 0.
    """
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date
    df['hour'] = df['timestamp'].dt.hour

    rows = []

    for (subj, date, hour), group in df.groupby(['subject_id', 'date', 'hour']):
        app_time = {app: 0 for app in top_apps}
        others_time = 0

        for raw in group['m_usage_stats']:
            parsed = ast.literal_eval(raw) if isinstance(raw, str) else raw
            if parsed is None:
                continue
            for entry in parsed:
                app = entry.get('app_name')
                # .get() only falls back to 0 when the key is absent; a key that
                # is present but null would otherwise make int() raise.
                try:
                    duration = int(entry.get('total_time', 0))
                except (TypeError, ValueError):
                    duration = 0
                if app in top_apps:
                    app_time[app] += duration
                else:
                    others_time += duration

        feature = {
            'subject_id': subj,
            'date': date,
            'hour': hour,
            'others_time': others_time
        }
        feature.update({f"{app}_time": app_time[app] for app in top_apps})
        rows.append(feature)

    df_feat = pd.DataFrame(rows)

    # Pivot to wide format
    wide = df_feat.pivot(index=['subject_id', 'date'], columns='hour')
    wide.columns = [f"{col}_{hour:02d}" for col, hour in wide.columns]
    wide = wide.reset_index()

    return wide
mUsageStats_hourly_wide_df = process_mUsageStats_hourly_wide(mUsageStats_train_hourly)


In [21]:
import ast
import numpy as np
import pandas as pd

def process_mWifi_hourly_wide(df):
    """Summarise Wi-Fi scans per hour and return one wide row per subject-day.

    Each ``m_wifi`` value is an iterable of access-point dicts (or its string
    form) with an ``rssi`` entry. Per (subject_id, date, hour) all RSSI values
    are pooled and their mean, min, max and count are pivoted into
    ``wifi_rssi_mean_<HH>``, ``wifi_rssi_min_<HH>``, ``wifi_rssi_max_<HH>`` and
    ``wifi_detected_cnt_<HH>`` columns.

    Access points whose ``rssi`` is missing or not convertible are skipped.
    """
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date
    df['hour'] = df['timestamp'].dt.hour

    results = []

    for (subj, date, hour), group in df.groupby(['subject_id', 'date', 'hour']):
        rssi_all = []

        for raw in group['m_wifi']:
            parsed = ast.literal_eval(raw) if isinstance(raw, str) else raw
            if parsed is None:
                continue
            for ap in parsed:
                # Skip malformed entries only; unrelated errors must not be hidden.
                try:
                    rssi_all.append(int(ap['rssi']))
                except (KeyError, IndexError, TypeError, ValueError, OverflowError):
                    continue

        results.append({
            'subject_id': subj,
            'date': date,
            'hour': hour,
            'wifi_rssi_mean': np.mean(rssi_all) if rssi_all else np.nan,
            'wifi_rssi_min': np.min(rssi_all) if rssi_all else np.nan,
            'wifi_rssi_max': np.max(rssi_all) if rssi_all else np.nan,
            'wifi_detected_cnt': len(rssi_all)
        })

    df_hourly = pd.DataFrame(results)

    # Pivot to wide format
    wide = df_hourly.pivot(index=['subject_id', 'date'], columns='hour')
    wide.columns = [f"{col}_{hour:02d}" for col, hour in wide.columns]
    wide = wide.reset_index()

    return wide
mWifi_hourly_wide_df = process_mWifi_hourly_wide(mWifi_train_hourly)



In [22]:
import ast
import numpy as np
import pandas as pd

def process_wHr_hourly_wide(df):
    """Hourly heart-rate statistics, one wide row per subject-day.

    Every ``heart_rate`` value is an iterable of readings (or its string
    form); ``None`` readings are ignored. All readings of a
    (subject_id, date, hour) group are pooled and their mean, population std,
    max, min and the share above 100 are pivoted into ``hr_<stat>_<HH>``
    columns. Hours without any reading are skipped.
    """
    frame = df.copy()
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])
    frame['date'] = frame['timestamp'].dt.date
    frame['hour'] = frame['timestamp'].dt.hour

    hourly_stats = []
    for (subject, day, hr_of_day), chunk in frame.groupby(['subject_id', 'date', 'hour']):
        # Flatten every reading of the hour into one list of ints
        beats = [
            int(beat)
            for cell in chunk['heart_rate']
            for beat in (ast.literal_eval(cell) if isinstance(cell, str) else cell)
            if beat is not None
        ]
        if len(beats) == 0:
            continue

        n_high = sum(1 for beat in beats if beat > 100)
        hourly_stats.append({
            'subject_id': subject,
            'date': day,
            'hour': hr_of_day,
            'hr_mean': np.mean(beats),
            'hr_std': np.std(beats),
            'hr_max': np.max(beats),
            'hr_min': np.min(beats),
            'hr_above_100_ratio': n_high / len(beats)
        })

    # Spread the hours into columns and flatten the (metric, hour) labels
    pivoted = pd.DataFrame(hourly_stats).pivot(index=['subject_id', 'date'], columns='hour')
    pivoted.columns = [f"{metric}_{hour:02d}" for metric, hour in pivoted.columns]
    return pivoted.reset_index()
wHr_hourly_wide_df = process_wHr_hourly_wide(wHr_train_hourly)


In [23]:
import numpy as np
import pandas as pd

def process_wLight_hourly_wide(df):
    """Hourly watch-light statistics, one wide row per subject-day.

    Per (subject_id, date, hour) the non-null ``w_light`` values are reduced
    to mean, population std, max and min and pivoted into
    ``wlight_<stat>_<HH>`` columns. Hours with no non-null value are skipped.
    """
    frame = df.copy()
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])
    frame['date'] = frame['timestamp'].dt.date
    frame['hour'] = frame['timestamp'].dt.hour

    hourly_stats = []
    for (subject, day, hr_of_day), chunk in frame.groupby(['subject_id', 'date', 'hour']):
        readings = chunk['w_light'].dropna().values
        if len(readings) > 0:
            hourly_stats.append({
                'subject_id': subject,
                'date': day,
                'hour': hr_of_day,
                'wlight_mean': np.mean(readings),
                'wlight_std': np.std(readings),
                'wlight_max': np.max(readings),
                'wlight_min': np.min(readings)
            })

    # Spread the hours into columns and flatten the (metric, hour) labels
    pivoted = pd.DataFrame(hourly_stats).pivot(index=['subject_id', 'date'], columns='hour')
    pivoted.columns = [f"{metric}_{hour:02d}" for metric, hour in pivoted.columns]
    return pivoted.reset_index()
wLight_hourly_wide_df = process_wLight_hourly_wide(wLight_train_hourly)


In [24]:
def process_wPedo_hourly_wide(df):
    """Hourly pedometer summary, one wide row per subject-day.

    Per (subject_id, date, hour): sum of ``step``, mean of ``step_frequency``,
    sum of ``distance``, mean and max of ``speed`` and sum of
    ``burned_calories``, pivoted into ``<metric>_<HH>`` columns.
    """
    frame = df.copy()
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])
    frame['date'] = frame['timestamp'].dt.date
    frame['hour'] = frame['timestamp'].dt.hour

    # Named aggregation yields the flat output names directly
    hourly = (
        frame.groupby(['subject_id', 'date', 'hour'])
        .agg(
            step_sum=('step', 'sum'),
            step_frequency_mean=('step_frequency', 'mean'),
            distance_sum=('distance', 'sum'),
            speed_mean=('speed', 'mean'),
            speed_max=('speed', 'max'),
            burned_calories_sum=('burned_calories', 'sum'),
        )
        .reset_index()
    )

    # Spread the hours into columns and flatten the (metric, hour) labels
    pivoted = hourly.pivot(index=['subject_id', 'date'], columns='hour')
    pivoted.columns = [f"{metric}_{hour:02d}" for metric, hour in pivoted.columns]
    return pivoted.reset_index()
wPedo_hourly_wide_df = process_wPedo_hourly_wide(wPedo_train_hourly)


In [25]:
from functools import reduce

# Wide per-sensor feature frames, in merge order, tagged with their sensor name.
feature_sources = [
    ('mACStatus', mACStatus_hourly_wide_df),
    ('mActivity', mActivity_hourly_wide_df),
    ('mAmbience', mAmbience_hourly_wide_df),
    ('mBle', mBle_hourly_wide_df),
    ('mGps', mGps_hourly_wide_df),
    ('mLight', mLight_hourly_wide_df),
    ('mScreenStatus', mScreenStatus_hourly_wide_df),
    ('mUsageStats', mUsageStats_hourly_wide_df),
    ('mWifi', mWifi_hourly_wide_df),
    ('wHr', wHr_hourly_wide_df),
    ('wLight', wLight_hourly_wide_df),
    ('wPedo', wPedo_hourly_wide_df),  # last feature frame
]

merge_keys = ['subject_id', 'date']

# Several sensors emit identically named features (GPS and the pedometer both
# produce speed_mean_HH / speed_max_HH). pd.merge would rename such columns to
# "..._x" / "..._y", which no longer end in the hour and are therefore dropped
# by reshape_wide_to_long. Prefix later duplicates with their sensor name so
# every feature keeps its "<name>_<HH>" form.
seen_feature_cols = set()
feature_dfs = []
for source_name, wide in feature_sources:
    clashes = {
        col: f"{source_name}_{col}"
        for col in wide.columns
        if col not in merge_keys and col in seen_feature_cols
    }
    wide = wide.rename(columns=clashes)
    seen_feature_cols.update(col for col in wide.columns if col not in merge_keys)
    feature_dfs.append(wide)

merged_hourly_df = reduce(
    lambda left, right: pd.merge(left, right, on=merge_keys, how='outer'),
    feature_dfs
)
merged_hourly_df = merged_hourly_df.sort_values(merge_keys).reset_index(drop=True)


In [26]:
def expand_labels_to_hourly(metrics_train):
    """Repeat every label row once per hour of the day.

    Adds a ``date`` column (calendar date parsed from ``lifelog_date``) and an
    ``hour`` column; the result holds all rows for hour 0 first, then hour 1,
    and so on up to hour 23. The input frame is not modified.
    """
    dated = metrics_train.assign(
        date=pd.to_datetime(metrics_train['lifelog_date']).dt.date
    )
    per_hour = [dated.assign(hour=hour) for hour in range(24)]
    return pd.concat(per_hour, ignore_index=True)


In [27]:
def expand_submission_to_hourly(sample_submission):
    """Repeat every submission row once per hour of the day.

    Adds a ``date`` column (calendar date parsed from ``lifelog_date``) and an
    ``hour`` column; the result holds all rows for hour 0 first, then hour 1,
    and so on up to hour 23. The input frame is not modified.
    """
    dated = sample_submission.assign(
        date=pd.to_datetime(sample_submission['lifelog_date']).dt.date
    )
    per_hour = [dated.assign(hour=hour) for hour in range(24)]
    return pd.concat(per_hour, ignore_index=True)


In [28]:
def reshape_wide_to_long(df):
    """Turn ``<feature>_<hour>`` columns into one row per subject, date and hour.

    Every column whose name ends in ``_<digits>`` is treated as the value of
    feature ``<feature>`` at hour ``int(<digits>)``. Columns without such a
    suffix (other than ``subject_id`` / ``date``) are ignored.

    Parameters
    ----------
    df : pd.DataFrame
        Wide frame with ``subject_id`` and ``date`` columns; it is not modified.

    Returns
    -------
    pd.DataFrame
        Columns ``subject_id``, ``date``, one column per feature and ``hour``,
        sorted by subject, date and hour. A feature that has no column for some
        hour is NaN there. If no hourly column exists, an empty frame with the
        columns ``subject_id``, ``date`` and ``hour`` is returned.
    """
    id_cols = ['subject_id', 'date']

    # hour -> {original column name: feature name}. The real column name is
    # kept, so suffixes that are not zero-padded ("x_5") are not lost.
    by_hour = {}
    for col in df.columns:
        if col in id_cols:
            continue
        feature, sep, suffix = str(col).rpartition('_')
        if not sep or not suffix.isdigit():
            continue
        by_hour.setdefault(int(suffix), {})[col] = feature

    if not by_hour:
        # Check the id columns first so the error for a missing one is unchanged.
        df[id_cols]
        return pd.DataFrame(columns=id_cols + ['hour'])

    pieces = []
    for hour in sorted(by_hour):
        renames = by_hour[hour]
        piece = df[id_cols + list(renames)].rename(columns=renames).assign(hour=hour)
        pieces.append(piece)

    merged = pd.concat(pieces, axis=0, ignore_index=True)
    return merged.sort_values(['subject_id', 'date', 'hour']).reset_index(drop=True)


In [29]:
# Expand the daily labels / submission rows to one row per hour
metrics_train_hourly = expand_labels_to_hourly(metrics_train)
submission_hourly = expand_submission_to_hourly(sample_submission)

# Train sensor features: wide -> long
merged_hourly_long = reshape_wide_to_long(merged_hourly_df)

# Test sensor features. merged_hourly_df is built only from the *_train_hourly
# frames, which exclude every (subject, date) of the submission, so joining the
# submission against it leaves every test feature NaN. Build the same features
# from the *_test_hourly frames instead.
test_feature_sources = [
    ('mACStatus', process_mACStatus_hourly_wide, mACStatus_test_hourly),
    ('mActivity', process_mActivity_hourly_wide, mActivity_test_hourly),
    ('mAmbience', process_mAmbience_hourly_wide, mAmbience_test_hourly),
    ('mBle', process_mBle_hourly_wide, mBle_test_hourly),
    ('mGps', process_mGps_hourly_wide, mGps_test_hourly),
    ('mLight', process_mLight_hourly_wide, mLight_test_hourly),
    ('mScreenStatus', process_mScreenStatus_hourly_wide, mScreenStatus_test_hourly),
    ('mUsageStats', process_mUsageStats_hourly_wide, mUsageStats_test_hourly),
    ('mWifi', process_mWifi_hourly_wide, mWifi_test_hourly),
    ('wHr', process_wHr_hourly_wide, wHr_test_hourly),
    ('wLight', process_wLight_hourly_wide, wLight_test_hourly),
    ('wPedo', process_wPedo_hourly_wide, wPedo_test_hourly),
]

test_merge_keys = ['subject_id', 'date']
test_seen_cols = set()
merged_test_hourly_df = None
for source_name, build_features, raw_test in test_feature_sources:
    # Pass a copy: some feature builders write columns onto their input.
    wide = build_features(raw_test.copy())
    # Prefix feature names already produced by an earlier sensor so the merge
    # does not append "_x"/"_y" suffixes that hide the trailing hour.
    clashes = {
        col: f"{source_name}_{col}"
        for col in wide.columns
        if col not in test_merge_keys and col in test_seen_cols
    }
    wide = wide.rename(columns=clashes)
    test_seen_cols.update(col for col in wide.columns if col not in test_merge_keys)
    if merged_test_hourly_df is None:
        merged_test_hourly_df = wide
    else:
        merged_test_hourly_df = pd.merge(merged_test_hourly_df, wide, on=test_merge_keys, how='outer')

# Align the test features to the train feature columns so both frames share a
# single schema (features unseen in train are dropped, missing ones become NaN).
merged_test_hourly_long = reshape_wide_to_long(merged_test_hourly_df).reindex(
    columns=merged_hourly_long.columns
)

# Merge labels / submission rows with their sensor features
train_hourly_df = pd.merge(metrics_train_hourly, merged_hourly_long, on=['subject_id', 'date', 'hour'], how='inner')
test_hourly_df = pd.merge(submission_hourly, merged_test_hourly_long, on=['subject_id', 'date', 'hour'], how='left')


In [30]:
# Debug aid: print the names currently defined in the notebook namespace.
print(globals().keys())  # check which variables currently exist




In [32]:
# ✅ Output directory (created if it does not exist)
save_dir = "/content/drive/MyDrive/ETRI/processed_data"
os.makedirs(save_dir, exist_ok=True)

# ✅ Save as CSV (utf-8-sig writes a BOM at the start of each file)
train_hourly_df.to_csv(f"{save_dir}/train_hourly_df.csv", index=False, encoding='utf-8-sig')
test_hourly_df.to_csv(f"{save_dir}/test_hourly_df.csv", index=False, encoding='utf-8-sig')
merged_hourly_df.to_csv(f"{save_dir}/merged_hourly_df.csv", index=False, encoding='utf-8-sig')
metrics_train.to_csv(f"{save_dir}/metrics_train.csv", index=False, encoding='utf-8-sig')
sample_submission.to_csv(f"{save_dir}/submission_sample.csv", index=False, encoding='utf-8-sig')

# ✅ Save as Feather (fast and compact)
train_hourly_df.to_feather(f"{save_dir}/train_hourly_df.feather")
test_hourly_df.to_feather(f"{save_dir}/test_hourly_df.feather")
merged_hourly_df.to_feather(f"{save_dir}/merged_hourly_df.feather")

print("✅ 1시간 단위 파일 저장 완료")


✅ 1시간 단위 파일 저장 완료
