In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import lightgbm as lgb
from utils import feature_eng
from CMI_2025 import score
from sklearn.metrics import f1_score


In [2]:
# Read the raw training table and pass it straight through the project's
# feature-engineering helper.
train = feature_eng(pd.read_csv('../data/train.csv'))

In [3]:
# Gesture labels that the scoring helpers in this notebook treat as "target".
target_gestures = [
    'Above ear - pull hair',
    'Cheek - pinch skin',
    'Eyebrow - pull hair',
    'Eyelash - pull hair',
    'Forehead - pull hairline',
    'Forehead - scratch',
    'Neck - pinch skin',
    'Neck - scratch',
]

# Every other gesture label.
non_target_gestures = [
    'Write name on leg',
    'Wave hello',
    'Glasses on/off',
    'Text on phone',
    'Write name in air',
    'Feel around in tray and pull out an object',
    'Scratch knee/leg skin',
    'Pull air toward your face',
    'Drink from bottle/cup',
    'Pinch knee/leg skin',
]

# Complete label list: target labels first, then the non-target ones.
all_classes = [*target_gestures, *non_target_gestures]

In [4]:
# Keep only the rows whose gesture label is one of the target gestures.
is_target_row = train["gesture"].isin(target_gestures)
train = train[is_target_row]

In [7]:
import warnings

# Install a catch-all "ignore" filter: every warning raised after this cell is hidden.
warnings.filterwarnings(action="ignore")

In [8]:
# Columns that describe a row/sequence rather than measure it; excluded from the features.
meta = {'gesture', 'gesture_int', 'sequence_type', 'behavior', 'orientation',
        'row_id', 'subject', 'phase', 'sequence_id', 'sequence_counter'}

feat_cols = [c for c in train.columns if c not in meta]
# Every feature column whose name does not start with "thm_" or "tof_".
imu_cols = [c for c in feat_cols if not c.startswith(("thm_", "tof_"))]

# Row offsets used for the lagged copies of each column (hoisted: loop-invariant).
lag_periods = [1, 2, 3]


def _summary_stats(series):
    """Return ``[mean, var, max, min]`` of ``series``.

    pandas reductions skip NaN by default, so the leading NaNs produced by
    ``shift()`` do not affect the result; an empty series yields NaN for all four.
    """
    return [series.mean(), series.var(), series.max(), series.min()]


# One feature row (X) and one gesture label (y) per sequence.
X = []
y = []
for _, df_seq in train.groupby("sequence_id"):
    # All rows of the sequence are used. The previous version first selected only the
    # "Performs gesture" rows and then immediately overwrote that selection with the
    # whole sequence, so the whole-sequence behaviour is what actually ran.
    # NOTE(review): restore the `behavior == "Performs gesture"` filter here if that
    # was the intended input.
    target = df_seq[imu_cols]

    # Skip sequences that contribute no rows.
    if target.empty:
        continue

    # Rows recorded while the hand is at the target location.
    target_2 = df_seq.loc[df_seq["behavior"] == "Hand at target location", imu_cols]

    # Feature layout (order matters, it defines the column positions of X):
    #   per column : mean/var/max/min over `target`, then the same four over `target_2`
    #   then per column and per lag : mean/var/max/min of the lag-shifted column
    feats = []
    for col in imu_cols:
        feats.extend(_summary_stats(target[col]))
        feats.extend(_summary_stats(target_2[col]))
    for col in imu_cols:
        for lag in lag_periods:
            # Shift on the fly instead of writing "<col>_lag_<n>" columns into a slice of
            # df_seq: same values, but no chained-assignment write, no repeated column
            # insertion, and no risk of overwriting an existing column with that name.
            feats.extend(_summary_stats(target[col].shift(lag)))

    # Accumulate per sequence.
    X.append(feats)
    y.append(df_seq["gesture"].iloc[0])

In [9]:
# Stratified 5-fold splitter; shuffling is seeded so the fold assignment is repeatable.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)


In [10]:
# Wrap the accumulated feature rows and labels in DataFrames; no column names are
# given, so the label frame has a single column named 0.
X, y = pd.DataFrame(X), pd.DataFrame(y)

In [11]:
# The gesture label lists are defined once, in the cell right after the data is loaded.
# This cell used to repeat them verbatim; a second copy can silently drift from the
# first, so only a consistency check is kept here. A NameError on the next line means
# the earlier gesture-list cell has not been run on this kernel.
assert all_classes == target_gestures + non_target_gestures, (
    "all_classes is out of sync with target_gestures + non_target_gestures"
)

In [12]:
def binary_score(sol, sub):
    """Return half of the binary F1 for target-vs-non-target detection.

    Args:
        sol: iterable of true gesture labels.
        sub: iterable of predicted gesture labels.

    Returns:
        ``0.5 * F1`` where a label counts as positive (1) when it appears in the
        module-level ``target_gestures`` list and negative (0) otherwise.
    """
    def _as_flags(labels):
        # 1 for a target gesture, 0 for anything else.
        return [int(label in target_gestures) for label in labels]

    return 0.5 * f1_score(
        _as_flags(sol),
        _as_flags(sub),
        average='binary',
        pos_label=True,
        zero_division=0,
    )

def macro_score(sol, sub):
    """Return half of the macro F1 with all non-target labels merged into one class.

    Args:
        sol: iterable of true gesture labels.
        sub: iterable of predicted gesture labels.

    Returns:
        ``0.5 * macro-F1`` computed after every label that is not in the module-level
        ``target_gestures`` list has been replaced by ``'non_target'``.
    """
    def _collapse(labels):
        # Target labels pass through unchanged; everything else shares one bucket.
        return [label if label in target_gestures else 'non_target' for label in labels]

    # Macro-averaged F1 over the collapsed label set.
    return 0.5 * f1_score(
        _collapse(sol),
        _collapse(sub),
        zero_division=0,
        average='macro',
    )


In [13]:
# Out-of-fold predictions: one predicted gesture label per sequence, written fold by
# fold below. Object dtype because the predictions are label strings. (This frame used
# to be allocated as zeros and never filled.)
y_oof = pd.DataFrame({0: np.full(len(y), None, dtype=object)})

# 1-D view of the labels. `y` is a single-column DataFrame; passing it as-is hands the
# estimator a column vector, which scikit-learn-style label handling flattens with a
# DataConversionWarning (hidden here by the global warning filter).
labels = y.iloc[:, 0]

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, labels)):
    # NOTE(review): per the LightGBM parameter docs, `subsample` only takes effect when
    # `subsample_freq` > 0, and `is_unbalance` applies to binary / multiclassova
    # objectives -- as configured both look like no-ops; confirm before tuning them.
    model = lgb.LGBMClassifier(
        objective='multiclass',
        n_estimators=1000,
        learning_rate=0.08,
        max_depth=15,
        reg_alpha=0.8,
        # Same setting as the former `lambda_l2=4.0`, spelled with the scikit-learn
        # style name so it matches `reg_alpha` and does not shadow the wrapper's own
        # `reg_lambda` argument through an alias.
        reg_lambda=4.0,
        num_leaves=31,
        min_child_samples=32,
        colsample_bytree=0.85,
        subsample=0.5,
        subsample_freq=0,
        cat_smooth=20.0,
        is_unbalance=True,
        max_bin=127,
        verbose=-1,
        metric='multi_logloss'
    )

    # The validation fold is passed as eval_set so its multi_logloss is tracked during
    # training; no early-stopping callback is given, so all n_estimators rounds are run.
    model.fit(
        X.iloc[tr_idx], labels.iloc[tr_idx],
        eval_set=[(X.iloc[val_idx], labels.iloc[val_idx])],
        eval_metric='multi_logloss'
    )

    # Predict once per fold and reuse the result for both the OOF frame and the score.
    val_pred = model.predict(X.iloc[val_idx])
    y_oof.iloc[val_idx, 0] = val_pred

    # binary_score is not reported: when the training rows have been restricted to the
    # target gestures, every true and predicted label is a target, so it is constant.
    print(f"fold_{fold+1}のmacro_scoreは")
    print(macro_score(labels.iloc[val_idx], val_pred))

# Score over all out-of-fold predictions at once (every row belongs to exactly one
# validation fold, so y_oof is fully populated here).
print("OOF macro_score:")
print(macro_score(labels, y_oof[0]))


fold_1のmacro_scoreは
0.19277907199350605
fold_2のmacro_scoreは
0.20480242351659694
fold_3のmacro_scoreは
0.1928684865088954
fold_4のmacro_scoreは
0.1912927801161224
fold_5のmacro_scoreは
0.1874240101713282
