In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from typing import List, Dict, Union

In [3]:
def prepare_data(df: pd.DataFrame, window_size: int = 10080):
    # P로 시작하는 컬럼 추출
    feature_columns = df.filter(regex='^P\\d+$').columns
    
    # 윈도우 기반 특성 생성
    X = []
    y = []
    
    for i in range(len(df) - window_size):
        window = df[feature_columns].iloc[i:i + window_size]
        # 통계적 특성 추출
        features = pd.concat([
            window.mean(),
            window.std(),
            window.min(),
            window.max(),
            window.kurt(),
            window.skew()
        ])
        X.append(features)
        y.append(df['anomaly'].iloc[i + window_size])
    
    return np.array(X), np.array(y)

In [4]:
def train_xgboost(X_train, y_train):
    model = XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        min_child_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    
    model.fit(
        X_train, 
        y_train,
        eval_metric=['auc', 'logloss'],
        verbose=True
    )
    
    return model

In [5]:
def detect_anomalies(model, X_test, threshold=0.5):
    # 확률값 예측
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    # 임계값 기반 이상 탐지
    anomalies = (y_pred_proba > threshold).astype(int)
    return anomalies, y_pred_proba

In [7]:
class TimeSeriesDataset(Dataset):
    def __init__(self, df: pd.DataFrame, stride: int = 1, inference: bool = False) -> None:
        self.inference = inference
        # P로 시작하는 컬럼만 추출하고 정렬
        self.column_names = sorted(df.filter(regex='^P\\d+$').columns.tolist(),
                                 key=lambda x: int(x.replace('P', '')))
        self.file_ids = df['file_id'].values if 'file_id' in df.columns else None
        
        if inference:
            self.values = df[self.column_names].values.astype(np.float32)
            self._prepare_inference_data()
        else:
            self._prepare_training_data(df, stride)

NameError: name 'Dataset' is not defined

In [6]:
# 데이터 로드
df_A = pd.read_csv("./train/TRAIN_A.csv")
df_B = pd.read_csv("./train/TRAIN_B.csv")

# 데이터 준비
X_A, y_A = prepare_data(df_A)
X_B, y_B = prepare_data(df_B)

# 데이터 결합
X = np.concatenate([X_A, X_B])
y = np.concatenate([y_A, y_B])

# 학습/테스트 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 모델 학습
model = train_xgboost(X_train, y_train)

# 이상 탐지 수행
anomalies, probabilities = detect_anomalies(model, X_test)

# 결과 저장
results = pd.DataFrame({
    'anomaly': anomalies,
    'probability': probabilities
})

results.to_csv('xgboost_predictions.csv', index=False)

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 156 and the array at index 1 has size 60

In [None]:
def inference_test_files(model, test_directory):
    test_files = [f for f in os.listdir(test_directory) if f.endswith('.csv')]
    all_predictions = []
    
    for file in test_files:
        df = pd.read_csv(f"{test_directory}/{file}")
        X_test, _ = prepare_data(df)
        anomalies, _ = detect_anomalies(model, X_test)
        
        predictions = {
            'ID': file.replace('.csv', ''),
            'flag_list': anomalies.tolist()
        }
        all_predictions.append(predictions)
    
    return pd.DataFrame(all_predictions)