In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# PyTorch Forecasting 관련 라이브러리
import torch
from pytorch_forecasting import TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, MAE
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

In [2]:
# 데이터 로드
print("데이터 로딩 중...")
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submission_df = pd.read_csv('sample_submission.csv')

# 날짜 컬럼 datetime 형태로 변환
train_df['일시'] = pd.to_datetime(train_df['일시'])
test_df['일시'] = pd.to_datetime(test_df['일시'])

# 건물번호 str타입으로 변환
train_df['건물번호'] = train_df['건물번호'].astype(str)
test_df['건물번호'] = test_df['건물번호'].astype(str)

데이터 로딩 중...


In [3]:
# 시간 관련 특성 생성
def create_time_features(df):
    df['hour'] = df['일시'].dt.hour
    df['day'] = df['일시'].dt.day
    df['month'] = df['일시'].dt.month
    df['weekday'] = df['일시'].dt.weekday
    df['day_of_year'] = df['일시'].dt.dayofyear
    
    # 주말 여부
    df['is_weekend'] = (df['weekday'] >= 5).astype(str)
    
    # 시간대별 구분 (새벽, 아침, 낮, 저녁, 밤)
    df['time_segment'] = pd.cut(df['hour'], 
                                bins=[-1, 6, 9, 12, 18, 21, 24],
                                labels=['dawn', 'morning', 'noon', 'afternoon', 'evening', 'night'])
    
    return df

# 특성 생성
print("특성 생성 중...")
train_df = create_time_features(train_df)
test_df = create_time_features(test_df)

# 건물별로 그룹화하여 시계열 순서 확인
train_df = train_df.sort_values(['건물번호', '일시']).reset_index(drop=True)

# TFT를 위한 시계열 인덱스 생성
train_df['time_idx'] = train_df.groupby('건물번호').cumcount()

# 전력사용량 로그 변환 (분포 정규화)
train_df['log_전력사용량'] = np.log1p(train_df['전력소비량(kWh)'])

# 학습/검증 데이터 분할
# 30일 인코더 + 7일 예측 = 37일
validation_cutoff = train_df['일시'].max() - timedelta(days=37)
training = train_df[train_df['일시'] <= validation_cutoff]
validation = train_df[train_df['일시'] > validation_cutoff]

print(f"학습 데이터: {len(training)} 샘플")
print(f"검증 데이터: {len(validation)} 샘플")

특성 생성 중...
학습 데이터: 115200 샘플
검증 데이터: 88800 샘플


In [4]:
# TimeSeriesDataSet 생성
max_prediction_length = 24 * 7  # 7일 예측
max_encoder_length = 24 * 30   # 30일 과거 데이터 사용

# 특성 설정
time_varying_known_reals = ['hour', 'day', 'month', 'weekday', 'day_of_year',
                           '기온(°C)', '풍속(m/s)', '습도(%)', '강수량(mm)', 
                           '일조(hr)', '일사(MJ/m2)']

time_varying_unknown_reals = ['log_전력사용량']

static_categoricals = ['건물번호']

time_varying_known_categoricals = ['is_weekend', 'time_segment']

In [5]:
# Training dataset
training_dataset = TimeSeriesDataSet(
    training,
    time_idx='time_idx',
    target='log_전력사용량',
    group_ids=['건물번호'],
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=static_categoricals,
    time_varying_known_categoricals=time_varying_known_categoricals,
    time_varying_known_reals=time_varying_known_reals,
    time_varying_unknown_reals=time_varying_unknown_reals,
    target_normalizer=GroupNormalizer(
        groups=['건물번호'],
        transformation="softplus"
    ),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

# Validation dataset
validation_dataset = TimeSeriesDataSet.from_dataset(
    training_dataset, 
    validation, 
    predict=True, 
    stop_randomization=True
)

In [6]:
# DataLoader 생성
batch_size = 1024
train_dataloader = training_dataset.to_dataloader(
    train=True, 
    batch_size=batch_size, 
    num_workers=8
)
val_dataloader = validation_dataset.to_dataloader(
    train=False, 
    batch_size=batch_size * 2, 
    num_workers=4
)

In [8]:
# TFT 모델 생성
print("TFT 모델 생성 중...")
tft = TemporalFusionTransformer.from_dataset(
    training_dataset,
    learning_rate=0.03,
    hidden_size=64,
    attention_head_size=4,
    dropout=0.1,
    hidden_continuous_size=32,
    loss=SMAPE(),
    log_interval=10,
    reduce_on_plateau_patience=4,
)

# 학습 설정
early_stop_callback = EarlyStopping(
    monitor="val_loss",
    min_delta=1e-4,
    patience=10,
    verbose=True,
    mode="min"
)

checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    filename="tft-{epoch:02d}-{val_loss:.4f}",
    save_top_k=3,
    mode="min"
)

trainer = Trainer(
    max_epochs=50,
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    devices=1,
    gradient_clip_val=0.5,
    callbacks=[early_stop_callback, checkpoint_callback],
    logger=TensorBoardLogger("lightning_logs"),
    precision="bf16"
)

Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


TFT 모델 생성 중...


In [10]:
# 모델 학습
print("모델 학습 시작...")
trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)



You are using a CUDA device ('NVIDIA A40') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


모델 학습 시작...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | SMAPE                           | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 2.1 K 
3  | prescalers                         | ModuleDict                      | 1.0 K 
4  | static_variable_selection          | VariableSelectionNetwork        | 20.9 K
5  | encoder_variable_selection         | VariableSelectionNetwork        | 95.8 K
6  | decoder_variable_selection         | VariableSelectionNetwork        | 88.1 K
7  | static_context_variable_selection  | GatedResidualNetwork            | 16.8 K
8  | static_context_initial_hidden_lstm | GatedResidualNetwork            | 16.8 K
9  | static_context_initial_cell_lstm 

Sanity Checking: 0it [00:00, ?it/s]

RuntimeError: "upsample_linear1d_out_frame" not implemented for 'BFloat16'

In [None]:
# 최적 모델 로드
best_model_path = checkpoint_callback.best_model_path
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

In [None]:
# 예측을 위한 전체 데이터셋 준비
print("예측 준비 중...")

# test 데이터에 대한 전처리
test_df = create_time_features(test_df)

# train과 test 결합하여 전체 시계열 데이터 생성
all_data = pd.concat([train_df, test_df], ignore_index=True)
all_data = all_data.sort_values(['건물번호', '일시']).reset_index(drop=True)
all_data['time_idx'] = all_data.groupby('건물번호').cumcount()

# test 기간 동안의 예측 수행
predictions = []

for building_num in all_data['건물번호'].unique():
    building_data = all_data[all_data['건물번호'] == building_num].copy()
    
    # 예측 시작 시점 찾기
    test_start_idx = building_data[building_data['일시'] >= test_df['일시'].min()].index[0]
    
    # 예측 수행
    for i in range(0, len(test_df[test_df['건물번호'] == building_num]), max_prediction_length):
        # 현재까지의 데이터로 dataset 생성
        current_data = building_data.iloc[:test_start_idx + i].copy()
        
        if len(current_data) < max_encoder_length:
            continue
            
        # 예측용 dataset 생성
        prediction_dataset = TimeSeriesDataSet.from_dataset(
            training_dataset,
            current_data,
            predict=True,
            stop_randomization=True
        )
        
        # 예측
        prediction_dataloader = prediction_dataset.to_dataloader(
            train=False, 
            batch_size=1, 
            num_workers=0
        )
        
        raw_predictions = best_tft.predict(
            prediction_dataloader,
            mode="raw",
            return_x=True
        )
        
        # 예측값 추출
        for j, pred in enumerate(raw_predictions.output):
            if j < len(test_df[test_df['건물번호'] == building_num]) - i:
                predictions.append({
                    '건물번호': building_num,
                    '일시': test_df[(test_df['건물번호'] == building_num)].iloc[i + j]['일시'],
                    'prediction': pred[0].item()
                })

In [None]:
# 예측 결과 정리
print("예측 결과 정리 중...")
predictions_df = pd.DataFrame(predictions)

# 로그 역변환
predictions_df['전력소비량(kWh)'] = np.expm1(predictions_df['prediction'])

# submission 형식에 맞게 정리
final_predictions = predictions_df[['건물번호', '일시', '전력소비량(kWh)']]
final_predictions['일시'] = final_predictions['일시'].dt.strftime('%Y-%m-%d %H')

# sample_submission과 동일한 순서로 정렬
submission_df = submission_df.merge(
    final_predictions, 
    on=['건물번호', '일시'], 
    how='left'
)

# 누락된 값 처리 (건물별 평균값으로 대체)
for building in submission_df['건물번호'].unique():
    building_mask = submission_df['건물번호'] == building
    if submission_df.loc[building_mask, '전력소비량(kWh)'].isna().any():
        mean_value = train_df[train_df['건물번호'] == building]['전력소비량(kWh)'].mean()
        submission_df.loc[building_mask, '전력소비량(kWh)'] = submission_df.loc[building_mask, '전력소비량(kWh)'].fillna(mean_value)

In [None]:
# 제출 파일 저장
submission_df[['num_date_time', '건물번호', '일시', '전력소비량(kWh)']].to_csv('submission_tft.csv', index=False)
print("제출 파일 저장 완료: submission_tft.csv")

# 예측 결과 시각화
plt.figure(figsize=(15, 8))
sample_building = submission_df['건물번호'].unique()[0]
sample_data = submission_df[submission_df['건물번호'] == sample_building]

plt.plot(range(len(sample_data)), sample_data['전력소비량(kWh)'], label=f'Building {sample_building} Predictions')
plt.xlabel('Time Index')
plt.ylabel('전력소비량(kWh)')
plt.title('TFT Model Predictions Sample')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('tft_predictions_sample.png')
plt.show()

print("모델 학습 및 예측 완료!")