# 🚀 GTM Step 2: Temporal + Image + Google Trends

## 📚 특강 2단계: 이미지 정보 추가
- **사용 모달리티**: Temporal Features + **Image Features** + Google Trends
- **목적**: 시각적 정보를 활용한 매출 예측 성능 향상
- **학습 목표**: 
  - CNN을 통한 이미지 특성 추출
  - Multi-modal 특성 융합 (Temporal + Visual)
  - ResNet50을 사용한 전이학습

## 1. 📦 패키지 설치 및 import

In [None]:
# 패키지 설치
!pip install lightning --upgrade --quiet
!pip install transformers scikit-learn pillow --quiet

# Import
import math
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import lightning as L
import pandas as pd
import numpy as np
from tqdm import tqdm
from PIL import Image, ImageFile
from torch.utils.data import DataLoader, TensorDataset
from torchvision.transforms import Resize, ToTensor, Normalize, Compose
from torchvision import models  # ResNet50 사용
from sklearn.preprocessing import MinMaxScaler
from transformers import Adafactor
from pathlib import Path
import warnings
# Silence every Python warning for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Let PIL decode image files whose data is truncated instead of raising.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Mount Google Drive (only works inside Google Colab)
from google.colab import drive
drive.mount('/content/drive')

# Report library versions and whether a CUDA device is visible.
print(f"✅ PyTorch: {torch.__version__}")
print(f"✅ Lightning: {L.__version__}")
print(f"✅ CUDA 사용 가능: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"✅ GPU: {torch.cuda.get_device_name(0)}")

## 2. 🧠 모델 컴포넌트 정의
### 2단계에서는 이미지 인코더 추가

In [None]:
# 기본 모듈들 (1단계와 동일)
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor.

    The encoding table is precomputed once and stored as the non-trainable
    buffer ``pe`` with shape ``(max_len, 1, d_model)``.

    Args:
        d_model: Size of the last (feature) dimension of the inputs.
            Both even and odd sizes are supported.
        dropout: Dropout probability applied after the encoding is added.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, dropout=0.1, max_len=52):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine
        # columns, so trim the angle table to the number of odd slots.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])``.

        Args:
            x: Tensor whose first dimension is the sequence position and whose
                last dimension has size ``d_model``.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)}"
            )
        return self.dropout(x + self.pe[:seq_len])

class TimeDistributed(nn.Module):
    """Apply a module independently to every time step of a sequence tensor.

    Inputs with more than two dimensions are flattened to
    ``(-1, features)``, passed through ``module`` and reshaped back.
    Inputs with at most two dimensions are passed through unchanged.

    Args:
        module: The wrapped module.
        batch_first: Selects how the flattened output is reshaped (see
            ``forward``).
    """

    def __init__(self, module, batch_first=True):
        super().__init__()
        self.module = module
        self.batch_first = batch_first

    def forward(self, x):
        """Run the wrapped module on ``x`` and restore the leading dimensions."""
        if x.dim() <= 2:
            return self.module(x)

        flat_in = x.contiguous().view(-1, x.size(-1))
        flat_out = self.module(flat_in)
        out_features = flat_out.size(-1)

        if not self.batch_first:
            # Keep the second input dimension fixed and infer the first.
            return flat_out.view(-1, x.size(1), out_features)
        # Keep the first input dimension fixed and infer the second.
        return flat_out.contiguous().view(x.size(0), -1, out_features)

# Notebook progress marker: printed once the definitions in this cell have run.
print("✅ 기본 모듈 정의 완료")

In [None]:
# 2단계: Dummy + Image + GTrends 인코더
class DummyEmbedder(nn.Module):
    """Embed four scalar temporal features into one vector.

    Columns 0-3 of the input are fed to the day, week, month and year
    projections respectively. The four embeddings are concatenated, fused by
    a linear layer and passed through dropout.

    Args:
        embedding_dim: Size of each per-feature embedding and of the output.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.day_embedding = nn.Linear(1, embedding_dim)
        self.week_embedding = nn.Linear(1, embedding_dim)
        self.month_embedding = nn.Linear(1, embedding_dim)
        self.year_embedding = nn.Linear(1, embedding_dim)
        self.dummy_fusion = nn.Linear(4 * embedding_dim, embedding_dim)
        self.dropout = nn.Dropout(0.2)

    def forward(self, temporal_features):
        """Return the fused temporal embedding for ``temporal_features``.

        Args:
            temporal_features: Tensor indexed as ``[:, i]`` for ``i`` in 0..3.
        """
        projections = (
            self.day_embedding,
            self.week_embedding,
            self.month_embedding,
            self.year_embedding,
        )
        embedded = [
            proj(temporal_features[:, idx].unsqueeze(1))
            for idx, proj in enumerate(projections)
        ]
        fused = self.dummy_fusion(torch.cat(embedded, dim=1))
        return self.dropout(fused)

class ImageEmbedder(nn.Module):
    """Extract a spatial feature map from images with a pretrained ResNet50.

    The backbone is ResNet50 without its last two child modules (the global
    average pool and the classification layer), so ``forward`` returns the
    final convolutional feature map. All backbone parameters are trainable.
    """

    def __init__(self):
        super().__init__()
        # `pretrained=True` is deprecated in torchvision's multi-weight API.
        # The explicit enum selects the same ImageNet weights. Fall back to
        # the legacy flag only on torchvision builds that predate the enum.
        weights_enum = getattr(models, "ResNet50_Weights", None)
        if weights_enum is not None:
            resnet = models.resnet50(weights=weights_enum.IMAGENET1K_V1)
        else:
            resnet = models.resnet50(pretrained=True)

        # Drop the last two child modules and keep the convolutional trunk.
        self.resnet = nn.Sequential(*list(resnet.children())[:-2])

        # Fine-tune the whole backbone together with the rest of the model.
        for p in self.resnet.parameters():
            p.requires_grad = True

    def forward(self, images):
        """Return the backbone feature map as a contiguous tensor.

        Args:
            images: Image batch accepted by ResNet50.

        Returns:
            Feature map of shape ``[batch, 2048, H', W']``; for 256x256
            inputs this is ``[batch, 2048, 8, 8]``.
        """
        return self.resnet(images).contiguous()

class GTrendEmbedder(nn.Module):
    """Encode multivariate Google Trends series with a Transformer encoder.

    Args:
        forecast_horizon: Used with the sequence length to size the blocks of
            the encoder attention mask.
        embedding_dim: Model width of the encoder and of its output.
        use_mask: The block mask is applied only when this equals ``1``.
        trend_len: Maximum sequence length supported by the positional encoding.
        num_trends: Number of trend series per item (input feature size).
        gpu_num: Stored on the instance; not used by this class.
    """

    def __init__(self, forecast_horizon, embedding_dim, use_mask, trend_len, num_trends, gpu_num):
        super().__init__()
        self.forecast_horizon = forecast_horizon
        self.input_linear = TimeDistributed(nn.Linear(num_trends, embedding_dim))
        self.pos_embedding = PositionalEncoding(embedding_dim, max_len=trend_len)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=4, dropout=0.2)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
        self.use_mask = use_mask
        self.gpu_num = gpu_num

    def _generate_encoder_mask(self, size, forecast_horizon, device=None):
        """Build a block-diagonal additive attention mask.

        Positions are grouped into consecutive blocks of
        ``gcd(size, forecast_horizon)`` steps. Entries inside a block are
        ``0.0`` and all others are ``-inf``.

        Args:
            size: Sequence length; the mask is ``(size, size)``.
            forecast_horizon: Forecast length used to derive the block size.
            device: Optional device to allocate the mask on directly.
        """
        split = math.gcd(size, forecast_horizon)
        mask = torch.full((size, size), float('-inf'), device=device)
        for start in range(0, size, split):
            mask[start:start + split, start:start + split] = 0.0
        return mask

    def forward(self, gtrends):
        """Encode a batch of trend series.

        Args:
            gtrends: Tensor of shape ``(batch, num_trends, length)``.

        Returns:
            Encoded sequence of shape ``(length, batch, embedding_dim)``.
        """
        gtrend_emb = self.input_linear(gtrends.permute(0, 2, 1))
        gtrend_emb = self.pos_embedding(gtrend_emb.permute(1, 0, 2))
        if self.use_mask == 1:
            # Build the mask only when it is used, directly on the input's
            # device, instead of creating it on CPU for every call.
            input_mask = self._generate_encoder_mask(
                gtrend_emb.shape[0], self.forecast_horizon, device=gtrend_emb.device
            )
            return self.encoder(gtrend_emb, input_mask)
        return self.encoder(gtrend_emb)

class FusionNetwork(nn.Module):
    """Fuse the temporal embedding with a pooled image embedding (step 2).

    The image feature map is global-average-pooled, projected from 2048
    channels to ``embedding_dim``, concatenated after the temporal embedding
    and passed through a small MLP.

    Args:
        embedding_dim: Width of the temporal and projected image embeddings.
        hidden_dim: Width of the fused output.
        dropout: Dropout probability inside the fusion MLP.
    """

    def __init__(self, embedding_dim, hidden_dim, dropout=0.2):
        super().__init__()

        # Image branch: global average pooling, then a linear projection.
        self.img_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.img_linear = nn.Linear(2048, embedding_dim)

        # Fusion MLP over the concatenated [temporal, image] vector.
        fused_width = 2 * embedding_dim
        layers = [
            nn.BatchNorm1d(fused_width),
            nn.Linear(fused_width, fused_width, bias=False),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(fused_width, hidden_dim),
        ]
        self.feature_fusion = nn.Sequential(*layers)

    def forward(self, img_encoding, dummy_encoding):
        """Return the fused feature vector of shape ``[batch, hidden_dim]``.

        Args:
            img_encoding: Image feature map with 2048 channels,
                ``[batch, 2048, H, W]``.
            dummy_encoding: Temporal embedding, ``[batch, embedding_dim]``.
        """
        pooled = torch.flatten(self.img_pool(img_encoding), start_dim=1)
        image_vector = self.img_linear(pooled)
        joined = torch.cat((dummy_encoding, image_vector), dim=1)
        return self.feature_fusion(joined)

class TransformerDecoderLayer(nn.Module):
    """Post-norm Transformer decoder layer that also returns cross-attention weights.

    Args:
        d_model: Model width.
        nhead: Number of attention heads.
        dim_feedforward: Hidden width of the feed-forward block.
        dropout: Dropout probability used throughout the layer.
        activation: ``"relu"``, ``"gelu"`` or a callable applied inside the
            feed-forward block.

    Raises:
        ValueError: If ``activation`` is an unsupported string.
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"):
        super().__init__()

        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        # Honor the requested activation instead of always using ReLU.
        self.activation = self._resolve_activation(activation)

    @staticmethod
    def _resolve_activation(activation):
        """Map an activation name (or callable) to the function to apply."""
        if callable(activation):
            return activation
        if activation == "relu":
            return F.relu
        if activation == "gelu":
            return F.gelu
        raise ValueError(f"activation should be 'relu', 'gelu' or a callable, not {activation!r}")

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None,
            memory_key_padding_mask=None, tgt_is_causal=None, memory_is_causal=None):
        """Run self-attention, cross-attention and the feed-forward block.

        Args:
            tgt: Target sequence, sequence-first ``(T, batch, d_model)``.
            memory: Encoder output, sequence-first ``(S, batch, d_model)``.
            tgt_mask: Optional attention mask for self-attention.
            memory_mask: Optional attention mask for cross-attention.
            tgt_key_padding_mask: Optional key padding mask for ``tgt``.
            memory_key_padding_mask: Optional key padding mask for ``memory``.
            tgt_is_causal: Accepted but not used by this layer.
            memory_is_causal: Accepted but not used by this layer.

        Returns:
            Tuple ``(output, attn_weights)`` where ``attn_weights`` are the
            cross-attention weights returned by ``nn.MultiheadAttention``.
        """
        # Self-attention block
        tgt2 = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)

        # Cross-attention block
        tgt2, attn_weights = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask,
                                                  key_padding_mask=memory_key_padding_mask)
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)

        # Feed-forward block
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)

        return tgt, attn_weights

# Notebook progress marker: printed once the definitions in this cell have run.
print("✅ 2단계 인코더 (Dummy + Image + GTrends) 정의 완료")

## 3. 🎯 GTM Step 2 모델

In [None]:
class GTM_Step2(L.LightningModule):
    """Step 2 forecaster: temporal features + image features + Google Trends.

    The temporal and image embeddings are fused into one query vector that
    attends over the encoded Google Trends sequence through a single decoder
    layer. A linear head maps the result to the forecast.

    Args:
        embedding_dim: Width of the temporal and projected image embeddings.
        hidden_dim: Width of the fused features, trend encoder and decoder.
        output_dim: Number of forecast steps.
        num_heads: Attention heads of the decoder layer.
        num_layers: Stored as a hyperparameter; not used by this class.
        cat_dict, col_dict, fab_dict: Stored as hyperparameters; not used here.
        trend_len: Maximum Google Trends sequence length.
        num_trends: Number of trend series per item.
        gpu_num: Stored and forwarded to the trend encoder.
        use_encoder_mask: Passed to the trend encoder as ``use_mask``.
        autoregressive: Switches the head to one output and adds a positional
            encoder. NOTE(review): ``forward`` has no autoregressive branch.
    """

    # Factor applied to targets and forecasts before the reported MAE.
    # NOTE(review): presumably the dataset's sales normalisation constant — confirm.
    SALES_SCALE = 1065

    def __init__(self, embedding_dim, hidden_dim, output_dim, num_heads, num_layers,
                 cat_dict, col_dict, fab_dict, trend_len, num_trends, gpu_num, use_encoder_mask=1, autoregressive=False):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.output_len = output_dim
        self.use_encoder_mask = use_encoder_mask
        self.autoregressive = autoregressive
        self.gpu_num = gpu_num
        self.save_hyperparameters()

        # Per-epoch buffer of (targets, forecasts) filled by validation_step.
        self.validation_step_outputs = []

        # Step 2 encoders: temporal + image + Google Trends
        self.dummy_encoder = DummyEmbedder(embedding_dim)
        self.image_encoder = ImageEmbedder()
        self.gtrend_encoder = GTrendEmbedder(output_dim, hidden_dim, use_encoder_mask, trend_len, num_trends, gpu_num)

        # Fusion of temporal and image features
        self.feature_fusion = FusionNetwork(embedding_dim, hidden_dim)

        # Decoder
        self.decoder_layer = TransformerDecoderLayer(d_model=self.hidden_dim, nhead=num_heads,
                                                    dim_feedforward=self.hidden_dim * 4, dropout=0.1)

        if self.autoregressive:
            self.pos_encoder = PositionalEncoding(hidden_dim, max_len=12)

        self.decoder_fc = nn.Sequential(
            nn.Linear(hidden_dim, self.output_len if not self.autoregressive else 1),
            nn.Dropout(0.2)
        )

    def _generate_square_subsequent_mask(self, size):
        """Return a ``(size, size)`` causal mask: 0.0 on and below the diagonal, -inf above."""
        mask = (torch.triu(torch.ones(size, size)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def forward(self, category, color, fabric, temporal_features, gtrends, images):
        """Forecast sales for a batch.

        ``category``, ``color`` and ``fabric`` are accepted to match the batch
        layout but are not used by the step 2 model.

        Returns:
            Tuple ``(forecast, attn_weights)`` where ``forecast`` has shape
            ``(-1, output_len)`` and ``attn_weights`` are the decoder's
            cross-attention weights.
        """
        dummy_encoding = self.dummy_encoder(temporal_features)
        img_encoding = self.image_encoder(images)
        gtrend_encoding = self.gtrend_encoder(gtrends)

        # Multi-modal fusion (temporal + image)
        static_feature_fusion = self.feature_fusion(img_encoding, dummy_encoding)

        # The fused vector is a length-1 target sequence attending over the trends.
        tgt = static_feature_fusion.unsqueeze(0)
        memory = gtrend_encoding

        decoder_out, attn_weights = self.decoder_layer(tgt, memory)
        forecast = self.decoder_fc(decoder_out)

        return forecast.view(-1, self.output_len), attn_weights

    def configure_optimizers(self):
        """Use Adafactor with its internal relative-step schedule (no explicit lr)."""
        optimizer = Adafactor(self.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
        return optimizer

    def training_step(self, batch, batch_idx):
        """Compute and log the MSE between forecasts and targets."""
        item_sales, category, color, fabric, temporal_features, gtrends, images = batch

        # Parameter gradients do not need requires_grad on the input tensors,
        # so the inputs are used as delivered by the loader.
        forecasted_sales, _ = self(category, color, fabric, temporal_features, gtrends, images)
        # Both tensors are (batch, output_len). Avoiding squeeze() prevents a
        # shape mismatch (and broadcast) when a batch holds a single sample.
        loss = F.mse_loss(forecasted_sales, item_sales)

        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Forecast one validation batch and buffer it for the epoch-end metrics."""
        item_sales, category, color, fabric, temporal_features, gtrends, images = batch
        forecasted_sales, _ = self(category, color, fabric, temporal_features, gtrends, images)

        # Buffer unsqueezed tensors so batches of any size can be concatenated.
        self.validation_step_outputs.append((item_sales, forecasted_sales))

        return item_sales.squeeze(), forecasted_sales.squeeze()

    def _current_lr_text(self):
        """Format the first param group's lr, or 'n/a' when it is unavailable.

        Adafactor with ``relative_step=True`` keeps ``lr`` as ``None`` in its
        param groups, so the value cannot be formatted as a number.
        """
        optimizers = self.optimizers()
        if isinstance(optimizers, (list, tuple)):
            optimizers = optimizers[0] if optimizers else None
        lr = optimizers.param_groups[0]["lr"] if optimizers is not None else None
        return f'{lr:.2e}' if lr is not None else 'n/a'

    def on_validation_epoch_end(self):
        """Log validation MSE and rescaled MAE over the buffered batches."""
        if not self.validation_step_outputs:
            return

        item_sales = torch.cat([target for target, _ in self.validation_step_outputs])
        forecasted_sales = torch.cat([pred for _, pred in self.validation_step_outputs])
        self.validation_step_outputs.clear()

        loss = F.mse_loss(forecasted_sales, item_sales)
        mae = F.l1_loss(forecasted_sales * self.SALES_SCALE, item_sales * self.SALES_SCALE)

        self.log('val_mae', mae, prog_bar=True)
        self.log('val_loss', loss, prog_bar=True)

        print(f'Step 2 - Validation MAE: {mae.item():.2f}, LR: {self._current_lr_text()}')

# Notebook progress marker: printed once the definitions in this cell have run.
print("✅ GTM Step 2 모델 정의 완료 (Temporal + Image + GTrends)")

## 4. 📊 데이터셋 클래스 (1단계와 동일)

In [None]:
class ZeroShotDataset:
    """Build tensors and data loaders from the item table, images and Google Trends.

    Args:
        data_df: Item table. ``preprocess_data`` reads the columns
            ``category``, ``color``, ``fabric``, ``release_date``,
            ``image_path`` and drops ``external_code`` and ``season``.
        img_root: Directory that ``image_path`` values are relative to.
        gtrends: Date-indexed frame with one column per category/color/fabric.
        cat_dict, col_dict, fab_dict: Map label strings to integer ids.
        trend_len: Number of trend steps kept per series.
    """

    def __init__(self, data_df, img_root, gtrends, cat_dict, col_dict, fab_dict, trend_len):
        self.data_df = data_df
        self.gtrends = gtrends
        self.cat_dict = cat_dict
        self.col_dict = col_dict
        self.fab_dict = fab_dict
        self.trend_len = trend_len
        self.img_root = img_root

    def __len__(self):
        return len(self.data_df)

    def __getitem__(self, idx):
        """Return the raw table row at position ``idx``."""
        return self.data_df.iloc[idx, :]

    def _scaled_trend(self, window, key):
        """Return the min-max scaled trend series of column ``key``.

        Takes the last 52 rows of ``window``, keeps the first ``trend_len``
        of them and scales that series on its own.
        """
        series = window[key][-52:].values[:self.trend_len]
        return MinMaxScaler().fit_transform(series.reshape(-1, 1)).flatten()

    def preprocess_data(self):
        """Convert the whole table into an in-memory ``TensorDataset``.

        Returns:
            ``TensorDataset(item_sales, categories, colors, fabrics,
            temporal_features, gtrends, images)``.
        """
        data = self.data_df

        gtrends, image_features = [], []
        img_transforms = Compose([Resize((256, 256)), ToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

        for _, row in tqdm(data.iterrows(), total=len(data), ascii=True, desc="2단계 데이터 전처리"):
            start_date = row['release_date']

            # Slice the 52 weeks before the release date once and reuse the
            # window for the category, color and fabric series.
            window = self.gtrends.loc[start_date - pd.DateOffset(weeks=52):start_date]
            multitrends = np.vstack([
                self._scaled_trend(window, row['category']),
                self._scaled_trend(window, row['color']),
                self._scaled_trend(window, row['fabric']),
            ])
            gtrends.append(multitrends)

            # Close each image file as soon as it has been decoded so file
            # handles do not accumulate over the dataset.
            with Image.open(os.path.join(self.img_root, row['image_path'])) as img:
                image_features.append(img_transforms(img.convert('RGB')))

        gtrends = np.array(gtrends)

        data = data.drop(columns=['external_code', 'season', 'release_date', 'image_path'])

        # Positional column layout after the drop: columns 0-11 are the
        # targets and columns 13-16 are the temporal features.
        # NOTE(review): column 12 is skipped — confirm against the CSV schema.
        item_sales = torch.FloatTensor(data.iloc[:, :12].values)
        temporal_features = torch.FloatTensor(data.iloc[:, 13:17].values)

        categories = torch.LongTensor([self.cat_dict[val] for val in data['category'].values])
        colors = torch.LongTensor([self.col_dict[val] for val in data['color'].values])
        fabrics = torch.LongTensor([self.fab_dict[val] for val in data['fabric'].values])

        gtrends = torch.FloatTensor(gtrends)
        images = torch.stack(image_features)

        return TensorDataset(item_sales, categories, colors, fabrics, temporal_features, gtrends, images)

    def get_loader(self, batch_size, train=True):
        """Preprocess the data and wrap it in a ``DataLoader``.

        Args:
            batch_size: Batch size of the training loader.
            train: When true the loader shuffles and uses ``batch_size``.
                Otherwise it keeps the order and always uses a batch size
                of 1, ignoring ``batch_size``.
        """
        print('📊 2단계 데이터셋 생성 시작...')
        data_with_gtrends = self.preprocess_data()
        if train:
            data_loader = DataLoader(data_with_gtrends, batch_size=batch_size, shuffle=True, num_workers=2)
        else:
            data_loader = DataLoader(data_with_gtrends, batch_size=1, shuffle=False, num_workers=2)
        print('✅ 2단계 데이터셋 생성 완료')
        return data_loader

# Notebook progress marker: printed once the definitions in this cell have run.
print("✅ 데이터셋 클래스 정의 완료")

## 5. 🚀 2단계 실행 코드
### 이미지 정보까지 포함한 모델 훈련

In [None]:
# Dataset location on the mounted Google Drive
dataset_path = Path('/content/drive/MyDrive/GTM-dataset-small/')

# Load the train/test item tables and the Google Trends series
print("📊 데이터 로딩 중...")
train_df = pd.read_csv(dataset_path / 'train.csv', parse_dates=['release_date'])
test_df = pd.read_csv(dataset_path / 'test.csv', parse_dates=['release_date'])
# The first CSV column becomes the index and is parsed as dates.
gtrends = pd.read_csv(dataset_path / 'gtrends.csv', index_col=[0], parse_dates=True)

# Label dictionaries for category, color and fabric.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects,
# so these files must come from a trusted source.
cat_dict = torch.load(dataset_path / 'category_labels.pt', weights_only=False)
col_dict = torch.load(dataset_path / 'color_labels.pt', weights_only=False)
fab_dict = torch.load(dataset_path / 'fabric_labels.pt', weights_only=False)

# Report how many rows were loaded.
print(f"✅ 훈련 데이터: {len(train_df):,}개")
print(f"✅ 테스트 데이터: {len(test_df):,}개")
print(f"✅ Google Trends: {len(gtrends):,}개 시점")

In [None]:
# Build the datasets; both splits share the image folder, trends and label dictionaries
train_dataset = ZeroShotDataset(train_df, dataset_path / 'images', gtrends, cat_dict, col_dict, fab_dict, trend_len=52)
test_dataset = ZeroShotDataset(test_df, dataset_path / 'images', gtrends, cat_dict, col_dict, fab_dict, trend_len=52)

# Use a smaller batch when no GPU is available.
BATCH_SIZE = 8 if torch.cuda.is_available() else 4
# get_loader preprocesses the whole split (trends and images) before returning.
train_loader = train_dataset.get_loader(batch_size=BATCH_SIZE, train=True)
# With train=False the loader always uses a batch size of 1.
test_loader = test_dataset.get_loader(batch_size=1, train=False)

print(f"✅ 배치 크기: {BATCH_SIZE}")
print(f"✅ 훈련 배치 수: {len(train_loader)}")
print(f"✅ 테스트 배치 수: {len(test_loader)}")

In [None]:
# Create the step 2 model
print("🎯 GTM Step 2 모델 생성 중...")

# trend_len matches the value given to the datasets; num_trends=3 matches the
# three series (category, color, fabric) stacked per item.
model = GTM_Step2(
    embedding_dim=32,
    hidden_dim=64,
    output_dim=12,
    num_heads=4,
    num_layers=1,
    cat_dict=cat_dict,
    col_dict=col_dict,
    fab_dict=fab_dict,
    trend_len=52,
    num_trends=3,
    gpu_num=0,
    use_encoder_mask=1,
    autoregressive=False
)

print(f"✅ Step 2 모델 생성 완료!")
# Total parameter count, including the ResNet50 backbone.
print(f"📊 모델 파라미터: {sum(p.numel() for p in model.parameters()):,}")
print("\n🔍 사용 모달리티: Temporal Features + Image Features + Google Trends")
print("🆕 추가된 기능: ResNet50 기반 이미지 특성 추출")

In [None]:
# Trainer setup and training
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger

EPOCHS = 5
ACCELERATOR = 'gpu' if torch.cuda.is_available() else 'cpu'

# Keep the two checkpoints with the lowest logged 'val_mae'.
checkpoint_callback = ModelCheckpoint(
    dirpath='./checkpoints/',
    filename='gtm-step2-{epoch:02d}-{val_mae:.2f}',
    monitor='val_mae',
    mode='min',
    save_top_k=2
)

# Metrics are written as CSV under ./logs/gtm_step2.
csv_logger = CSVLogger(save_dir='./logs/', name='gtm_step2')

trainer = L.Trainer(
    devices=1,
    accelerator=ACCELERATOR,
    max_epochs=EPOCHS,
    logger=csv_logger,
    callbacks=[checkpoint_callback],
    enable_progress_bar=True,
    gradient_clip_val=1.0
)

print("🚀 GTM Step 2 훈련 시작!")
print("=" * 50)

# NOTE(review): the test loader is used as the validation set, so checkpoint
# selection on 'val_mae' sees test data.
try:
    trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=test_loader)
    print("\n🎉 Step 2 훈련 완료!")
    print(f"💾 최고 모델: {checkpoint_callback.best_model_path}")
    
except Exception as e:
    # Failures are reported with a traceback but not re-raised, so the
    # notebook continues with the following cells.
    print(f"\n❌ Step 2 훈련 실패: {e}")
    import traceback
    traceback.print_exc()

## 📋 2단계 요약

### ✅ 구현 완료
- **시간적 특성 임베딩**: 1단계와 동일 (날짜 정보)
- **🆕 이미지 특성 추출**: ResNet50으로 제품 이미지 분석
- **Multi-modal 융합**: 시간 + 시각적 정보 결합
- **Google Trends 인코딩**: 1단계와 동일
- **Cross-Attention**: 융합된 특성과 트렌드 데이터 간 관계 학습

### 🎯 학습 목표 달성
- CNN을 통한 이미지 특성 추출 이해
- Multi-modal 데이터 융합 기법 학습
- 전이학습 (Transfer Learning) 활용
- 시각적 정보가 매출 예측에 미치는 영향 분석

### 📈 성능 향상 포인트
- **1단계 대비**: 이미지 정보 추가로 예측 정확도 향상 기대
- **ResNet50 활용**: 사전 훈련된 모델로 강력한 시각적 특성 추출
- **Feature Fusion**: 다양한 모달리티 정보의 효과적 결합

### 🔜 다음 단계 예고
**Step 3**에서는 **텍스트 정보**까지 추가하여 완전한 Multi-modal 시스템을 완성합니다!