In [1]:
import torch
import torch.nn.functional as F
import pickle
import torch.nn as nn
import torch.optim as optim
import os
import pandas as pd
import io
import networkx as nx
import numpy as np

from torch_geometric.nn import SAGEConv, GATConv, GCNConv
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from torch_geometric.data import Data


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# GCN, GAT, GraphSAGE, Unet구조, skip-connection, residual Connections, 멀티-헤드 Attention 메커니즘
class m2_model(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super(m2_model, self).__init__()
        # Down-sampling Path 제거됨
        self.sage1 = SAGEConv(num_node_features, 64)
        self.sage2 = SAGEConv(64, 128)

        # Bottleneck with multi-head attention
        self.gat = GATConv(128, 128, heads=8, concat=True)

        # Up-sampling Path
        self.gcn1 = GCNConv(128 * 8, 64)  # Adjusted for concatenated multi-head attention output
        self.gcn2 = GCNConv(64, num_classes)

        # Residual Connections and Dimension Matching
        self.res1 = torch.nn.Linear(num_node_features, 64)
        self.res2 = torch.nn.Linear(64, 128 * 8)  # Adjust for concatenated multi-head attention output
        
    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch

        # Initial residual connections
        res_x = self.res1(x)

        # Contracting path without TopKPooling
        x = F.relu(self.sage1(x, edge_index)) + res_x
        x1 = x  # Skip connection

        x = F.relu(self.sage2(x, edge_index))

        # Bottleneck with GAT for attention mechanism
        x = F.relu(self.gat(x, edge_index))

        # Up-sampling path with GCN for refining features
        x = self.gcn1(x, edge_index)
        x = self.gcn2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [3]:
def load_dataset_and_splitter():
    with open('custom_dataset.pkl', 'rb') as f:
        custom_dataset = pickle.load(f)
    with open('data_splitter.pkl', 'rb') as f:
        data_splitter = pickle.load(f)
    return custom_dataset, data_splitter

In [4]:
# 저장된 객체를 불러옵니다.
custom_dataset, data_splitter = load_dataset_and_splitter()
print("Loaded dataset and splitter from saved files.")

Loaded dataset and splitter from saved files.


In [5]:
class CustomLoss(nn.Module):
    def __init__(self, base_loss_function=nn.CrossEntropyLoss()):
        super().__init__()
        self.base_loss_function = base_loss_function

    def forward(self, predictions, targets):
        # -1 레이블을 가진 타겟은 손실 계산에서 제외
        valid_indices = targets != -1
        if valid_indices.any():
            return self.base_loss_function(predictions[valid_indices], targets[valid_indices])
        else:
            return torch.tensor(0.0).to(predictions.device)  # 모든 타겟이 -1인 경우 0 반환

In [6]:
def accuracy(output, target):
    valid_indices = target != -1
    if valid_indices.any():
        preds = output[valid_indices].argmax(dim=1)
        correct = (preds == target[valid_indices]).float()
        acc = correct.sum() / len(correct)
        return acc
    else:
        return 0.0

# MAIN

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

model = m2_model(num_node_features=4, num_classes=41476).to(device)
custom_loss_function = CustomLoss()
learning_rate = 0.0001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = CustomLoss()

# 데이터 로더 생성 및 데이터 분할
train_loader, val_loader, test_loader = data_splitter.split_data()
epochs = 100


Using device: cuda




# TRAIN

In [8]:
# TensorBoard 요약 작성자 설정
writer = SummaryWriter('runs/experiment_name')

def save_checkpoint(state, filename="best_check/my_checkpoint_2rd.pth"):
        print("=> Saving checkpoint")
        torch.save(state, filename)

best_val_acc = 0.0  # 가장 높은 검증 정확도 저장을 위한 변수

for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        train_correct = 0
        train_total = 0
        progress_bar = tqdm(iter(train_loader), desc=f'Epoch {epoch+1}/{epochs}')
        for batch_idx, data in enumerate(progress_bar):
                data = data.to(device)
                outputs = model(data)
                loss = criterion(outputs, data.y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                train_acc = accuracy(outputs, data.y)  # 훈련 정확도 계산
                train_correct += (train_acc * data.y.size(0)).item()
                train_total += data.y.size(0)

                progress_bar.set_postfix(loss=running_loss/(batch_idx+1), train_acc=100. * train_correct / train_total)

        # 훈련 손실 및 정확도 로깅
        writer.add_scalar('training loss', running_loss / len(train_loader), epoch)
        writer.add_scalar('training accuracy', 100. * train_correct / train_total, epoch)

        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad(), tqdm(val_loader, desc='Validating') as progress_bar:
                for data in progress_bar:
                        data = data.to(device)
                        outputs = model(data)
                        loss = criterion(outputs, data.y)

                        val_loss += loss.item()
                        val_acc = accuracy(outputs, data.y)  # 검증 정확도 계산
                        val_correct += (val_acc * data.y.size(0)).item()
                        val_total += data.y.size(0)

                        progress_bar.set_postfix(val_loss=val_loss/len(val_loader), val_acc=100. * val_correct / val_total)

        # 검증 손실 및 정확도 로깅
        writer.add_scalar('validation loss', val_loss / len(val_loader), epoch)
        writer.add_scalar('validation accuracy', 100. * val_correct / val_total, epoch)

        # 체크포인트 저장 조건
        if 100. * val_correct / val_total > best_val_acc:
                best_val_acc = 100. * val_correct / val_total
                checkpoint_filename = f"best_check/checkpoint_epoch_{epoch+1}_4rd.pth"
                save_checkpoint({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': val_loss,
                'accuracy': best_val_acc,
                }, filename=checkpoint_filename)

writer.close()  # TensorBoard 작성자 닫기


Epoch 1/100:   0%|          | 0/2484 [00:00<?, ?it/s]

Epoch 1/100: 100%|██████████| 2484/2484 [00:28<00:00, 87.75it/s, loss=10.4, train_acc=0.403] 
Validating: 100%|██████████| 710/710 [00:04<00:00, 143.61it/s, val_acc=0.38, val_loss=10.5] 


=> Saving checkpoint


Epoch 2/100: 100%|██████████| 2484/2484 [00:25<00:00, 96.13it/s, loss=9.59, train_acc=0.352] 
Validating: 100%|██████████| 710/710 [00:04<00:00, 147.56it/s, val_acc=0.38, val_loss=11.4] 
Epoch 3/100: 100%|██████████| 2484/2484 [00:26<00:00, 95.47it/s, loss=8.89, train_acc=0.354] 
Validating: 100%|██████████| 710/710 [00:04<00:00, 151.09it/s, val_acc=0.17, val_loss=12.6] 
Epoch 4/100: 100%|██████████| 2484/2484 [00:25<00:00, 96.69it/s, loss=8.63, train_acc=0.313]
Validating: 100%|██████████| 710/710 [00:04<00:00, 145.46it/s, val_acc=0.0999, val_loss=13.7]
Epoch 5/100: 100%|██████████| 2484/2484 [00:25<00:00, 97.35it/s, loss=8.44, train_acc=0.293] 
Validating: 100%|██████████| 710/710 [00:04<00:00, 144.33it/s, val_acc=0.114, val_loss=15]  
Epoch 6/100:  28%|██▊       | 687/2484 [00:07<00:19, 93.86it/s, loss=8.29, train_acc=0.347] 


KeyboardInterrupt: 

# 테스트

In [9]:
class ExtendedLabelEncoder1:
    def __init__(self, base_encoder):
        # base_encoder는 LabelEncoder의 인스턴스입니다.
        self.base_encoder = base_encoder
    
    def fit(self, y):
        self.base_encoder.fit(y)
        return self

    def transform(self, y, unknown_label=-1):
        new_y = []
        for item in y:
            if item in self.base_encoder.classes_:
                # 기존 base_encoder의 transform 메서드를 사용하여 변환합니다.
                encoded = self.base_encoder.transform([item])[0]
                new_y.append(encoded)
            else:
                # 새로운 항목은 unknown_label 값으로 처리
                new_y.append(unknown_label)
        return np.array(new_y)
    
class ExtendedLabelEncoder2:
    def __init__(self, base_encoder):
        self.base_encoder = base_encoder  # 기존 LabelEncoder 인스턴스
        self.unknown_label_start = -1
        self.unknown_labels_dict = {}
    
    def fit(self, y):
        self.base_encoder.fit(y)
        return self

    def transform(self, y, unknown_label_start=-1):
        new_y = []
        for item in y:
            try:
                # base_encoder의 classes_를 참조하여 transform을 시도합니다.
                if item in self.base_encoder.classes_:
                    encoded = self.base_encoder.transform([item])[0]
                    new_y.append(encoded)
                else:
                    raise ValueError
            except ValueError:
                # 새로운 아이템 처리
                if item not in self.unknown_labels_dict:
                    self.unknown_labels_dict[item] = self.unknown_label_start
                    self.unknown_label_start -= 1
                new_y.append(self.unknown_labels_dict[item])
        return np.array(new_y)

In [10]:
# pickle 파일로부터 CustomDataset 객체 로드
with open('custom_dataset.pkl', 'rb') as f:
    loaded_custom_dataset = pickle.load(f)

In [16]:
class GraphDataU:
    def __init__(self, df):
        self.df = df
        self.user_encoder = ExtendedLabelEncoder1(loaded_custom_dataset.user_encoder)
        self.item_encoder = ExtendedLabelEncoder2(loaded_custom_dataset.item_encoder)
        self.gender_encoder = ExtendedLabelEncoder1(loaded_custom_dataset.gender_encoder)
        self.prepare_data()
        self.graphs = []
        self.create_individual_graphs()
        self.pyg_graphs = []
        self.create_pyg_list()

    def prepare_data(self):
        self.df['user_index'] = self.user_encoder.transform(self.df['TRAVEL_ID'],unknown_label=-1)
        self.df['item_index'] = self.item_encoder.transform(self.df['VISIT_AREA_NM'])
        self.df['GENDER_index'] = self.gender_encoder.transform(self.df['GENDER'],unknown_label=-1)

    def create_individual_graphs(self):
        for _, group in self.df.groupby('TRAVEL_ID'):
            G = nx.Graph()
            user_index = group['user_index'].iloc[0]
            user_attributes = group.iloc[0][['GENDER_index', 'AGE_GRP', 'FAMILY_MEMB', 'TRAVEL_COMPANIONS_NUM']].to_dict()
            G.add_node(user_index, **user_attributes, type='user')
            for _, row in group.iterrows():
                item_index = row['item_index']
                G.add_node(item_index, type='item', name=row['VISIT_AREA_NM'])
                edge_attributes = row[['RESIDENCE_TIME_MIN', 'DGSTFN', 'REVISIT_INTENTION', 'RCMDTN_INTENTION']].to_dict()
                G.add_edge(user_index, item_index, **edge_attributes)
            self.graphs.append(G)

    def graph_to_pygdata(self, G):
        node_features, node_labels, edge_index, edge_attr = [], [], [], []
        node_index_mapping = {node: i for i, (node, _) in enumerate(G.nodes(data=True))}
        for node, attr in G.nodes(data=True):
            if 'type' in attr and attr['type'] == 'user':
                node_features.append([attr['GENDER_index'], attr['AGE_GRP'], attr['FAMILY_MEMB'], attr['TRAVEL_COMPANIONS_NUM']])
                node_labels.append(-1)
            else:
                node_features.append([0, 0, 0, 0])
                node_labels.append(self.item_encoder.transform([attr['name']])[0])
        for source, target, attr in G.edges(data=True):
            edge_index.append([node_index_mapping[source], node_index_mapping[target]])
            edge_attr.append([attr['RESIDENCE_TIME_MIN'], attr['DGSTFN'], attr['REVISIT_INTENTION'], attr['RCMDTN_INTENTION']])
        data = Data(x=torch.tensor(node_features, dtype=torch.float),
                    edge_index=torch.tensor(edge_index, dtype=torch.long).t().contiguous(),
                    edge_attr=torch.tensor(edge_attr, dtype=torch.float),
                    y=torch.tensor(node_labels, dtype=torch.long))
        return data

    def create_pyg_list(self):
        for G in self.graphs:
            self.pyg_graphs.append(self.graph_to_pygdata(G))


In [23]:
# 웹에서 받은 데이터 예제
raw_data = """TRAVEL_ID,VISIT_AREA_NM,RESIDENCE_TIME_MIN,DGSTFN,REVISIT_INTENTION,RCMDTN_INTENTION,GENDER,AGE_GRP,FAMILY_MEMB,TRAVEL_COMPANIONS_NUM
d_d000249,디앤디파트먼트 제주,60.0,5.0,5.0,5.0,남,30,1,3
d_d000249,제주동문시장,30.0,5.0,5.0,5.0,남,30,1,3
"""
df1 = pd.read_csv(io.StringIO(raw_data))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = r"best_check\checkpoint_epoch_1_3rd.pth"  # 경로 구분자 수정

# 체크포인트 로드 및 모델 상태 사전 추출
checkpoint = torch.load(model_path, map_location=device)
model = m2_model(num_node_features=4, num_classes=41476).to(device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

graph_data = GraphDataU(df1)


# loaded_custom_dataset에서 GraphData 속성을 추출
print("변환완료")
recommendations = []

data = graph_data.pyg_graphs[0].to(device)
with torch.no_grad():
    print("모델 돌아가는 중")
    output = model(data)
    print("결과")

predicted_item_index = output.argmax(dim=1).cpu().numpy()
predicted_item_names = loaded_custom_dataset.item_encoder.inverse_transform(predicted_item_index)

# 모든 추천 항목 출력 (중복 포함)
print(f"All recommended items for the user: {predicted_item_names}")


변환완료
모델 돌아가는 중
결과
All recommended items for the user: ['서귀포매일올레시장' '서귀포매일올레시장' '서귀포매일올레시장']
