In [1]:
import pandas as pd
import numpy as np
import torch 
from sklearn.preprocessing import LabelEncoder
from torch.nn import Embedding
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
dataset = pd.read_csv("./data/prepro_train_data.csv")

# Make Graph Component

In [4]:
age_tensor = torch.tensor(dataset['Age'].values, dtype=torch.float32).unsqueeze(1)

### Location
# LabelEncoder를 사용해 위치 정보를 정수로 변환
le = LabelEncoder()
dataset['Location_encoded'] = le.fit_transform(dataset['Location'])

# 임베딩 레이어 초기화
embedding_layer = Embedding(num_embeddings=151, embedding_dim=79)

# 위치 정보를 10차원 벡터로 변환
location_embeddings = embedding_layer(torch.unsqueeze(torch.tensor(dataset['Location_encoded'].values, dtype=torch.long), dim=1))

# 결과를 NumPy array로 변환
location_embeddings = location_embeddings.detach().numpy().squeeze()


In [5]:
import fasttext
# FastText 모델 로드
fasttext_model = fasttext.load_model("cc.en.300.bin")

def get_title_embedding_fasttext(title):
    if not isinstance(title, str):
        title = ""
    words = title.split()
    if len(words) == 0:
        return np.zeros(300)
    embeddings = [fasttext_model.get_word_vector(word) for word in words]
    return np.mean(embeddings, axis=0)
title_embeddings = dataset['Book-Title'].apply(get_title_embedding_fasttext).tolist()



In [6]:
from sklearn.decomposition import PCA

# PCA 객체를 생성
pca = PCA(n_components=50)

# 평균 임베딩 벡터로 구성된 리스트를 NumPy 배열로 변환
title_embeddings_array = np.array(title_embeddings)

# 차원 축소
reduced_title_embeddings = pca.fit_transform(title_embeddings_array)

In [7]:
### Publisher
# LabelEncoder를 사용해 위치 정보를 정수로 변환
le = LabelEncoder()
dataset['Publisher_encoded'] = le.fit_transform(dataset['Publisher'])

# 임베딩 레이어 초기화
embedding_layer = Embedding(num_embeddings=3689, embedding_dim=30)

# 위치 정보를 10차원 벡터로 변환
publisher_embeddings = embedding_layer(torch.unsqueeze(torch.tensor(dataset['Publisher_encoded'].values, dtype=torch.long), dim=1))

# 결과를 NumPy array로 변환
publisher_embeddings = publisher_embeddings.detach().numpy().squeeze()

### User-ID & Book-ID

In [8]:
### User-ID & Book-ID

# User-ID 열의 unique한 값들을 리스트로 만들기  > 83256
unique_user_ids = dataset['User-ID'].unique().tolist()
# Book-ID 열의 unique한 값들을 리스트로 만들기 > 243441
unique_book_ids = dataset['Book-ID'].unique().tolist()

# unique_user_ids 리스트를 기반으로 DataFrame 생성
unique_user_ids_df = pd.DataFrame(unique_user_ids, columns=['User-ID'])
# 내림차순 정렬
sorted_unique_user_ids_df = unique_user_ids_df.sort_values(by='User-ID', ascending=True)
# # 인덱스를 새 column으로 추가
sorted_unique_user_ids_df.reset_index(inplace=True, drop=False)
sorted_unique_user_ids_df.rename(columns={'index': 'UserNodeID'}, inplace=True)

# unique_book_ids 리스트를 기반으로 DataFrame 생성
unique_book_ids_df = pd.DataFrame(unique_book_ids, columns=['Book-ID'])
# 내림차순 정렬
sorted_unique_book_ids_df = unique_book_ids_df.sort_values(by='Book-ID', ascending=True)
# #인덱스를 새 column으로 추가
sorted_unique_book_ids_df.reset_index(inplace=True, drop=True)
sorted_unique_book_ids_df.reset_index(inplace=True, drop=False)
sorted_unique_book_ids_df.rename(columns={'index': 'BookNodeID'}, inplace=True)

# 인덱스 시작 번호를 83256으로 설정
sorted_unique_book_ids_df['BookNodeID'] += 83256

# UserNodeID  0 ~ 83255
# BookNodeID   83256 ~ 326696

In [9]:
def make_ID_dict(df):
    ID_dict = {}
    for index, row in df.iterrows():
        ID_dict[row[1]] = row[0]
    return ID_dict

UserNodeID_dict = make_ID_dict(sorted_unique_user_ids_df)
BookNodeID_dict = make_ID_dict(sorted_unique_book_ids_df)

####### 매핑 진행
dataset['User-ID'] = dataset['User-ID'].map(UserNodeID_dict)
dataset['Book-ID'] = dataset['Book-ID'].map(BookNodeID_dict)

## node feature


In [10]:
num_user_nodes = len(dataset['User-ID'].unique().tolist())
num_book_nodes = len(dataset['Book-ID'].unique().tolist())

user_ids = dataset['User-ID'].unique().tolist()  # 유니크 처리함
book_ids = dataset['Book-ID'].unique().tolist()

In [11]:
num_users = max(user_ids) + 1
num_books = max(book_ids) - num_user_nodes + 1

feature_dim = 80  # age (1) + location (29)  # title (20) + publisher (10)

user_features = np.zeros((num_user_nodes, feature_dim))
book_features = np.zeros((num_book_nodes, feature_dim))

In [12]:
from sklearn.preprocessing import StandardScaler

# user_id와 book_id는 이미 정수로 매핑되어 있다고 가정
scaler = StandardScaler()

# 스케일링을 개별적으로 수행
age_tensor_scaled = scaler.fit_transform(age_tensor.reshape(-1, 1))
location_embeddings_scaled = scaler.fit_transform(location_embeddings)

for user_id, age, location in zip(user_ids, age_tensor_scaled, location_embeddings_scaled):
    user_features[user_id] = np.concatenate([age, location], axis=0)

title_embeddings_scaled = scaler.fit_transform(reduced_title_embeddings)
publisher_embeddings_scaled = scaler.fit_transform(publisher_embeddings)

for book_id, title, publisher in zip(book_ids, title_embeddings_scaled, publisher_embeddings_scaled):
    book_features[book_id - num_user_nodes] = np.concatenate([title, publisher], axis=0)
    
# 사용자 노드 행렬과 도서 노드 행렬을 세로로 연결하여 최종 node_features 행렬 생성:    
node_features = np.vstack((user_features, book_features))

In [13]:
from torch_geometric.data import Data

node_feature_matrix = torch.tensor(node_features, dtype=torch.float)
edge_attr = torch.tensor(dataset['Book-Rating'].values, dtype=torch.float).unsqueeze(-1)
edge_index = torch.tensor(dataset[['User-ID', 'Book-ID']].values, dtype=torch.long).t().contiguous()

data = Data(x=node_feature_matrix,
            edge_index=edge_index,
            edge_attr=edge_attr)

In [14]:
data.to(device)

Data(x=[326697, 80], edge_index=[2, 871393], edge_attr=[871393, 1])

In [15]:
train_user_ids = np.unique(dataset['User-ID'].values)
train_book_ids = np.unique(dataset['Book-ID'].values)
train_node_ids = np.concatenate((train_user_ids, train_book_ids))
train_idx = torch.tensor(train_node_ids, dtype=torch.long)

# GraphSAGE

In [16]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

class CustomSAGEConv(MessagePassing):
    def __init__(self, in_channels, out_channels):
        super(CustomSAGEConv, self).__init__(aggr='mean')
        self.lin = torch.nn.Linear(in_channels, out_channels)

    def forward(self, x, edge_index, edge_attr):
        edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))
        edge_attr = torch.cat([edge_attr, torch.zeros(x.size(0), 1, device=edge_attr.device)], dim=0)  # Add zeros for self-loops
        return self.propagate(edge_index, x=x, edge_attr=edge_attr)

    def message(self, x_j, edge_attr):
        return x_j * edge_attr.view(-1, 1)

    def update(self, aggr_out):
        return self.lin(aggr_out)

In [17]:
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
        super(GraphSAGE, self).__init__()

        self.num_layers = num_layers

        self.convs = torch.nn.ModuleList()
        self.convs.append(CustomSAGEConv(in_channels, hidden_channels))
        for _ in range(num_layers - 2):
            self.convs.append(CustomSAGEConv(hidden_channels, hidden_channels))
        self.convs.append(CustomSAGEConv(hidden_channels, out_channels))

    def forward(self, x, adjs, edge_attrs, pos_pair=None, neg_pair=None):
        for i, (edge_index, edge_attr, size) in enumerate(zip(adjs, edge_attrs)):
            x_target = x[:size[1]]  # Target nodes are always placed first.
            x = self.convs[i](x, edge_index, edge_attr)
            if i != self.num_layers - 1:
                x = x.relu()
                x = F.dropout(x, p=0.5, training=self.training)

        if pos_pair is not None and neg_pair is not None:
            pos_out = x[pos_pair[0]], x[pos_pair[1]]
            neg_out = x[neg_pair[0]], x[neg_pair[1]]
            return pos_out, neg_out

        return x

In [18]:
import torch
from torch.nn import functional as F

def nce_loss(pos_out, neg_out, neg_sample_size):
    pos_out = pos_out.view(-1).unsqueeze(0)
    neg_out = neg_out.view(-1).unsqueeze(0)
    
    out = torch.cat((pos_out, neg_out), 1)
    out = F.log_softmax(out * 100, dim=1)
    return -out[0][0]

In [25]:
def train(data, model, optimizer, neg_sample_size, device, epochs):
    model = model.to(device)
    data = data.to(device)

    for epoch in range(epochs):
        model.train()

        total_nodes = data.x.size(0)

        # Positive pair
        pos_pair = data.edge_index

        # Negative sampling
        num_neg_samples = pos_pair.shape[1]
        neg_nodes = torch.randint(0, total_nodes, (num_neg_samples * 2,), dtype=torch.long, device=device)

        # Negative pair
        neg_pair = torch.stack([data.edge_index[0], neg_nodes[:num_neg_samples]])

        # Obtain the node embeddings
        pos_out, neg_out = model(data.x, data.edge_index, data.edge_attr, pos_pair, neg_pair)

        loss = nce_loss(pos_out, neg_out, neg_sample_size)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

In [26]:
data

Data(x=[326697, 80], edge_index=[2, 871393], edge_attr=[871393, 1])

In [27]:
from torch_geometric.data import NeighborSampler
from torch.optim import AdamW

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)

# Model and optimizer
model = GraphSAGE(in_channels=data.num_node_features, hidden_channels=64, out_channels=128, num_layers=2).to(device)
optimizer = AdamW(model.parameters(), lr=0.01)

# Data loader
loader = NeighborSampler(edge_index=data.edge_index, sizes=[-1]*model.num_layers, batch_size=128)

# Train the model
epochs = 20
neg_sample_size = 5
train(data, model, optimizer, neg_sample_size, device, epochs)

# Extract embeddings
model.eval()
with torch.no_grad():
    x, _ = model(data.x, data.edge_index, data.edge_attr)

ValueError: not enough values to unpack (expected 3, got 2)

In [None]:
chat과 상의한 마지막 부분의 코드를 아직 반영 안 했다. 결국 edge_attr을 전달하는 부분에 대한 내용인데, 
이거 내가 이미 고민하고 해결한 내용이다. 겁먹을 거 하나도 없다. 

++ 1번 노트의 코드를 확인하고, 문제를 차근히 확인, 에러 해결
1. 완료된 코드를 라인별로 해석 요청
2. 빠르게 비지도학습으로 임베딩 구하기
3. 회귀모델 싹 뽑아서 캐글 식으로 앙상블하기 - LGBM 여러개 앙상블, (시간 나면 NCF)