In [1]:
## 패키지 로드 및 데이터 불러오기.
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import os, random


import torch
import torch.nn as nn
from torch.nn.init import normal_
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F


#from . import Metric

## 경로 설정
data_path = '../../data'
saved_path = '../saved'
output_path = '../submission'


## 데이터 불러오기 
# 시청 시작/종료 데이터
history_df = pd.read_csv(os.path.join(data_path, 'history_data.csv'), encoding='utf-8')
watch_df = pd.read_csv(os.path.join(data_path, 'watch_e_data.csv'), encoding='utf-8')

# 구매/검색 데이터
buy_df = pd.read_csv(os.path.join(data_path, 'buy_data.csv'), encoding='utf-8')
search_df = pd.read_csv(os.path.join(data_path, 'search_data.csv'), encoding='utf-8')

# 콘텐츠(아이템) 메타정보 데이터
meta_df = pd.read_csv(os.path.join(data_path, 'meta_data.csv'), encoding='utf-8')
meta_plus_df = pd.read_csv(os.path.join(data_path, 'meta_data_plus.csv'), encoding='utf-8')

# 유저 프로필 데이터
profile_df = pd.read_csv(os.path.join(data_path, 'profile_data.csv'), encoding='utf-8')

# 제출 파일 데이터
submission_df = pd.read_csv(os.path.join(data_path, 'sample_submission.csv'), encoding='utf-8')

In [2]:
## 기본 설정
# 하이퍼파라미터 
class cfg: 
    gpu_idx = 0
    device = torch.device("cuda:{}".format(gpu_idx) if torch.cuda.is_available() else "cpu")
    top_k = 25
    seed = 42
    neg_ratio = 100 # 100

# MF 모델 하이퍼 파라미터 설정 
cfg.batch_size = 2048
cfg.emb_dim = 64 # 임베딩 레이어
cfg.epochs = 5
cfg.learning_rate = 0.01
cfg.num_layers = 3 # LightGCN 레이어 수

# 시드 고정 
def seed_everything(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)
    
seed_everything(cfg.seed)

In [3]:
## 데이터 중복제거 및 날짜 기준 valid 분할.

# history_df 테이블 중복제거
data = history_df[['profile_id', 'log_time', 'album_id']].drop_duplicates(subset=['profile_id', 'album_id', 'log_time']).sort_values(by = ['profile_id', 'log_time']).reset_index(drop = True)
# 의미 없는 meta 내 소제목 날려서 중복 제거하기.
meta_df = meta_df.drop_duplicates(subset = ['album_id', 'title']).reset_index(drop = True)
# 존재하는 값이기 때문에 1 부여
data['rating'] = 1

In [4]:
## LightGCN 모델에 넣기 전 인덱싱 하는 부분.
# id2idx, user/item id를 넣으면 인덱스가 나오는 딕셔너리.

# user/item id은 동일한 값이 있으면 안되기 때문에 더해줌.
data['profile_id'] = data['profile_id'] + meta_df['album_id'].nunique()

user_id = list(data['profile_id'].unique())
item_id = list(data['album_id'].unique())

n_user = len(user_id)
n_item = len(item_id)

id2idx = {v:k for k,v in enumerate(user_id)}
item2idx = {v:k+n_user for k,v in enumerate(item_id)}

id2idx.update(item2idx)
idx2id = {}
for k, v in id2idx.items():
    idx2id[v] = k

data = data[['profile_id', 'album_id', 'rating']]

data['profile_id'] = data['profile_id'].map(id2idx)
data['album_id'] = data['album_id'].map(id2idx)

data.drop_duplicates(subset = ['profile_id', 'album_id'], inplace = True)

In [5]:
## 네거티브 샘플링을 진행하는 부분.

# 유저 단위로 neg_ratio 배 만큼 네거티브 샘플링을 해줌.
for user in data['profile_id'].unique():
    n_u = len(data[data['profile_id'] == user])
    rand_neg_item = np.random.choice(np.arange(n_user, n_user+n_item), n_u * cfg.neg_ratio, replace=True)
    cnt =  n_u * cfg.neg_ratio
    data = data.append(pd.DataFrame({'profile_id' : [user] * cnt, 'album_id' : rand_neg_item, 'rating' : [0] * cnt}))

# 만약 본 아이템을 네거티브 샘플링을 한 경우에 이를 방지하는 코드.
data.drop_duplicates(subset = ['profile_id', 'album_id'], keep ='first', inplace = True)

data['profile_id'] = data['profile_id'].astype('int')
data['rating'] = data['rating'].astype('int')

In [6]:
# LightGCN을 적용하기 위해 edge, label을 만드는 코드. 

edge, label = [], []
for user, item, acode in zip(data.profile_id, data.album_id, data.rating):
    edge.append([user, item])
    label.append(acode)

edge = torch.LongTensor(edge).T
label = torch.LongTensor(label)

In [7]:
## 라이브러리에서 모델 불러오기.

from torch_geometric.nn.models import LightGCN

model = LightGCN(n_user + n_item, embedding_dim=cfg.emb_dim, num_layers = cfg.num_layers) # 3
model.to(cfg.device)
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate) # 0.01

batch_size = cfg.batch_size

In [8]:
## 모델 학습 부분.

for e in range(cfg.epochs): # 5
    idx = np.arange(len(edge[0]))
    idx = np.random.permutation(idx)
    t_edge = edge[:,idx]
    t_label = label[idx]

    for i in range(len(t_edge[0]) // batch_size):
        tem = min(batch_size * (i+1), len(t_edge[0]))

        b_edge = t_edge[:,batch_size*i:tem].to(cfg.device)
        b_label = t_label[batch_size*i:tem].to(cfg.device)

        pred = model(b_edge)

        # https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/nn/models/lightgcn.html
        loss = model.link_pred_loss(pred, b_label)

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    with torch.no_grad():

        print(
            f" * In epoch {(e+1):04}, loss={loss:.03f}"
        )


 * In epoch 0001, loss=0.415
 * In epoch 0002, loss=0.346
 * In epoch 0003, loss=0.275
 * In epoch 0004, loss=0.227
 * In epoch 0005, loss=0.165


In [9]:
# LightGCN 모델로 임베딩 레이어 뽑아내기.

model.eval()
tem = model.embedding(torch.LongTensor(data['profile_id'].unique()).to(cfg.device))
tem = tem.detach().cpu().numpy()
emb = pd.DataFrame(tem)
emb['id'] = data['profile_id'].unique()
emb['id'] = emb['id'].map(idx2id) - meta_df['album_id'].nunique()
emb.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,id
0,0.468873,1.541381,-0.494797,-0.746006,0.326949,-0.520875,1.683965,-0.344241,0.00109,1.008907,...,0.544517,0.163106,2.429857,-0.93231,-0.5997,-2.560289,-0.396309,-0.551025,2.371158,3
1,-3.219058,-2.804866,-2.064545,-1.484877,0.744984,1.230515,0.091342,0.102086,1.404219,0.69565,...,3.556404,-1.442405,0.738155,-2.202157,3.540129,0.858297,0.278805,-4.031192,1.51955,5
2,-0.900335,-0.813553,-1.217785,-0.542733,-0.718436,1.004176,1.113312,-1.137518,1.151592,1.097646,...,1.335576,-1.149408,0.218315,-0.762469,0.541913,0.478088,0.185284,-1.04254,-0.517281,7


In [10]:
def embedding(x):
    tmp=[]
    for i in range(64):
        tmp.append(float(x[i]))
    return tmp

emb['emb']=emb.apply(lambda x: embedding(x), axis=1)
gcn=emb[['id', 'emb']]

emb_gcn=[]
for idx,row in gcn.iterrows():
    emb_gcn.append(row['emb'])
emb_gcn=np.array(emb_gcn)
idx2gcn={idx:i for idx,i in enumerate(gcn['id'].unique())}

In [11]:
from sklearn.cluster import KMeans

num_clusters=12

Kmean = KMeans(n_clusters=num_clusters, random_state=42)
ret=Kmean.fit_predict(emb_gcn)

tmp=[[] for _ in range(num_clusters)]
for idx,label in enumerate(ret):
    tmp[label].append(idx2gcn[idx])

group=pd.DataFrame({'idx':[i for i in range(num_clusters)],'label':tmp})

for i in range(num_clusters):
    print(len(tmp[i]))

1319
723
510
1107
446
590
328
1076
248
588
284
1092


In [12]:
idx2user={idx:user for idx,user in enumerate(list(gcn['id']))}
user2idx={user:idx for idx,user in enumerate(list(gcn['id']))}

idx2label={idx:-1 for idx in range(8311)} # idx2label 인덱스 to 라벨
for idx,row in group.iterrows():
    for num in row['label']:
        idx2label[user2idx[num]]=row['idx']

In [13]:
data = history_df[['profile_id', 'log_time', 'album_id']].drop_duplicates(subset=['profile_id', 'album_id', 'log_time']).sort_values(by = ['profile_id', 'log_time']).reset_index(drop = True)

dat = data.groupby(['profile_id']).apply(lambda x : x['album_id'].value_counts()).reset_index()
dat = dat.groupby('profile_id')['level_1'].unique().to_frame().reset_index()
dat['label'] = dat['profile_id'].map(user2idx).map(idx2label)
data['label'] = data['profile_id'].map(user2idx).map(idx2label)

top25=[[] for _ in range(num_clusters)]
for i in range(num_clusters):
    top25[i] = list(data[data['label']==i][['album_id', 'profile_id']].groupby('album_id').nunique().sort_values('profile_id', ascending=False)[:25].index)

dat['pred'] = dat.apply(lambda x : top25[x['label']], axis = 1)

In [14]:
def recallk(actual, predicted, k = 25):
    """ label과 prediction 사이의 recall 평가 함수 
    Args:
        actual : 실제로 본 상품 리스트
        pred : 예측한 상품 리스트
        k : 상위 몇개의 데이터를 볼지 (ex : k=5 상위 5개의 상품만 봄)
    Returns: 
        recall_k : recall@k 
    """ 
    set_actual = set(actual)
    recall_k = len(set_actual & set(predicted[:k])) / min(k, len(set_actual))
    return recall_k

dat['recall1'] = dat.apply(lambda x : recallk(x['level_1'], x['pred']), axis = 1)

In [15]:
for i in range(num_clusters):
    print(i, dat[dat['label'] == i]['recall1'].mean())

0 0.26261650896987454
1 0.3809833228496575
2 0.1271949796225884
3 0.26687736440221665
4 0.30722073005774564
5 0.18920250192560037
6 0.2715376979694655
7 0.301298210057358
8 0.31548956260222244
9 0.17279850682671022
10 0.16436678926228235
11 0.27753234219988354


In [16]:
dat['predicted_list'] = dat.apply(lambda x : (list(x['level_1']) + x['pred'])[:25], axis = 1)
dat[['profile_id', 'predicted_list']].to_csv(os.path.join(output_path, 'Light_GCN_rul.csv'), index = False)