In [64]:
# `display` is imported from the public IPython.display module; importing it from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Set the notebook container width to 75% of the page.
display(HTML("<style>.container { width: 75% !important; }</style>"))

import pandas as pd
import numpy as np

### 원 데이터
* user: user id
* item: artist id (MusicBrainz ID, 원본 파일의 artmbid 컬럼)  
* plays: user가 artist를 play한 횟수   

In [3]:
# Read the raw tab-separated play log; the file has no header row.
df = pd.read_csv(file_path + '/usersha1-artmbid-artname-plays.tsv', sep='\t', header=None)

# Discard the third column and name the remaining three.
df = df.drop(columns=df.columns[2])
df.columns = ['user', 'item', 'plays']

# Remove rows with any missing value, then rows whose play count is zero.
df = df.dropna().loc[lambda frame: frame['plays'] != 0]

In [60]:
df.shape

(17309470, 3)

In [4]:
df.head()

Unnamed: 0,user,item,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,1099
1,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,897
2,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,717
3,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,706
4,00000c289a1829a808ac09c00daf10bc3c4e223b,8bfac288-ccc5-448d-9573-c33ea2aa5c30,691


In [7]:
# Count distinct values directly on the columns; this avoids copying each
# column into a Python list and sorting it just to take its length.
print('user 수:', df['user'].nunique())
print('artist 수:', df['item'].nunique())

user 수: 358857
artist 수: 160112


### 분석 데이터
* 분석에 필요한 데이터 준비 

In [15]:
def prepare_analy_dataset(df, n_users=10000):
    """
    Build the train / test / negative-sample datasets from the play table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'user', 'item' and 'plays'.
    n_users : int, default 10000
        Number of distinct users to sample without replacement. If the table
        holds fewer distinct users, all of them are used.

    Returns
    -------
    uids : numpy.ndarray
        user_id of every train row.
    iids : numpy.ndarray
        item_id of every train row (aligned with ``uids``).
    df_train : pandas.DataFrame
        Train rows (user_id, item_id, plays).
    df_test : pandas.DataFrame
        One held-out row per user (user_id, item_id, plays).
    df_neg : pandas.DataFrame
        Output of ``get_negatives`` for the test rows.
    users : list
        Sorted distinct user_id values (train and test together).
    items : list
        Sorted distinct item_id values (train and test together).
    item_lookup : pandas.DataFrame
        Distinct (item_id, item) pairs, with item_id stored as str.
    """
    # Sample users without replacement (uses the global NumPy random state).
    unique_users = np.unique(df['user'])
    n_sample = min(n_users, len(unique_users))
    sample_idx = np.random.choice(len(unique_users), n_sample, replace=False)
    sampled_users = unique_users[sample_idx]

    df = df[df['user'].isin(sampled_users)].reset_index(drop=True)

    # Keep only users with MORE than one row: one row is held out for the test
    # set, so at least one more is needed for training.
    rows_per_user = df.groupby('user')['user'].transform('count')
    # .copy() so the column assignments below write to an independent frame
    # rather than to a filtered slice (avoids SettingWithCopyWarning).
    df = df[rows_per_user > 1].copy()

    # Assign dense integer ids (category codes) to users and items.
    df['user_id'] = df['user'].astype("category").cat.codes
    df['item_id'] = df['item'].astype("category").cat.codes

    # Lookup table from item_id back to the original item value.
    item_lookup = df[['item_id', 'item']].drop_duplicates()
    item_lookup['item_id'] = item_lookup.item_id.astype(str)

    # Split into train / test.
    df = df[['user_id', 'item_id', 'plays']]
    df_train, df_test = train_test_split(df)

    # Sorted lists of every user_id / item_id.
    users = list(np.sort(df.user_id.unique()))
    items = list(np.sort(df.item_id.unique()))

    # Train (user_id, item_id) pairs as integer arrays.
    uids = np.array(df_train['user_id'].astype(int).tolist())
    iids = np.array(df_train['item_id'].astype(int).tolist())

    # Negative items for every test row.
    df_neg = get_negatives(uids, iids, items, df_test)

    return uids, iids, df_train, df_test, df_neg, users, items, item_lookup

def get_negatives(uids, iids, items, df_test, num_neg=100):
    """
    Sample negative items for every test (user_id, item_id) pair.

    Parameters
    ----------
    uids, iids : sequences of int
        Aligned user_id / item_id values of the train pairs.
    items : sequence
        Only its length is used: candidates are drawn from
        ``range(len(items))``, so item ids are expected to be the integers
        0 .. len(items) - 1.
    df_test : pandas.DataFrame
        Must contain the columns 'user_id' and 'item_id'.
    num_neg : int, default 100
        Number of negatives drawn per test row.

    Returns
    -------
    pandas.DataFrame
        One row per test row. Column 0 holds the (user_id, item_id) tuple of
        the held-out positive; columns 1..num_neg hold the sampled negatives.
        Negatives are drawn independently, so a row may contain repeats.

    Notes
    -----
    Sampling is by rejection using the global NumPy random state. The loop
    does not terminate for a user who has no eligible item left.
    """
    train_pairs = set(zip(uids, iids))            # train (user, item) pairs
    test_users = df_test['user_id'].values.tolist()
    test_items = df_test['item_id'].values.tolist()
    num_items = len(items)

    negative_rows = []
    for u, i in zip(test_users, test_items):
        row = [(u, i)]
        for _ in range(num_neg):
            j = np.random.randint(num_items)
            # Reject items the user has in train AND the held-out positive
            # itself; otherwise the positive could appear as its own negative.
            while j == i or (u, j) in train_pairs:
                j = np.random.randint(num_items)
            row.append(j)
        negative_rows.append(row)  # [(u, pos), neg, neg, ...]

    return pd.DataFrame(negative_rows)

def mask_first(x):
    """
    Return an array shaped like ``x`` that is 0 at position 0 and 1 elsewhere.
    """
    keep_flags = np.ones_like(x)  # start with all ones
    keep_flags[0] = 0             # clear the first slot -> [0, 1, 1, ...]
    return keep_flags

def train_test_split(df):
    """
    Leave-one-out split: hold out each user's first row as test data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'user_id', 'item_id' and 'plays'.

    Returns
    -------
    df_train : pandas.DataFrame
        Every row of ``df`` except each user's first row (original index kept).
    df_test : pandas.DataFrame
        Each user's first row, columns (user_id, item_id, plays), ordered by
        user_id with a fresh 0..n-1 index.

    Notes
    -----
    The held-out row is taken as a whole row. ``groupby().first()`` would
    instead take the first non-null value of each column separately, which
    can combine values from different rows when a column contains NaN.
    """
    # True for every row after a user's first occurrence, False for the first.
    is_later_row = df['user_id'].duplicated()

    # Test set: each user's first row.
    first_rows = df.loc[~is_later_row, ['user_id', 'item_id', 'plays']]
    df_test = first_rows.sort_values('user_id').reset_index(drop=True)

    # Train set: everything else.
    df_train = df.loc[is_later_row]

    return df_train, df_test

def get_train_instances(uids, iids, num_neg, num_items):
    """
    Expand the train pairs into labelled model inputs.

    For every (user, item) pair in ``zip(uids, iids)`` one positive example
    (label 1) is emitted, followed by ``num_neg`` negative examples (label 0)
    whose item ids are drawn from ``range(num_items)`` and redrawn while the
    (user, item) pair is one of the train pairs.

    Returns three aligned lists: user_input, item_input, labels.
    """
    observed = set(zip(uids, iids))  # train (user, item) pairs

    def sample_unseen(user):
        # Rejection sampling: redraw until the pair is not a train pair.
        while True:
            candidate = np.random.randint(num_items)
            if (user, candidate) not in observed:
                return candidate

    user_input, item_input, labels = [], [], []
    for user, pos_item in zip(uids, iids):
        negatives = [sample_unseen(user) for _ in range(num_neg)]

        # Layout per train pair: [pos, neg_1, ..., neg_k] with labels [1, 0, ..., 0].
        user_input.extend([user] * (1 + len(negatives)))
        item_input.append(pos_item)
        item_input.extend(negatives)
        labels.append(1)
        labels.extend([0] * len(negatives))

    return user_input, item_input, labels


In [16]:
# Build all datasets in one call. The user sample and the negative items are
# drawn with np.random inside the helpers, so results vary between runs unless
# a seed was set earlier (none is visible in this part of the notebook).
uids, iids, df_train, df_test, df_neg, users, items, item_lookup = prepare_analy_dataset(df)

### train 데이터

In [25]:
df_train.head(10)

Unnamed: 0,user_id,item_id,plays
1,0,43686,534
2,0,40839,301
3,0,27785,248
4,0,14992,240
5,0,31681,200
6,0,41188,194
7,0,27547,183
8,0,48035,172
9,0,6889,149
10,0,42137,144


In [28]:
df_train.shape

(471404, 3)

### 모델 train 데이터
df_train의 각 row (user_id, item_id)당 negative item을 num_neg개씩 랜덤으로 선택한다.   

In [41]:
# Expand every train (user_id, item_id) pair into 1 positive + 4 sampled negatives.
user_input, item_input, labels = get_train_instances(uids, iids, num_neg=4, num_items=len(items))

#### 예시 

In [55]:
# Print the (user_id, item_id) pair of the first two train rows.
print('df_train의 첫번째 행: (user_id, item_id)=', (uids[0], iids[0])) 
print('df_train의 두번째 행: (user_id, item_id)=', (uids[1], iids[1])) 

df_train의 첫번째 행: (user_id, item_id)= (0, 43686)
df_train의 두번째 행: (user_id, item_id)= (0, 40839)


In [59]:
# Show the first 10 generated instances. Positives are identified by their
# label rather than by fixed positions (0 and 5), which were only correct
# for num_neg=4.
for user_id, item_id, label in zip(user_input[0:10], item_input[0:10], labels[0:10]):
    if label == 1:
        print('(user_id, postive_item_id, label):', (user_id, item_id, label))
    else:
        print('(user_id, negative_item_id, label):', (user_id, item_id, label))

(user_id, postive_item_id, label): (0, 43686, 1)
(user_id, negative_item_id, label): (0, 4284, 0)
(user_id, negative_item_id, label): (0, 30458, 0)
(user_id, negative_item_id, label): (0, 4093, 0)
(user_id, negative_item_id, label): (0, 15591, 0)
(user_id, postive_item_id, label): (0, 40839, 1)
(user_id, negative_item_id, label): (0, 2061, 0)
(user_id, negative_item_id, label): (0, 43559, 0)
(user_id, negative_item_id, label): (0, 29172, 0)
(user_id, negative_item_id, label): (0, 32114, 0)


### test 데이터
각 user별로, user_id와 user가 플레이한 item(artist) 1개로 이뤄진다.  

In [29]:
df_test.head(10)

Unnamed: 0,user_id,item_id,plays
0,0,47912,690
1,1,45306,3478
2,2,38285,315
3,3,34421,1081
4,4,14240,163
5,5,37358,102
6,6,39910,202
7,7,34012,1015
8,8,34133,39655
9,9,38593,432


In [30]:
df_test.shape

(10000, 3)

### df_neg 데이터
각 user별로, negative item(user가 train 데이터에서 플레이하지 않은 item(artist)) 100개를 랜덤으로 선택한다.  
* column 0: df_test 데이터의 (user_id, item_id)  
* column 1~100: negative item     

df_neg 데이터는 모델 평가시 TOP-K metric을 계산할 때 사용된다.     

In [32]:
df_neg.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,"(0, 47912)",15538,13645,27015,37532,19044,15700,36803,3452,39381,...,23180,41660,25694,26889,24163,49302,3171,46151,19003,27326
1,"(1, 45306)",38940,29659,31319,32218,24843,7864,7663,14334,18938,...,44109,38467,45471,34541,36679,29502,42367,10750,26569,25374
2,"(2, 38285)",28492,6480,4062,35839,20446,44947,33852,6137,45770,...,38495,32714,43861,46410,45033,5332,22047,42698,23249,15833
3,"(3, 34421)",10044,16773,14762,33474,26398,2561,21464,31421,11798,...,9679,41737,17012,3033,4590,19763,23572,35607,27029,19199
4,"(4, 14240)",26861,21718,42307,19718,11827,7268,20855,7697,18883,...,32455,7677,23137,44303,23418,37859,16071,32773,29910,3079


In [58]:
df_neg.shape

(10000, 101)