In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from collections import defaultdict
from tensorflow.keras.preprocessing.sequence import pad_sequences

Init Plugin
Init Graph Optimizer
Init Kernel


In [2]:
file = '../dataset/ml-1m/ratings.dat'


In [3]:
print('==========Data Preprocess Start=============')
data_df = pd.read_csv(file, sep="::", engine='python',
                      names=['user_id', 'item_id', 'label', 'Timestamp'])

data_df



Unnamed: 0,user_id,item_id,label,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


# filtering

In [4]:
data_df['item_count'] = data_df.groupby('item_id')['item_id'].transform('count')
data_df

Unnamed: 0,user_id,item_id,label,Timestamp,item_count
0,1,1193,5,978300760,1725
1,1,661,3,978302109,525
2,1,914,3,978301968,636
3,1,3408,4,978300275,1315
4,1,2355,5,978824291,1703
...,...,...,...,...,...
1000204,6040,1091,1,956716541,373
1000205,6040,1094,5,956704887,1229
1000206,6040,562,5,956704746,478
1000207,6040,1096,4,956715648,344


In [5]:
data_df = data_df[data_df.item_count >= 5]
data_df

Unnamed: 0,user_id,item_id,label,Timestamp,item_count
0,1,1193,5,978300760,1725
1,1,661,3,978302109,525
2,1,914,3,978301968,636
3,1,3408,4,978300275,1315
4,1,2355,5,978824291,1703
...,...,...,...,...,...
1000204,6040,1091,1,956716541,373
1000205,6040,1094,5,956704887,1229
1000206,6040,562,5,956704746,478
1000207,6040,1096,4,956715648,344


# trans score

In [6]:
trans_score = 1
data_df = data_df[data_df.label >= trans_score]
data_df

Unnamed: 0,user_id,item_id,label,Timestamp,item_count
0,1,1193,5,978300760,1725
1,1,661,3,978302109,525
2,1,914,3,978301968,636
3,1,3408,4,978300275,1315
4,1,2355,5,978824291,1703
...,...,...,...,...,...
1000204,6040,1091,1,956716541,373
1000205,6040,1094,5,956704887,1229
1000206,6040,562,5,956704746,478
1000207,6040,1096,4,956715648,344


# sort

In [7]:
data_df = data_df.sort_values(by=['user_id', 'Timestamp'])
data_df

Unnamed: 0,user_id,item_id,label,Timestamp,item_count
31,1,3186,4,978300019,431
22,1,1270,5,978300055,2583
27,1,1721,4,978300055,1546
37,1,1022,5,978300055,577
24,1,2340,3,978300103,344
...,...,...,...,...,...
1000019,6040,2917,4,997454429,504
999988,6040,1921,4,997454464,613
1000172,6040,1784,3,997454464,1424
1000167,6040,161,3,997454486,760


# split dataset and negative sampling

In [14]:
test_neg_num = 100

train_data, val_data, test_data = defaultdict(list), defaultdict(list), defaultdict(list)
item_id_max = data_df['item_id'].max()
for user_id, df in tqdm(data_df[['user_id', 'item_id']].groupby('user_id')):
    pos_list = df['item_id'].tolist()

    def gen_neg():
        neg = pos_list[0]
        while neg in set(pos_list):
            neg = random.randint(1, item_id_max)
        return neg

    neg_list = [gen_neg() for i in range(len(pos_list) + test_neg_num)]
    for i in range(1, len(pos_list)):
        hist_i = pos_list[:i]
        if i == len(pos_list) - 1:
            test_data['user_id'].append(user_id)
            test_data['pos_id'].append(pos_list[i])
            test_data['neg_id'].append(neg_list[i:])
        elif i == len(pos_list) - 2:
            val_data['user_id'].append(user_id)
            val_data['pos_id'].append(pos_list[i])
            val_data['neg_id'].append(neg_list[i])
        else:
            train_data['user_id'].append(user_id)
            train_data['pos_id'].append(pos_list[i])
            train_data['neg_id'].append(neg_list[i])

100%|██████████| 6040/6040 [00:25<00:00, 240.28it/s]


# feature columns

In [18]:
def sparseFeature(feature, feature_num, embed_dim):
    """
    create dictionary for sparse feature
    :param feature_name: feature name
    :param feature_num: the total number of sparse features that do not repeat
    
    :return:
    """
    return {'feature_name': feature, 'feature_num': feature_num, 'embed_dim': embed_dim}

In [19]:
embed_dim = 32

user_num, item_num = data_df['user_id'].max() + 1, data_df['item_id'].max() + 1
item_feat_col = [sparseFeature('user_id', user_num, embed_dim),
                 sparseFeature('item_id', item_num, embed_dim)]

# shuffle

In [20]:
random.shuffle(train_data)
random.shuffle(val_data)
train = [np.array(train_data['user_id']), np.array(train_data['pos_id']),
         np.array(train_data['neg_id'])]
val = [np.array(val_data['user_id']), np.array(val_data['pos_id']),
       np.array(val_data['neg_id'])]
test = [np.array(test_data['user_id']), np.array(test_data['pos_id']),
        np.array(test_data['neg_id'])]

In [21]:
print('============Data Preprocess End=============')
print(item_feat_col)
print(f'train : {len(train)}')
print(f'val : {len(val)}')
print(f'test : {len(test)}')



[{'feature_name': 'user_id', 'feature_num': 6041, 'embed_dim': 32}, {'feature_name': 'item_id', 'feature_num': 3953, 'embed_dim': 32}]
train : 3
val : 3
test : 3


In [30]:
item_feat_col

[{'feature_name': 'user_id', 'feature_num': 6041, 'embed_dim': 32},
 {'feature_name': 'item_id', 'feature_num': 3953, 'embed_dim': 32}]

In [36]:
user_inputs, pos_inputs, neg_inputs = train

In [42]:
user_inputs.shape

(981491,)

In [43]:
pos_inputs.shape

(981491,)

In [44]:
neg_inputs.shape

(981491,)

In [45]:
pos_inputs

array([1270, 1721, 1022, ..., 2917, 1921, 1784])

In [46]:
neg_inputs

array([ 968, 3169, 3652, ...,  805, 2736, 1614])