In [19]:
import argparse
import numpy as np
from tqdm import tqdm

# Per-dataset configuration tables, each keyed by the dataset name.

# Raw rating file to read for each dataset.
RATING_FILE_NAME = {
    'movie': 'ratings_re2.csv',
    'book': 'BX-Book-Ratings.csv',
    'news': 'ratings.txt',
}

# Item listing file; its line count defines the item universe.
# Only the 'movie' dataset has an entry.
ITEMS_FILE_NAME = {
    'movie': 'moviesIdx2.txt',
}

# Field separator used inside each dataset's rating file.
SEP = {
    'movie': ',',
    'book': ';',
    'news': '\t',
}

# Minimum rating (inclusive) for an interaction to count as positive.
THRESHOLD = {
    'movie': 4,
    'book': 0,
    'news': 0,
}

In [20]:
def convert_rating():
    """Convert the raw rating file of ``DATASET`` into ``ratings_final.txt``.

    Reads the module-level ``DATASET`` name and the configuration tables
    ``ITEMS_FILE_NAME``, ``RATING_FILE_NAME``, ``SEP`` and ``THRESHOLD``.

    Steps:
      1. Count the lines of the items file; the item universe is the set of
         indices ``0 .. line_count - 1``.
      2. Read the rating file. Each non-blank line is split on
         ``SEP[DATASET]`` and its first three fields are parsed as
         ``user (int)``, ``item (int)``, ``rating (float)``. Ratings that are
         ``>= THRESHOLD[DATASET]`` are positive, all others negative.
      3. For every user with at least one positive item, write one
         ``user<TAB>item<TAB>1`` line per positive item, followed by the same
         number of ``user<TAB>item<TAB>0`` lines whose items are drawn
         without replacement from the items the user has not rated at all
         (neither positively nor negatively).

    Users that only have negative ratings produce no output lines.

    Sampling uses the global ``np.random`` state, so seed it beforehand for
    reproducible output. If a user has fewer unrated items than positive
    items, all remaining unrated items are used instead of raising.

    Raises:
        KeyError: if ``DATASET`` has no entry in one of the config tables.
        ValueError / IndexError: if a non-blank rating line cannot be parsed.
        OSError: if an input file cannot be opened.
    """
    print('reading rating file ...')

    # Only the number of lines matters here, so stream the file instead of
    # materialising every line, and make sure the handle is closed.
    with open(ITEMS_FILE_NAME[DATASET], encoding='utf-8') as items_file:
        item_count = sum(1 for _ in items_file)
    item_set = set(range(item_count))

    user_pos_ratings = dict()
    user_neg_ratings = dict()

    separator = SEP[DATASET]
    threshold = THRESHOLD[DATASET]
    with open(RATING_FILE_NAME[DATASET], encoding='utf-8') as rating_file:
        for line in rating_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue

            array = line.split(separator)
            user_index = int(array[0])
            item_index = int(array[1])
            rating = float(array[2])

            # Separate positive & negative rated items
            if rating >= threshold:
                bucket = user_pos_ratings
            else:
                bucket = user_neg_ratings
            bucket.setdefault(user_index, set()).add(item_index)

    print('reading rating file success !')
    print('converting rating file ...')

    # Output file; the context manager closes it even if sampling fails.
    with open('ratings_final.txt', 'w', encoding='utf-8') as writer:
        for user_index, pos_item_set in tqdm(user_pos_ratings.items()):

            # Write positive sample
            for item in pos_item_set:
                writer.write("{}\t{}\t1\n".format(user_index, item))

            # ! Negative sample using unwatched instead of negative rated movies !
            # `item_set - pos_item_set` builds a new set, so the in-place
            # subtraction below never mutates the shared `item_set`.
            unwatched_set = item_set - pos_item_set
            if user_index in user_neg_ratings:
                unwatched_set -= user_neg_ratings[user_index]

            # np.random.choice(replace=False) raises when asked for more
            # samples than candidates, so cap the request at what is available.
            candidates = list(unwatched_set)
            sample_size = min(len(pos_item_set), len(candidates))
            if sample_size == 0:
                continue

            # Write negative sample (unwatched)
            for item in np.random.choice(candidates, size=sample_size, replace=False):
                writer.write("{}\t{}\t0\n".format(user_index, item))

    print('converting rating file success !')

In [21]:
def convert_kg():
    """Rewrite the knowledge-graph triple file as tab-separated ``kg_final.txt``.

    Reads the module-level ``DATASET`` name: ``'movie'`` uses
    ``triples_idx2.txt`` as input, any other value uses ``kg_rehashed.txt``.
    Each non-blank input line must consist of exactly three fields separated
    by single spaces (head, relation, tail); they are written back joined by
    tabs, one triple per line.

    The input is opened before the output so that a missing input file does
    not truncate an existing ``kg_final.txt``.

    Raises:
        ValueError: if a non-blank line does not split into exactly three
            space-separated fields.
        OSError: if a file cannot be opened.
    """
    print('converting kg file ...')

    if DATASET == 'movie':
        kg_path = 'triples_idx2.txt'
    else:
        kg_path = 'kg_rehashed.txt'

    # Context managers guarantee both handles are closed, even on a bad line.
    with open(kg_path, encoding='utf-8') as raw_knowledge_graph, \
            open('kg_final.txt', 'w', encoding='utf-8') as writer:
        for line in raw_knowledge_graph:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            head, relation, tail = line.split(' ')
            writer.write("{}\t{}\t{}\n".format(head, relation, tail))

    print('converting kg file success !')

In [22]:
# Fix NumPy's global RNG so the negative sampling done through
# np.random.choice yields the same output on every run.
np.random.seed(555)

# Empty lookup tables; nothing in the cells shown here fills or reads them.
# NOTE(review): presumably kept for id re-indexing steps - confirm they are still needed.
entity_id2index = {}
relation_id2index = {}
item_index_old2new = {}

In [23]:
# Build ratings_final.txt from the raw rating file of DATASET.
# The recorded run below processed 138152 users in roughly four minutes.
convert_rating()

reading rating file ...


  0%|          | 12/138152 [00:00<2:34:25, 14.91it/s]

reading rating file success !
converting rating file ...


100%|██████████| 138152/138152 [04:12<00:00, 546.76it/s]


converting rating file success !


In [24]:
# Build kg_final.txt (tab-separated triples) from the raw knowledge-graph file.
convert_kg()

converting kg file ...
converting kg file success !
