In [1]:
import pandas as pd
import random
import json
import numpy as np
import tqdm

random.seed(42)

# When False, ratings with sentiment == 0 are dropped right after loading
include_unknown = False
# Ratio of users to consider in warm start setting
warm_start_ratio = 0.75

ratings = pd.read_csv('../sources/mindreader/ratings-100k.csv')
if not include_unknown:
    ratings = ratings[ratings.sentiment != 0]

# Compute ratings per entity
# In the future, this could be used for popularity sampling of negative samples
entity_ratings = ratings[['uri', 'userId']].groupby('uri').count()
entity_ratings.columns = ['num_ratings']

# Filter users with fewer than two positive item ratings.
# The comparison must be parenthesised: `&` binds tighter than `==`, so
# `sentiment == 1 & isItem` would be evaluated as `sentiment == (1 & isItem)`.
tmp = ratings[(ratings.sentiment == 1) & ratings.isItem][['uri', 'userId']].groupby('userId').count()
tmp.columns = ['pos_ratings']

# .copy() makes the filtered frame independent of the loaded one, so columns
# can be added to it later without a SettingWithCopyWarning
ratings = ratings[ratings.userId.isin(tmp[tmp.pos_ratings >= 2].index)].copy()

# Partition into warm and cold start users
users = ratings['userId'].unique()
random.shuffle(users)  # in-place shuffle, driven by the seed set above

num_warm_start = int(len(users) * warm_start_ratio)
warm_start_users = set(users[:num_warm_start])
cold_start_users = set(users[num_warm_start:])

assert warm_start_users.isdisjoint(cold_start_users)

In [2]:
def sample_positive(from_ratings):
    """Pick one positively rated item at random.

    Args:
        from_ratings: DataFrame with `sentiment`, `isItem` and `entityIdx`
            columns.

    Returns:
        The `entityIdx` of a randomly chosen entity that is an item
        (`isItem` is true) and has `sentiment == 1`.

    Raises:
        IndexError: if `from_ratings` contains no positive item rating.
    """
    # Parenthesise the comparison: `&` binds tighter than `==`, so the
    # unparenthesised form compares sentiment against `(1 & isItem)` instead.
    is_positive_item = (from_ratings.sentiment == 1) & from_ratings.isItem
    return random.choice(from_ratings[is_positive_item].entityIdx.unique())

def sample_unseen_items(user_id, n_items=100):
    """Draw up to `n_items` items that `user_id` has not rated.

    Reads the module-level `ratings` frame. Only rows flagged `isItem` are
    considered, both for what the user has seen and for the candidate pool.

    Args:
        user_id: value matched against the `userId` column.
        n_items: maximum number of item indices to return.

    Returns:
        A list of `entityIdx` values in random order; shorter than `n_items`
        when fewer unseen items exist.
    """
    items_only = ratings[ratings.isItem]

    rated_by_user = items_only[items_only.userId == user_id].entityIdx.unique()
    all_items = set(items_only.entityIdx.unique())
    candidates = list(all_items.difference(set(rated_by_user)))

    # Shuffle in place, then keep the first n_items as the sample
    random.shuffle(candidates)
    return candidates[:n_items]

def get_ratings_dict(from_ratings):
    """Map each rated entity index to its sentiment.

    Args:
        from_ratings: DataFrame with `entityIdx` and `sentiment` columns.

    Returns:
        dict of `entityIdx -> sentiment` holding native Python scalars. If an
        entity index occurs more than once, the last row wins.
    """
    # Zipping the two columns avoids iterrows(), which builds a Series per row
    # and upcasts values to a common dtype (e.g. int ids become floats when
    # the frame has only numeric columns).
    return dict(zip(from_ratings.entityIdx.tolist(), from_ratings.sentiment.tolist()))

def get_validation_dict(user_id, left_out):
    """Pair a held-out positive with freshly sampled negatives.

    Args:
        user_id: user whose unseen items are sampled as negatives.
        left_out: the held-out positive, stored unchanged.

    Returns:
        dict with key 'positive' (the value of `left_out`) and key 'negative'
        (the result of `sample_unseen_items(user_id)`).
    """
    negatives = sample_unseen_items(user_id)
    return dict(positive=left_out, negative=negatives)

# Map users and entities to indices
user_idx = {user: idx for idx, user in enumerate(users)}
entity_idx = {uri: idx for idx, uri in enumerate(ratings['uri'].unique())}

# assign() returns a new frame instead of writing a column into a frame that
# may be a filtered slice (which triggers SettingWithCopyWarning)
ratings = ratings.assign(entityIdx=ratings.uri.map(entity_idx))

# Find movie indices
movie_indices = set(ratings[ratings.isItem].entityIdx.unique())

# Visit users in the order of the (seeded) shuffled `users` array rather than
# iterating the sets directly: set order is not guaranteed to be stable between
# kernel restarts, which would hand different random draws to each user.
warm_start_order = [user for user in users if user in warm_start_users]
cold_start_order = [user for user in users if user in cold_start_users]

# Generate training/validation data from warm start users
training_data = dict()

for user in tqdm.tqdm(warm_start_order):
    u_ratings = ratings[ratings.userId == user]

    # Hold out one positive item for validation; train on everything else
    val_sample = sample_positive(u_ratings)

    training_dict = get_ratings_dict(u_ratings[u_ratings.entityIdx != val_sample])
    validation_dict = get_validation_dict(user, val_sample)

    # Assert validation sample not in training
    assert val_sample not in training_dict.keys()

    # Assert positive sample not in negative samples
    assert val_sample not in validation_dict['negative']

    # Assert negative samples not in training
    assert not set(validation_dict['negative']).intersection(training_dict.keys())

    training_data[user_idx[user]] = {
        'training': training_dict,
        'validation': validation_dict
    }

# Generate testing data from cold start users
testing_data = dict()

for user in tqdm.tqdm(cold_start_order):
    u_ratings = ratings[ratings.userId == user]

    # Before exhaustive LOO, get validation sample
    val_sample = sample_positive(u_ratings)
    validation_dict = get_validation_dict(user, val_sample)

    # For convenience, leave out the validation sample from the user's ratings
    u_ratings = u_ratings[u_ratings.entityIdx != val_sample]

    # Find all the user's positive item ratings.
    # The comparison is parenthesised on purpose: `&` binds tighter than `==`,
    # so `isItem & sentiment == 1` is evaluated as `(isItem & sentiment) == 1`,
    # which would also match item ratings whose sentiment is not 1.
    u_pos = u_ratings[u_ratings.isItem & (u_ratings.sentiment == 1)]
    assert len(u_pos)

    # For each positive item, create an answer set with that item left out
    sets = []
    for pos_idx in u_pos.entityIdx.tolist():
        answer_dict = get_ratings_dict(u_ratings[u_ratings.entityIdx != pos_idx])
        pos_neg_dict = get_validation_dict(user, pos_idx)

        # Skip if user cannot provide any movie answers
        # By checking this, we can remove DEs from answer sets without losing users between comparisons
        if not set(answer_dict.keys()).intersection(movie_indices):
            continue

        # Assert that the positive item is not in the negative samples
        assert pos_idx not in pos_neg_dict['negative']

        # Assert that user cannot answer about the positive item
        assert pos_idx not in answer_dict

        sets.append({**pos_neg_dict, 'answers': answer_dict})

    # Check if user has any valid answer sets
    if not sets:
        continue

    testing_data[user_idx[user]] = {
        'sets': sets,
        'validation': validation_dict
    }


print(f'Created {len(training_data)} training entries and {len(testing_data)} testing entries')

100%|██████████| 853/853 [00:43<00:00, 19.65it/s]
100%|██████████| 285/285 [02:21<00:00,  2.02it/s]


Created 853 training entries and 281 testing entries


In [3]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to native types."""

    def default(self, obj):
        """Return a JSON-serialisable equivalent of `obj`.

        NumPy booleans, integers and floats become `bool`, `int` and `float`;
        arrays become (nested) lists. Anything else is delegated to the base
        encoder, which raises `TypeError` for unsupported objects.
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own branch
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

# Write the warm-start (training) and cold-start (testing) splits to disk
for path, payload in (('data/training.json', training_data), ('data/testing.json', testing_data)):
    with open(path, 'w') as fp:
        json.dump(payload, fp, cls=NpEncoder)

# Metadata: the URI -> index mapping, and for every entity index whether it is an item
idx_item = dict(zip(ratings.entityIdx.tolist(), ratings.isItem.tolist()))
meta = {'uri_idx': entity_idx, 'idx_item': idx_item}

with open('data/meta.json', 'w') as fp:
    json.dump(meta, fp, cls=NpEncoder)

print('Dumped data')

Dumped data
