In [54]:
import random
import pandas as pd
import json

# Load the raw ratings and drop every row whose sentiment is 0.
raw_ratings = pd.read_csv('mindreader/ratings.csv')
src_ratings = raw_ratings[raw_ratings.sentiment != 0]

# Per user, count the rows that are positive (sentiment == 1) item ratings.
# `count()` tallies the non-null `uri` values, which become `pos_ratings`.
positive_item_mask = (src_ratings.sentiment == 1) & (src_ratings.isItem)
user_ratings = (
    src_ratings.loc[positive_item_mask, ['uri', 'userId']]
    .groupby('userId')
    .count()
    .rename(columns={'uri': 'pos_ratings'})
)

# Attach the count to every rating row (users without any positive item rating
# disappear in this inner merge) and keep only users with more than two of
# them: the split cell later holds out two positive items per user.
ratings_with_counts = src_ratings.merge(user_ratings, on='userId')
src_ratings = ratings_with_counts[ratings_with_counts.pos_ratings > 2]

In [55]:
from generic_data_loader import Rating

# Lookup tables mapping raw identifiers to dense, zero-based indices that are
# handed out in order of first appearance in `src_ratings`.
entity_idx = {}
user_idx = {}

ratings = []

# Convert rows to ratings
for row_label, row in src_ratings.iterrows():
    # `setdefault` only stores the proposed index (the current table size) when
    # the key is new, so indices stay contiguous; otherwise it returns the
    # index assigned earlier.
    e_idx = entity_idx.setdefault(row.uri, len(entity_idx))
    u_idx = user_idx.setdefault(row.userId, len(user_idx))

    ratings.append(Rating(u_idx, e_idx, row.sentiment, row.isItem))

# Totals of distinct entities / users seen.
entity_count = len(entity_idx)
user_count = len(user_idx)

# Indices of every entity that appears in at least one item (movie) rating.
item_indices = {rating.e_idx for rating in ratings if rating.is_movie_rating}

print(f'{len(item_indices)=}')
print(f'{len(ratings)=}')

len(item_indices)=3013
len(ratings)=40731


In [56]:
# Per-user accumulators for the split: training data with every remaining
# rating (`de_training`), training data restricted to movie ratings
# (`movie_training`), and the held-out test / validation samples.
de_training, movie_training, test, validation = [], [], [], []

def convert(rating_list):
    """Turn rating objects into plain dictionaries.

    Args:
        rating_list: iterable of objects exposing the attributes ``u_idx``,
            ``e_idx``, ``rating`` and ``is_movie_rating``.

    Returns:
        A list with one dict per input element, in input order, whose keys
        are those four attribute names (in that order) and whose values are
        the corresponding attribute values.
    """
    fields = ("u_idx", "e_idx", "rating", "is_movie_rating")
    return [{field: getattr(entry, field) for field in fields} for entry in rating_list]

# Dedicated, seeded generator: the validation/test hold-outs are identical on
# every run over the same input, and the global `random` state is left alone.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

# Bucket the ratings by user in a single pass instead of rescanning the whole
# `ratings` list once per user. Each bucket keeps the order of `ratings`.
ratings_by_user = dict()
for rating in ratings:
    ratings_by_user.setdefault(rating.u_idx, []).append(rating)

for user in user_idx.values():
    # Deliberately not called `user_ratings`: that name holds the per-user
    # positive-count DataFrame built in the loading cell and must not be
    # overwritten by a plain list here.
    ratings_of_user = ratings_by_user.get(user, [])

    # Items this user rated, and every known item they did not rate.
    interacted_items = [rating.e_idx for rating in ratings_of_user if rating.is_movie_rating]
    uninteracted_items = item_indices.difference(interacted_items)

    # Hold out two distinct positively rated items: one for validation, one for test.
    positive_interacted_items = [rating.e_idx for rating in ratings_of_user
                                 if rating.is_movie_rating and rating.rating == 1]
    samples = split_rng.sample(positive_interacted_items, 2)
    val_sample, test_sample = samples
    assert val_sample != test_sample
    assert val_sample in interacted_items
    assert test_sample in interacted_items

    # Everything except the two held-out items is training data. The length
    # check fails if a held-out entity occurs more than once for this user.
    train = [rating for rating in ratings_of_user if rating.e_idx not in samples]
    assert len(train) + 2 == len(ratings_of_user)
    train_entities = [rating.e_idx for rating in train]
    assert val_sample not in train_entities
    assert test_sample not in train_entities

    train_movies_only = [rating for rating in train if rating.is_movie_rating]

    # Add to lists: each evaluation entry pairs the held-out item with the
    # user's uninteracted items.
    test.append((user, (test_sample, list(uninteracted_items))))
    validation.append((user, (val_sample, list(uninteracted_items))))
    de_training.append((user, convert(train)))
    movie_training.append((user, convert(train_movies_only)))

# 'with' split: training contains every remaining rating of each user.
with open('new_data/with/0.json', 'w') as fp:
    json.dump({'training': de_training, 'testing': test, 'validation': validation}, fp)

# 'without' split: training contains only the movie ratings; same test/validation.
with open('new_data/without/0.json', 'w') as fp:
    json.dump({'training': movie_training, 'testing': test, 'validation': validation}, fp)

print('done')

done


In [57]:
# Dump meta: persist the entity-URI -> index mapping next to the split files.
meta = {'e_idx_map': entity_idx}
with open('new_data/meta.json', 'w') as meta_file:
    json.dump(meta, meta_file)
