In [9]:
import json
import pandas as pd
import numpy as np

seed = 42
rand = np.random.RandomState(seed)
num_negatives = 100

def write_csv(file_name, frame):
    with open(file_name, 'w') as fp:
        fp.write(frame.to_csv())
        
def write_ratings(file, df):
    with open(file, 'w') as fp:
        for index, row in df.iterrows():
            fp.write(f'{row.userId}\t{row.uri}\t{1.0}\n')

def write_negatives(file, test_df, main_df):
    with open(file, 'w') as fp:
        for index, row in test_df.iterrows():
            seen_uris = set(main_df[main_df.userId == row.userId].uri)
            seen_uris.add(row.uri)
            
            negatives = main_df[(~main_df.uri.isin(seen_uris)) & main_df.isItem].uri
    
            sampled = '\t'.join([str(sample) for sample in rand.choice(negatives, num_negatives)])
            
            fp.write(f'({row.userId},{row.uri})\t{sampled}\n')

# mr = pd.read_csv('movielens.csv')
mr = pd.read_csv('../data/mindreader/ratings.csv')

# Load entities
entity_names = dict()
with open('../data/mindreader/entities_clean.json', 'r') as file:
    data = json.load(file)
    
    for uri, name, labels in data:
        entity_names[uri] = name

# Remove entities not in KG
mr = mr[mr.uri.isin(set(entity_names.keys()))]

# Remove unknown ratings
mr = mr[(mr.sentiment == 1) & mr.isItem]

# Remove URIs with 1 rating
df = mr[['userId', 'uri']].groupby('uri').count()
df.columns = ['uri_ratings']

mr = mr.merge(df[df['uri_ratings'] > 1], on='uri')

# Remove users with 1 item rating
df = mr[mr.isItem][['userId', 'uri']].groupby('userId').count()
df.columns = ['item_ratings']

mr = mr.merge(df[df['item_ratings'] > 1], on='userId')

for col in ('uri', 'userId'):
    mr[col] = mr[col].astype('category')

columns = mr.select_dtypes(['category']).columns
mr[columns] = mr[columns].apply(lambda x: x.cat.codes)

# Dump descriptive entity IDs, so they can be ignored for evaluation
json.dump([int(val) for val in mr[~mr.isItem].uri.unique()], open('entity_ids.json', 'w'))

test = mr.groupby('userId').apply(lambda df: df[df.isItem].sample(1, random_state=seed))
test_indices = set(test['Unnamed: 0'])
train = mr[~mr['Unnamed: 0'].isin(test_indices)]

write_ratings('ml-1m.train.rating', train)
write_ratings('ml-1m.test.rating', test)
write_negatives('ml-1m.test.negative', test, mr)

write_csv('../data/mindreader/mr_train.csv', train)
write_csv('../data/mindreader/mr_test.csv', test)

print(f'Wrote all files, {len(train)} training samples, {len(test)} test samples')

Wrote all files, 16449 training samples, 1592 test samples


In [62]:
from scipy import sparse
from random import sample

user_vectors = []

for user in train.userId.unique():
    user = int(user)
    user_vector = np.zeros(max(train.uri) + 1)
    for idx, rating in train[train.userId == user].iterrows():
        user_vector[rating.uri] = 1
    
    user_vectors.append(user_vector)

count, hits = 0, 0

df = train[['userId', 'uri']].groupby('uri').count()
df.columns = ['count']
top_k = list(df.sort_values(by='count', ascending=False).index)[:10]

for index, row in test.iterrows():
    if row.uri in top_k:
        hits += 1
    
    count += 1

print(f'{hits / count * 100}')

print(user_vectors)



8.013937282229964
[array([0., 0., 0., ..., 0., 0., 0.]), array([0., 1., 0., ..., 0., 0., 0.]), array([1., 0., 1., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([1., 1., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 1., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 1., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([1., 1., 1., ..., 0., 0., 0.]), array([0., 1., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 1., 0., ..., 0., 0.,