In [2]:
import json
import os
import pandas as pd
import numpy as np

def write_csv():
    """Write the module-level ``ratings`` DataFrame to ``movielens.csv``.

    The file is created in the current working directory and is
    overwritten if it already exists. The DataFrame index is written as
    the first column, which is the pandas default.
    """
    # Hand the path to pandas instead of materialising the whole CSV as a
    # single string and pushing it through a text-mode handle: pandas
    # manages the file itself, so line terminators are not translated a
    # second time on Windows, and the explicit UTF-8 encoding keeps
    # non-ASCII movie titles independent of the locale's default codec.
    ratings.to_csv('movielens.csv', encoding='utf-8')

# Load entities
# entity_labels maps each entity URI to the set of its labels;
# entity_names maps each entity URI to its name.
entity_labels = dict()
entity_names = dict()

# JSON text is UTF-8 by specification, so state the encoding explicitly
# instead of relying on the platform's locale default.
with open('../data/mindreader/entities_clean.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Each record unpacks into (uri, name, labels), where labels is a single
# '|'-separated string. The file handle is already closed at this point.
for uri, name, labels in data:
    entity_labels[uri] = set(labels.split('|'))
    entity_names[uri] = name

# Locate the MovieLens data directory
DATA_PATH = '../data'
ml_path = os.path.join(DATA_PATH, 'movielens')

# Read the four MovieLens tables from CSV, in the order
# movies, ratings, links, mapping
movies, ratings, links, mapping = (
    pd.read_csv(f'{ml_path}/{table}.csv')
    for table in ('movies', 'ratings', 'links', 'mapping')
)

# Keep only the mapping rows whose URI is an entity loaded from the KG
uris = set(entity_names)
mapping = mapping.loc[mapping['uri'].isin(uris)]

# Attach the link ids to the movies, rewrite imdbId as 'tt' followed by the
# id left-padded with zeros to at least 7 characters, then join the mapping
# on that id
movies = movies.merge(links, on='movieId')
movies['imdbId'] = movies['imdbId'].map(
    lambda raw_id: f'tt{str(raw_id).zfill(7)}'
)
movies = movies.merge(mapping, on='imdbId')

# Join the ratings with the mapped movies; merge defaults to an inner join,
# so ratings for movies that did not survive the mapping are dropped
ratings_before = len(ratings)
ratings = ratings.merge(movies, on='movieId').assign(isItem=True)
print(f'Ratings removed: {ratings_before - len(ratings)}')

# Discard the columns that are no longer needed and order rows by user
ratings = (
    ratings
    .drop(columns=["movieId", "genres", "tmdbId", "imdbId", "timestamp"])
    .sort_values(by='userId')
)

print(ratings)

write_csv()

Ratings removed: 7751
       userId  rating                                           title  \
0           1     2.5                          Dangerous Minds (1995)   
442         1     2.5            Star Trek: The Motion Picture (1979)   
259         1     2.0                                  Ben-Hur (1959)   
165         1     4.0  Cinema Paradiso (Nuovo cinema Paradiso) (1989)   
305         1     2.0                                   Gandhi (1982)   
...       ...     ...                                             ...   
56230     671     4.0               O Brother, Where Art Thou? (2000)   
56267     671     4.0                            Thirteen Days (2000)   
27737     671     4.5                           Ocean's Eleven (2001)   
54017     671     4.0                                      JFK (1991)   
30767     671     4.0                              Chicken Run (2000)   

                                          uri  isItem  
0      http://www.wikidata.org/entity/Q579359