In [1]:
# We already have training and testing data for the model;
# we still need to create testing data for the metrics.

import os
import pandas as pd
import numpy as np
import random

# Pin the stdlib RNG so the sampling done with `random` is repeatable on a
# fresh kernel (Restart & Run All).
SEED = 42
random.seed(SEED)

# Directory the rating/user/item files are read from
# (presumably the MovieLens 100K dump -- confirm against the data source).
dir = "ml-100k"

In [2]:
class Dataset:
    """Implicit-feedback view of the rating files stored in a dataset directory.

    Attributes:
        dataframe: ratings joined with user and item attributes
            (see ``load_dataframe``).
        user_item: ``{user id: set of item ids the user rated}``.
        item_user: ``{item id: set of user ids that rated the item}``.
        train: list of ``(user, item, label)`` triples; label is 1 for an
            observed interaction and 0 for a sampled negative.
        test: ``{user id: [held_out_item, [100 sampled negative items]]}``.

    ``user_item`` / ``item_user`` stay empty until ``gen_adjacency`` is called,
    and ``train`` / ``test`` stay empty until ``make_train_test`` is called.
    """

    def __init__(self, dir):
        """Load the joined ratings frame from ``dir`` and set up empty containers."""
        self.dataframe = self.load_dataframe(dir)

        self.user_item = {}
        self.item_user = {}
        # Cached tuple of all item ids; refreshed by gen_adjacency so that
        # sample_negative does not rebuild it on every draw.
        self._item_pool = ()

        self.train = []
        self.test = {}

    def load_dataframe(self, dir):
        """Read ``u.data``, ``u.item`` and ``u.user`` from ``dir`` and join them.

        Args:
            dir: directory containing the three files.

        Returns:
            A DataFrame with one row per rating (columns ``user``, ``item``,
            ``score``, ``timestamp``) extended with the matching user columns
            (joined on ``user`` == ``user id``) and item columns (joined on
            ``item`` == ``movie id``).
        """
        data = pd.read_csv(os.path.join(dir, "u.data"), delimiter="\t", header=None)
        data.columns = ["user", "item", "score", "timestamp"]

        item = pd.read_csv(os.path.join(dir, "u.item"), delimiter="|", encoding = "ISO-8859-1", header=None)
        item.columns = ['movie id','movie title','release date','video release date','IMDb URL','unknown','Action','Adventure','Animation',"Children's",'Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']

        user = pd.read_csv(os.path.join(dir, "u.user"), delimiter="|", header=None)
        user.columns = ['user id', 'age', 'gender', 'occupation', 'zip code']

        data = data.join(user.set_index('user id'), on='user').join(item.set_index('movie id'), on='item')
        return data

    def gen_adjacency(self):
        """Populate ``user_item`` and ``item_user`` from ``self.dataframe``.

        Every ``(user, item)`` row adds the item to the user's set and the
        user to the item's set. Safe to call repeatedly: the containers are
        sets, so re-adding the same pairs changes nothing.
        """
        for u, i in self.dataframe[['user', 'item']].to_numpy():
            self.user_item.setdefault(u, set()).add(i)
            # Keyed by the ITEM id: sample_negative draws candidates from
            # these keys, so they must be the item catalogue.
            self.item_user.setdefault(i, set()).add(u)

        self._item_pool = tuple(self.item_user)

    def sample_positive(self, userid):
        """Return one item chosen uniformly at random from the user's rated items.

        Raises:
            KeyError: if ``userid`` is not in ``user_item``.
        """
        return random.choice(tuple(self.user_item[userid]))

    def sample_negative(self, userid):
        """Return a random item id that ``userid`` has not interacted with.

        Candidates are the keys of ``item_user`` (all known items); draws are
        repeated until one falls outside the user's rated set.

        Raises:
            KeyError: if ``userid`` is not in ``user_item``.
            ValueError: if the user has interacted with every known item, in
                which case rejection sampling could never terminate.
        """
        seen = self.user_item[userid]
        # Fall back to the live keys if item_user was filled without gen_adjacency.
        pool = self._item_pool or tuple(self.item_user)
        # seen is a subset of pool when both maps come from gen_adjacency, so
        # equal sizes mean there is nothing left to sample.
        if len(seen) >= len(pool):
            raise ValueError(f"user {userid} has no un-interacted items to sample from")

        while True:
            candidate = random.choice(pool)
            if candidate not in seen:
                return candidate

    def make_train_test(self, neg_count=4):
        """Build a leave-one-out split with sampled negatives.

        For every user one rated item is held out at random. ``train``
        receives a ``(user, item, 1)`` triple for each remaining rated item
        plus ``neg_count`` ``(user, negative_item, 0)`` triples per rated item.
        ``test[user]`` becomes ``[held_out_item, negatives]`` with 100 sampled
        negatives (drawn with replacement, so duplicates are possible).

        The split is rebuilt from scratch on each call; appending to an
        earlier split would duplicate rows and could put a newly held-out
        item into the training data.

        Args:
            neg_count: number of negatives sampled per rated item.
        """
        self.train = []
        self.test = {}

        for u, items in self.user_item.items():
            leave_out = self.sample_positive(u)
            self.train.extend((u, i, 1) for i in items if i != leave_out)

            # NOTE(review): negatives are counted per rated item INCLUDING the
            # held-out one (original behaviour, kept so dataset sizes do not
            # change) -- confirm this ratio is intended.
            for _ in range(neg_count * len(items)):
                self.train.append((u, self.sample_negative(u), 0))

            self.test[u] = [leave_out, [self.sample_negative(u) for _ in range(100)]]

In [3]:
# Constructing the Dataset reads u.data / u.item / u.user from `dir` and joins
# them into a single DataFrame (one row per rating).
dataset = Dataset(dir)
# Rich-display the joined frame as a visual sanity check of the load and joins.
display(dataset.dataframe)

Unnamed: 0,user,item,score,timestamp,age,gender,occupation,zip code,movie title,release date,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,49,M,writer,55105,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,39,F,executive,00000,L.A. Confidential (1997),01-Jan-1997,...,0,1,0,0,1,0,0,1,0,0
2,22,377,1,878887116,25,M,writer,40206,Heavyweights (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0
3,244,51,2,880606923,28,M,technician,80525,Legends of the Fall (1994),01-Jan-1994,...,0,0,0,0,0,1,0,0,1,1
4,166,346,1,886397596,47,M,educator,55113,Jackie Brown (1997),01-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,880,476,3,880175444,13,M,student,83702,"First Wives Club, The (1996)",14-Sep-1996,...,0,0,0,0,0,0,0,0,0,0
99996,716,204,5,879795543,36,F,administrator,44265,Back to the Future (1985),01-Jan-1985,...,0,0,0,0,0,0,1,0,0,0
99997,276,1090,1,874795795,21,M,student,95064,Sliver (1993),01-Jan-1993,...,0,0,0,0,0,0,0,1,0,0
99998,13,225,2,882399156,47,M,educator,29206,101 Dalmatians (1996),27-Nov-1996,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Order matters: the adjacency maps must exist before the split is derived
# from them (make_train_test iterates over dataset.user_item).
dataset.gen_adjacency()
dataset.make_train_test()
# Cell output: (number of (user, item, label) training triples,
#               number of users that have a held-out test entry).
len(dataset.train), len(dataset.test)

(499057, 943)