In [1]:
import json
import math
import os

import numpy as np
import pandas as pd

In [2]:
class DatasetLoader:
    """Abstract base for dataset loaders; subclasses provide `load`."""

    def load(self):
        """Load the dataset.

        Minimum conditions the loaded dataset must satisfy:
          * every user has at least one item record;
          * every item has at least one user record.
        """
        raise NotImplementedError()

class MovieLens1M(DatasetLoader):
    """Loader for the `ratings.dat` file located in a given data directory."""

    def __init__(self, data_dir):
        # Resolve the ratings file location once, at construction time.
        self.fpath = os.path.join(data_dir, 'ratings.dat')

    def load(self):
        """Read the ratings file into a DataFrame.

        The returned frame has the columns `user`, `item`, `rate` and `time`,
        in that order.
        """
        column_names = ['user', 'item', 'rate', 'time']
        # The two-character '::' separator is parsed with pandas' Python engine.
        ratings = pd.read_csv(self.fpath, sep='::', engine='python', names=column_names)
        # TODO: Remove negative rating?
        # ratings = ratings[ratings['rate'] >= 3]
        return ratings
    
def convert_unique_idx(df, column_name):
    """Re-encode a column as consecutive integer indices starting at 0.

    Distinct values are numbered in order of first appearance. The column is
    overwritten in `df` itself (the caller's frame is modified), and the same
    frame is returned together with the `{original value: index}` mapping.
    """
    distinct_values = df[column_name].unique()
    column_dict = dict(zip(distinct_values, range(len(distinct_values))))
    df[column_name] = df[column_name].map(column_dict.get).astype('int')
    # Sanity checks: the new codes must span exactly 0 .. n_distinct - 1.
    encoded = df[column_name]
    assert encoded.min() == 0
    assert encoded.max() == len(column_dict) - 1
    return df, column_dict

def create_user_list(df, user_size):
    """Group interactions per user as lists of `(time, item)` tuples.

    Returns a list of length `user_size`; entry `u` collects the
    `(time, item)` pair of every row whose `user` value is `u`, in row order.
    Users without any row keep an empty list.
    """
    per_user = [[] for _ in range(user_size)]
    for record in df.itertuples(index=False):
        per_user[record.user].append((record.time, record.item))
    return per_user

def split_train_test(df, user_size, test_size=0.2, time_order=False, random_state=None):
    """Split interactions into per-user train and test item lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction table. It is passed to `create_user_list`, which reads
        its `user`, `item` and `time` columns.
    user_size : int
        Number of users; both returned lists have this length.
    test_size : float, default 0.2
        Fraction of interactions held out for testing, within [0, 1].
    time_order : bool, default False
        If False, test rows are drawn at random over the whole table.
        If True, each user's interactions are sorted by time and the latest
        ones are held out, so every user with at least one interaction keeps
        at least one training item.
    random_state : int or None, default None
        Seed for the random split. None draws from NumPy's global random
        state, so a prior `np.random.seed(...)` call still applies.

    Returns
    -------
    (train_user_list, test_user_list)
        Two lists indexed by user; each entry is that user's list of items.

    Raises
    ------
    ValueError
        If `test_size` is outside [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError("test_size must be within [0, 1], got %r" % (test_size,))

    # TODO: Handle duplicated items
    if not time_order:
        n_rows = len(df)
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        # replace=False: sampling with replacement would pick the same row
        # several times, duplicating test interactions and shrinking the
        # held-out set below the requested fraction.
        test_positions = rng.choice(n_rows, size=int(n_rows * test_size), replace=False)
        is_test = np.zeros(n_rows, dtype=bool)
        is_test[test_positions] = True
        # A positional boolean mask works whatever the frame's index labels
        # are, and keeps the rows of both parts in their original order.
        test_df = df[is_test].reset_index(drop=True)
        train_df = df[~is_test].reset_index(drop=True)
        test_user_list = create_user_list(test_df, user_size)
        train_user_list = create_user_list(train_df, user_size)
    else:
        total_user_list = create_user_list(df, user_size)
        train_user_list = [None] * len(total_user_list)
        test_user_list = [None] * len(total_user_list)
        for user, pairs in enumerate(total_user_list):
            # Oldest first, so the tail of the list holds the latest items.
            chronological = sorted(pairs, key=lambda pair: pair[0])
            cut = math.ceil(len(chronological) * (1 - test_size))
            train_user_list[user] = chronological[:cut]
            test_user_list[user] = chronological[cut:]

    # Remove time: keep only the item of each (time, item) pair.
    test_user_list = [[item for _, item in pairs] for pairs in test_user_list]
    train_user_list = [[item for _, item in pairs] for pairs in train_user_list]
    return train_user_list, test_user_list

In [3]:
# Point the ratings loader at the MovieLens-1M data directory (relative path).
ml1m = MovieLens1M(data_dir='../data/ml-1m/')

In [30]:
# Read the users table; column names are supplied explicitly via `names`.
data = pd.read_csv(
    "../data/ml-1m/users.dat",
    sep='::',
    engine='python',
    names=['id', 'gender', 'age', 'job', 'zipcode'],
)

In [31]:
# Display the users table read in the previous cell as this cell's output.
data

Unnamed: 0,id,gender,age,job,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [4]:
# Load the ratings, then re-encode users and items as consecutive indices.
# Each mapping goes from the original id to its new index.
df, user_mapping = convert_unique_idx(ml1m.load(), 'user')
df, item_mapping = convert_unique_idx(df, 'item')

In [21]:
# Invert the user mapping: stringified index -> stringified original user id.
idx2uid = {str(index): str(user_id) for user_id, index in user_mapping.items()}

In [22]:
# Persist the index -> original user id mapping as JSON.
# The context manager closes the file even if serialisation fails, and
# writing straight to the file avoids rebinding `data`, which holds the
# users table read in an earlier cell.
with open("idx2uid.json", "w") as f:
    json.dump(idx2uid, f)

In [19]:
# Invert the item mapping: stringified index -> stringified original item id.
idx2iid = {str(index): str(item_id) for item_id, index in item_mapping.items()}

In [20]:
# Persist the index -> original item id mapping as JSON.
# The context manager closes the file even if serialisation fails, and
# writing straight to the file avoids rebinding `data`, which holds the
# users table read in an earlier cell.
with open("idx2iid.json", "w") as f:
    json.dump(idx2iid, f)

In [56]:
# Size the per-user list from the data rather than a hard-coded user count:
# after re-indexing, user values run from 0 to (number of distinct users - 1).
interaction_list = create_user_list(df, df['user'].nunique())

In [58]:
# Number of recorded interactions for each user, in user-index order.
inter_count = [len(user_interactions) for user_interactions in interaction_list]

In [47]:
# Count the distinct users and items present in the interaction table.
user_size, item_size = (len(df[column].unique()) for column in ('user', 'item'))

In [48]:
# Seed NumPy's global random state so the random split (and therefore the
# train/test files written from it) is the same on every run.
np.random.seed(42)
train_user_list, test_user_list = split_train_test(df, user_size, test_size=0.2, time_order=False)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [34]:
# Write one line per user: the user index followed by that user's train
# items, separated by single spaces. The `with` block closes the file, so no
# explicit close() is needed.
with open("../data/ml-1m/train.txt", "w") as f:
    for user_idx, items in enumerate(train_user_list):
        f.write(" ".join(map(str, [user_idx, *items])) + "\n")

In [35]:
# Write one line per user: the user index followed by that user's test
# items, separated by single spaces. The `with` block closes the file, so no
# explicit close() is needed.
with open("../data/ml-1m/test.txt", "w") as f:
    for user_idx, items in enumerate(test_user_list):
        f.write(" ".join(map(str, [user_idx, *items])) + "\n")