In [80]:
# from flurs.datasets import fetch_movielens
from flurs.recommender import FMRecommender
from flurs.evaluator import Evaluator
from flurs.data.entity import User, Item, Event
import os
import time
import numpy as np
from calendar import monthrange
from datetime import datetime, timedelta
import csv

from sklearn.utils import Bunch

import os
import sys
import pandas as pd

In [3]:
# converting data into FluRS input object
data = fetch_movielens(data_home='data/ml-1m', size='1m')

# logging.info('initialize recommendation model and evaluation module')
rec = FMRecommender(p=sum(data.contexts.values()),  # number of dimensions of input vector
                    k=40,
                    l2_reg_w0=2.,
                    l2_reg_w=8.,
                    l2_reg_V=16.,
                    learn_rate=.004)
rec.initialize()
evaluator = Evaluator(rec, data.can_repeat)

n_batch_train = int(data.n_sample * 0.2)  # 20% for pre-training to avoid cold-start
n_batch_test = int(data.n_sample * 0.1)  # 10% for evaluation of pre-training
batch_tail = n_batch_train + n_batch_test

# pre-train
# 20% for batch training | 10% for batch evaluate
# after the batch training, 10% samples are used for incremental updating
# logging.info('batch pre-training before streaming input')
evaluator.fit(
    data.samples[:n_batch_train],
    data.samples[n_batch_train:batch_tail],
    n_epoch=1  # single pass even for batch training
)

# 70% incremental evaluation and updating
# logging.info('incrementally predict, evaluate and update the recommender')
res = evaluator.evaluate(data.samples[batch_tail:])

print(res)

<generator object Evaluator.evaluate at 0x7f63e31ccc78>


In [10]:
num_users = len(rec.users.keys())

In [48]:
def load_movies(data_home, size):
    """Load movie genres as a context.
    Returns:
        dict of movie vectors: item_id -> numpy array (n_genre,)
    """
    all_genres = ['Action',
                  'Adventure',
                  'Animation',
                  "Children",
                  'Comedy',
                  'Crime',
                  'Documentary',
                  'Drama',
                  'Fantasy',
                  'Film-Noir',
                  'Horror',
                  'Musical',
                  'Mystery',
                  'Romance',
                  'Sci-Fi',
                  'Thriller',
                  'War',
                  'Western',
                 'IMAX',
                 '(no genres listed)']
    n_genre = len(all_genres)

    movies = {}

    if size == 'latest-small':
        with open(os.path.join(data_home, 'movies.csv'), encoding='ISO-8859-1') as f:
            csv_reader = csv.DictReader(f)
            for row in csv_reader:
                item_id_str = row['movieId']
                title = row['title']
                genres = row['genres']

                movie_vec = np.zeros(n_genre)
                for genre in genres.split('|'):
                    i = all_genres.index(genre)
                    movie_vec[i] = 1.
                item_id = int(item_id_str)
                movies[item_id] = movie_vec
    else:
        print('ERROR')

    return movies

In [95]:
%%time
_ = load_movies(data_home=DATA_DIR, size='latest-small')

CPU times: user 269 ms, sys: 24 ms, total: 293 ms
Wall time: 290 ms


In [54]:
df = pd.read_csv(os.path.join(DATA_DIR, 'ratings.csv'), encoding='ISO-8859-1')

In [62]:
df['rating'] = [int(x) for x in df['rating']]

In [66]:
df.values

array([[         1,        307,          3, 1256677221],
       [         1,        481,          3, 1256677456],
       [         1,       1091,          1, 1256677471],
       ...,
       [    283228,      34405,          4, 1379882889],
       [    283228,      44761,          4, 1354159524],
       [    283228,      54286,          4, 1354159718]])

In [85]:
def load_ratings(data_home, size):
    df = pd.read_csv(os.path.join(data_home, 'ratings.csv'), encoding='ISO-8859-1')
    df['rating'] = [int(x) for x in df['rating']]
    users = sorted(list(set(df['userId'])))
    sorted_data = df.values[np.argsort(df.values[:, 3])]
    return (users, sorted_data)

In [104]:
_, data = load_ratings(DATA_DIR, size='latest-small')

In [105]:
data

array([[     56769,       1176,          4,  789652004],
       [    237556,       1079,          3,  789652009],
       [    237556,         47,          5,  789652009],
       ...,
       [    280481,        494,          3, 1537945127],
       [     82922,      53519,          4, 1537945130],
       [     82922,     167780,          4, 1537945149]])

In [82]:
def delta(d1, d2, opt='d'):
    delta = 0

    if opt == 'm':
        while True:
            mdays = monthrange(d1.year, d1.month)[1]
            d1 += timedelta(days=mdays)
            if d1 <= d2:
                delta += 1
            else:
                break
    else:
        delta = (d2 - d1).days

    return delta


In [99]:
all_genres = ['Action',
                  'Adventure',
                  'Animation',
                  "Children",
                  'Comedy',
                  'Crime',
                  'Documentary',
                  'Drama',
                  'Fantasy',
                  'Film-Noir',
                  'Horror',
                  'Musical',
                  'Mystery',
                  'Romance',
                  'Sci-Fi',
                  'Thriller',
                  'War',
                  'Western',
                 'IMAX',
                 '(no genres listed)']

In [101]:
len(all_genres)

20

In [109]:
from tqdm import tqdm

In [119]:
# print('Getting ratings...')
# users, ratings = load_ratings(DATA_DIR, size='latest-small')
# print('Getting movies...')
# movies = load_movies(DATA_DIR, size='latest-small')

Getting ratings...
Getting movies...


In [127]:
def fetch_movielens(data_home=None, size='latest-small', metadata = {}):
    assert data_home is not None

    if size not in ('100k', '1m', 'latest-small'):
        raise ValueError("size can only be '100k' or '1m', got %s" % size)

    print('Getting ratings...')
    users, ratings = load_ratings(data_home, size)
    print('Getting movies...')
    movies = load_movies(data_home, size)

    samples = []

    user_ids = {}
    item_ids = {}

    head_date = datetime(*time.localtime(ratings[0, 3])[:6])
    dts = []

    last = {}

    cnt = 0
    print('Processing ratings...')
    for user_id, item_id, rating, timestamp in tqdm(ratings):
        # give an unique user index
        if user_id in user_ids:
            u_index = user_ids[user_id]
        else:
            u_index = len(user_ids)
            user_ids[user_id] = u_index

        # give an unique item index
        if item_id in item_ids:
            i_index = item_ids[item_id]
        else:
            i_index = len(item_ids)
            item_ids[item_id] = i_index

        # delta days
        date = datetime(*time.localtime(timestamp)[:6])
        dt = delta(head_date, date)
        dts.append(dt)

        weekday_vec = np.zeros(7)
        weekday_vec[date.weekday()] = 1

        if user_id in last:
            last_item_vec = last[user_id]['item']
#             last_weekday_vec = last[user_id]['weekday']
        else:
            last_item_vec = np.zeros(20)
#             last_weekday_vec = np.zeros(7)

        others = np.concatenate((weekday_vec, last_item_vec))
#         print('others.shape: ', others.shape)

        user = User(u_index, np.zeros(1))
        item = Item(i_index, movies[item_id])

        sample = Event(user, item, 1., others)
        samples.append(sample)

        # record users' last rated movie features
        last[user_id] = {'item': movies[item_id] }
        cnt += 1
        if cnt > 20000:
            break
    metadata['userids'] = user_ids
    metadata['itemids'] = item_ids
    print('Done loading!')

    # contexts in this dataset
    # 1 delta time, 18 genres, and 23 demographics (1 for M/F, 1 for age, 21 for occupation(0-20))
    # 7 for day of week, 18 for the last rated item genres, 7 for the last day of week
    return Bunch(samples=samples,
                 can_repeat=False,
                 contexts={'others': 7 + 20, 'item': 20, 'user': 1},
                 n_user=len(user_ids),
                 n_item=len(item_ids),
                 n_sample=len(samples))

In [128]:
data = fetch_movielens(data_home=DATA_DIR, size='latest-small')



  0%|          | 0/27753444 [00:00<?, ?it/s][A
  0%|          | 8254/27753444 [00:00<05:36, 82534.77it/s][A


Processing ratings...


  0%|          | 16962/27753444 [00:00<05:30, 83844.34it/s][A
[A

Done loading!


In [114]:
movies = load_movies(DATA_DIR, size='latest-small')

In [116]:
movies[1].shape

(20,)

In [112]:
data.contexts

{'others': 27, 'item': 20}

In [111]:
data.keys()

dict_keys(['samples', 'can_repeat', 'contexts', 'n_user', 'n_item', 'n_sample'])

In [129]:
# converting data into FluRS input object
# data = fetch_movielens(data_home=DATA_DIR, size='latest-small')

# logging.info('initialize recommendation model and evaluation module')
rec = FMRecommender(p=sum(data.contexts.values()),  # number of dimensions of input vector
                    k=40,
                    l2_reg_w0=2.,
                    l2_reg_w=8.,
                    l2_reg_V=16.,
                    learn_rate=.004)
rec.initialize()
evaluator = Evaluator(rec, data.can_repeat)

n_batch_train = int(data.n_sample * 0.2)  # 20% for pre-training to avoid cold-start
n_batch_test = int(data.n_sample * 0.1)  # 10% for evaluation of pre-training
batch_tail = n_batch_train + n_batch_test

# pre-train
# 20% for batch training | 10% for batch evaluate
# after the batch training, 10% samples are used for incremental updating
# logging.info('batch pre-training before streaming input')
evaluator.fit(
    data.samples[:n_batch_train],
    data.samples[n_batch_train:batch_tail],
    n_epoch=1  # single pass even for batch training
)

# 70% incremental evaluation and updating
# logging.info('incrementally predict, evaluate and update the recommender')
res = evaluator.evaluate(data.samples[batch_tail:])

print(res)

<generator object Evaluator.evaluate at 0x7f61c8738228>


In [None]:
dir(res)

In [145]:
for r in res:
    print(r)
    break

(0.29070307510022064, 379, 0.006395000000338769, 0.025908000000526954)


In [93]:
%%time
_, _ = load_ratings(DATA_DIR, size='latest-small')

CPU times: user 25.5 s, sys: 3.6 s, total: 29.1 s
Wall time: 20.4 s


In [None]:
print('hi')

In [27]:
metadata = {}

In [40]:
DATA_DIR= 'data/ml-latest'

In [28]:
data = fetch_movielens(data_home=DATA_DIR, size='1m', metadata=metadata)

In [33]:
with open(os.path.join(DATA_DIR, 'users.dat'), encoding='ISO-8859-1') as f:
    lines = map(lambda l: l.rstrip().split('::'), f.readlines())

In [None]:
lines

In [None]:
data