In [1]:
import os
import zipfile
import csv

import requests


def _download(url: str, dest_path: str):

    req = requests.get(url, stream=True)
    req.raise_for_status()

    with open(dest_path, "wb") as fd:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)


def get_data():

    ratings_url = ("http://www2.informatik.uni-freiburg.de/" "~cziegler/BX/BX-CSV-Dump.zip")

    if not os.path.exists("data"):
        os.makedirs("data")

        _download(ratings_url, "data/data.zip")

    with zipfile.ZipFile("data/data.zip") as archive:
        return (
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Book-Ratings.csv")),
                delimiter=";",
            ),
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Books.csv")), delimiter=";"
            ),
        )


def get_ratings():

    return get_data()[0]


def get_book_features():

    return get_data()[1]

In [2]:
import json
from itertools import islice

ratings, book_features = get_data()

In [3]:
for line in islice(ratings, 2):
    print(json.dumps(line, indent=4))

{
    "User-ID": "276725",
    "ISBN": "034545104X",
    "Book-Rating": "0"
}
{
    "User-ID": "276726",
    "ISBN": "0155061224",
    "Book-Rating": "5"
}


In [4]:
for line in islice(book_features, 1):
    print(json.dumps(line, indent=4))

{
    "ISBN": "0195153448",
    "Book-Title": "Classical Mythology",
    "Book-Author": "Mark P. O. Morford",
    "Year-Of-Publication": "2002",
    "Publisher": "Oxford University Press",
    "Image-URL-S": "http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg",
    "Image-URL-M": "http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg",
    "Image-URL-L": "http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg"
}


In [20]:
from lightfm.data import Dataset

dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))

In [23]:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 105283, num_items 341762.


In [22]:
dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author'] for x in get_book_features()))

In [8]:
tmp=[x['Book-Author'] for x in get_book_features()]


['Richard Bruce Wright', "Carlo D'Este", 'Gina Bari Kolata', 'E. J. W. Barber']

In [12]:
print(len(tmp))
len(set(tmp))

271379


102043

In [18]:
tmp = [x['ISBN'] for x in get_book_features()]
print(len(tmp))

271379


In [15]:
# look at item feature mapping
a = dataset.mapping()[3]
a

{'034545104X': 0,
 '0155061224': 1,
 '0446520802': 2,
 '052165615X': 3,
 '0521795028': 4,
 '2080674722': 5,
 '3257224281': 6,
 '0600570967': 7,
 '038550120X': 8,
 '342310538': 9,
 '0425115801': 10,
 '0449006522': 11,
 '0553561618': 12,
 '055356451X': 13,
 '0786013990': 14,
 '0786014512': 15,
 '0060517794': 16,
 '0451192001': 17,
 '0609801279': 18,
 '0671537458': 19,
 '0679776818': 20,
 '0943066433': 21,
 '1570231028': 22,
 '1885408226': 23,
 '0747558167': 24,
 '3442437407': 25,
 '033390804X': 26,
 '3596218098': 27,
 '0684867621': 28,
 '0451166892': 29,
 '8440682697': 30,
 '034544003X': 31,
 '0380000059': 32,
 '0380711524': 33,
 '0451167317': 34,
 '0451454952': 35,
 '0843920262': 36,
 '3404122879': 37,
 '3404182928': 38,
 '3404611306': 39,
 '342662429': 40,
 '3426690179': 41,
 '3442424216': 42,
 '3442425573': 43,
 '3453092007': 44,
 '3453157745': 45,
 '3453176944': 46,
 '3453185137': 47,
 '3453185323': 48,
 '3453213025': 49,
 '3453877241': 50,
 '3492226604': 51,
 '3517017442': 52,
 '359

In [46]:
print(len(a.keys()))
list(a.keys())[433448]


443805


'Philip Prowse'

In [13]:
(interactions, weights) = dataset.build_interactions(((x['User-ID'], x['ISBN'])
                                                      for x in get_ratings()))

print(repr(interactions))

<105283x341762 sparse matrix of type '<class 'numpy.int32'>'
	with 1149780 stored elements in COOrdinate format>


In [14]:
item_features = dataset.build_item_features(((x['ISBN'], [x['Book-Author']])
                                              for x in get_book_features()))
print(repr(item_features))

<341762x443805 sparse matrix of type '<class 'numpy.float32'>'
	with 613141 stored elements in Compressed Sparse Row format>


In [56]:
idx = 2
tt = list(item_features[idx].nonzero())
print(item_features[idx].shape)
tt

(1, 443805)


[array([0, 0], dtype=int32), array([     2, 341954], dtype=int32)]

In [54]:
tt = item_features[idx].todense()

In [55]:
import pandas as pd
tt = pd.DataFrame(tt)
list(map(set,tt.values))# feature weight sum to 1, so if two are nonzero, each take weight 0.5. 

[{0.0, 0.5}]

0.0