In [3]:
import sys
import os


from lightfm import LightFM
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder

In [4]:
import os

import pandas as pd


def _read_recs(path: str):
    data = pd.read_csv(path, delimiter='\t', header=None)
    data.columns = ["user_id", "item_id", "rating", "timestamp"]
    return data

def _read_users(path: str):
    users = pd.read_csv(path, delimiter='|', header=None)
    users.columns = ["user_id", "age", "gender", "occupation", "zip_code"]
    return users

def _read_movies(path: str):
    movies = pd.read_csv(path, delimiter='|', header=None, encoding='latin-1')
    movies.columns = ["movie_id", "title", "release_date", "video_release_date", "IMBD_url"] + [f"genre_{x}" for x in range(19)]
    movies = movies.drop("video_release_date", axis=1)
    movies = movies.fillna(method='ffill')
    movies.release_date = pd.to_datetime(movies.release_date)
    return movies

def _read_genre(path: str):
    genres = pd.read_csv(path, delimiter='|', header=None)
    genres.columns = ["genre", "id"]
    return genres

def _read_occupation(path):
    occupation = pd.read_csv(path, delimiter='|', header=None)
    return occupation

def load_part(dataset_path: str, part: int):
    """Load the train and test ratings of one numbered split.

    Args:
        dataset_path: Directory containing the ``u<part>.base`` and
            ``u<part>.test`` files.
        part: Number of the split to load.

    Returns:
        A ``(train, test)`` tuple of rating DataFrames.
    """
    split_files = (f"u{part}.base", f"u{part}.test")
    # The base file is read first, then the test file.
    train_part, test_part = (
        _read_recs(os.path.join(dataset_path, file_name))
        for file_name in split_files
    )
    return train_part, test_part
    
def load_datasets(dataset_path: str):
    """Load the user table, the movie table and the five rating splits.

    Args:
        dataset_path: Directory containing ``u.user``, ``u.item`` and the
            ``u1`` .. ``u5`` base/test files.

    Returns:
        A ``(users, items, parts)`` tuple, where ``parts`` is a list of five
        ``(train, test)`` DataFrame pairs in split order 1 to 5.
    """
    users = _read_users(os.path.join(dataset_path, "u.user"))
    items = _read_movies(os.path.join(dataset_path, "u.item"))
    parts = [load_part(dataset_path, split_no) for split_no in range(1, 6)]
    return users, items, parts

In [5]:
# Load the user table, the movie table and the five (train, test) rating
# splits from the dataset directory, given here as a relative path.
users, items, parts = load_datasets("../data/raw/ml-100k")

  movies = movies.fillna(method='ffill')


In [6]:
# Explicit category list handed to the encoder: a single feature with 21
# possible occupation labels.
categories_occupation = [
    ['administrator', 'artist', 'doctor', 'educator', 'engineer',
     'entertainment', 'executive', 'healthcare', 'homemaker', 'lawyer',
     'librarian', 'marketing', 'none', 'other', 'programmer', 'retired',
     'salesman', 'scientist', 'student', 'technician', 'writer']
]
# Request a dense array from the encoder. scikit-learn 1.2 renamed `sparse` to
# `sparse_output` and 1.4 removed the old keyword, so try the current name
# first and fall back to the old one on older releases.
try:
    encoder = OneHotEncoder(categories=categories_occupation, sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(categories=categories_occupation, sparse=False)


def get_user_item_sparse(interactions: pd.DataFrame, threshold=0):
    """Turn a ratings frame into a sparse user x item matrix.

    Only rows whose ``rating`` is strictly greater than ``threshold`` are
    kept. One-based ``user_id`` / ``item_id`` values become zero-based row and
    column positions, and each stored value is the rating divided by 5.

    Args:
        interactions: Frame with ``user_id``, ``item_id`` and ``rating`` columns.
        threshold: Ratings at or below this value are discarded.

    Returns:
        A ``csr_matrix`` of fixed shape ``(943, 1682)``.
    """
    kept = interactions.loc[interactions['rating'] > threshold]

    row_idx = (kept['user_id'] - 1).to_numpy()
    col_idx = (kept['item_id'] - 1).to_numpy()
    scaled_ratings = (kept['rating'] / 5).to_numpy()

    return csr_matrix((scaled_ratings, (row_idx, col_idx)), shape=(943, 1682))

def get_user_features_sparse(users: pd.DataFrame):
    """Build the sparse user-feature matrix from the user table.

    Per user the features are: ``age`` divided by 73, a ``gender`` flag
    (1.0 for ``'M'``, 0.0 otherwise) and one indicator column per occupation
    value present in ``users``. ``zip_code`` is discarded.

    Rows are ordered by zero-based user id (``user_id - 1``), so row ``i``
    describes the user with ``user_id == i + 1`` whatever the row order of
    the input frame.

    Args:
        users: Frame with ``user_id``, ``age``, ``gender``, ``occupation``
            and ``zip_code`` columns.

    Returns:
        A float32 ``csr_matrix`` declared with shape ``(943, 23)``.
    """
    features = users.drop('zip_code', axis=1)
    # Scale age by a fixed divisor (73 is presumably the maximum age in the
    # dataset; verify against the data).
    features['age'] = users['age'] / 73

    # Replace 'occupation' with one indicator column per distinct occupation.
    features = pd.get_dummies(features, columns=['occupation'])
    features['gender'] = features['gender'] == 'M'
    features['user_id'] = features['user_id'] - 1
    # csr_matrix ignores the index and takes rows positionally, so sort by the
    # zero-based id to guarantee that row i belongs to user i.
    features = features.set_index('user_id').sort_index()
    features = features.astype('float32')
    return csr_matrix(features, shape=(943, 23))

def get_item_features_sparse(items: pd.DataFrame):
    """Build the sparse item-feature matrix from the movie table.

    ``release_date``, ``title`` and ``IMBD_url`` are discarded; every
    remaining column other than ``movie_id`` becomes a feature column.

    Rows are ordered by zero-based movie id (``movie_id - 1``), so row ``i``
    describes the movie with ``movie_id == i + 1`` whatever the row order of
    the input frame.

    Args:
        items: Frame with ``movie_id``, ``title``, ``release_date``,
            ``IMBD_url`` and the genre columns.

    Returns:
        A ``csr_matrix`` declared with shape ``(1682, 19)``.
    """
    features = items.drop(["release_date", "title", "IMBD_url"], axis=1)
    features['movie_id'] = features['movie_id'] - 1
    # csr_matrix ignores the index and takes rows positionally, so sort by the
    # zero-based id to guarantee that row i belongs to movie i.
    features = features.set_index('movie_id').sort_index()
    return csr_matrix(features, shape=(1682, 19))
    
# Build both side-feature matrices once; the model cell further down passes
# them to fitting and to the evaluation calls.
user_features = get_user_features_sparse(users)
item_features = get_item_features_sparse(items)
# Last expression of the cell: show both shapes as the cell output.
user_features.shape, item_features.shape

((943, 23), (1682, 19))

In [7]:
# Both matrices come from the first split (parts[0]): element 0 is its train
# frame and element 1 its test frame. The train matrix keeps every rating
# above the default threshold of 0; the test matrix keeps only ratings above 2.5.
train_part1 = get_user_item_sparse(parts[0][0])
test_part2 = get_user_item_sparse(parts[0][1], threshold=2.5)
# Last expression of the cell: show both shapes as the cell output.
train_part1.shape, test_part2.shape

((943, 1682), (943, 1682))

In [36]:
from lightfm.evaluation import precision_at_k, recall_at_k

# Hybrid WARP model. Without a seed, the random initialisation differs on
# every run, so the metrics below are not reproducible. random_state pins the
# seed; multi-threaded fitting may still cause small run-to-run differences
# (verify by re-running).
model = LightFM(
    no_components=10,
    loss='warp',
    item_alpha=1e-4,
    learning_rate=0.01,
    max_sampled=40,
    random_state=42,
)
model.fit(
    train_part1,
    user_features=user_features,
    item_features=item_features,
    epochs=40,
    num_threads=4,
)

# Mean precision and recall over users, with k left at the library default.
# Passing train_interactions excludes pairs already seen in training from the
# ranked predictions.
precision = precision_at_k(
    model,
    test_part2,
    user_features=user_features,
    item_features=item_features,
    train_interactions=train_part1,
).mean()
# `recal` (sic): the name is kept in case later cells refer to it.
recal = recall_at_k(
    model,
    test_part2,
    user_features=user_features,
    item_features=item_features,
    train_interactions=train_part1,
).mean()
# Last expression of the cell: show both metrics as the cell output.
precision, recal

(0.1037037, 0.032220633730058415)

(0.069934644, 0.020823012309777823)