In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import csv
from collections import defaultdict
import json
from lightfm import LightFM
from lightfm.data import Dataset

# Default path to data files.
# Must keep its trailing slash: file names are appended with plain string
# concatenation (e.g. PATH + 'ml-100k/u.data'), not with a path-join helper.
PATH = "../data/"

In [6]:
# Load user-item interaction data
# (tab-separated file read with explicit column names; the timestamp column
# is read and then discarded, leaving user_id / item_id / rating)
interaction_data = pd.read_csv(
    PATH + 'ml-100k/u.data',
    sep='\t',
    encoding="latin1",
    names=['user_id', 'item_id', 'rating', 'timestamp'],
).drop(columns='timestamp')
display(interaction_data.shape)
interaction_data.head(5)

(100000, 3)

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [32]:
# Instantiate a defaultdict to hold user features:
# userId -> {feature_name: value}
user_data = defaultdict(dict)

# Read data and build user features dictionary
def load_feature(file_path, feature_name, id_column='userId', target=None):
    """Load one feature column from a CSV file into a nested mapping.

    Every row of the CSV contributes ``target[row[id_column]][feature_name]
    = row[feature_name]``; a later row for the same id overwrites the value.
    Ids and values are stored as the strings read by ``csv.DictReader``.

    Parameters
    ----------
    file_path : str
        Path of a UTF-8 CSV file with a header row.
    feature_name : str
        Header of the column holding the feature value; also used as the
        key under which the value is stored.
    id_column : str, default 'userId'
        Header of the column holding the entity id. The default keeps the
        original user-only behaviour.
    target : dict, optional
        Mapping to fill. Defaults to the module-level ``user_data``.

    Raises
    ------
    KeyError
        If ``id_column`` or ``feature_name`` is not a column of the file.
    """
    if target is None:
        # Resolve the default at call time so the module-level dict is used
        target = user_data
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            entity_id = row[id_column]
            value = row[feature_name]
            # setdefault works for both plain dicts and defaultdicts
            target.setdefault(entity_id, {})[feature_name] = value

# Load each feature file: (CSV file name, feature column) pairs, in the order
# the features should appear in each user's feature list
for relation_file, column in (
    ('ageRel.csv', 'age'),
    ('genderRel.csv', 'gender'),
    ('occupationRel.csv', 'occupation'),
    ('residesRel.csv', 'zipcode'),
):
    load_feature(PATH + relation_file, column)

# Build user features list: one (userId, ['name:value', ...]) tuple per user
user_features_raw = [
    (
        user_id,
        [f'{name}:{attrs[name]}' for name in ('age', 'gender', 'occupation', 'zipcode')],
    )
    for user_id, attrs in user_data.items()
]

# Display first 5 user features
for entry in user_features_raw[:5]:
    print(entry)

('1', ['age:24', 'gender:M', 'occupation:technician', 'zipcode:85'])
('2', ['age:53', 'gender:F', 'occupation:other', 'zipcode:94'])
('3', ['age:23', 'gender:M', 'occupation:writer', 'zipcode:32'])
('4', ['age:24', 'gender:M', 'occupation:technician', 'zipcode:43'])
('5', ['age:33', 'gender:F', 'occupation:other', 'zipcode:15'])


In [33]:
# Instantiate a defaultdict to hold item features:
# movieId -> {feature_name: value}, where unset keys default to an empty list
item_data = defaultdict(lambda: defaultdict(list))

# Read data and build item features dictionary
# Modified version to handle multiple genres
# NOTE: this redefines `load_feature` from the user-features cell; from here on
# the name refers to this item-specific version.
def load_feature(file_path, feature_name):
    """Load one feature column from a CSV file into ``item_data``.

    Rows are keyed by their ``movieId`` column. When ``feature_name`` is
    ``'genreDesc'`` each value is appended to the item's ``'genre'`` list, so
    an item can carry several genres; any other feature is stored under its
    own name and a later row overwrites an earlier one.
    """
    collects_genres = feature_name == 'genreDesc'
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        for record in csv.DictReader(csvfile):
            movie_id, feature_value = record['movieId'], record[feature_name]
            features = item_data[movie_id]
            if collects_genres:
                features['genre'].append(feature_value)
            else:
                features[feature_name] = feature_value

# Load each feature file
load_feature(PATH + 'releaseRel.csv', 'releaseDate')
load_feature(PATH + 'genreRel.csv', 'genreDesc')

# Build item features list: one (movieId, ['name:value', ...]) tuple per item.
# item_data's inner dicts default missing keys to an empty list, so indexing
# data["releaseDate"] for an item without a release-date row would emit the
# bogus feature 'releaseDate:[]' (and insert that key into item_data).
# Membership tests / .get() do not trigger the default factory, so a missing
# release date or genre list is simply left out.
item_features_raw = [
    (
        itemId,
        ([f'releaseDate:{data["releaseDate"]}'] if 'releaseDate' in data else []) +
        [f'genre:{genre}' for genre in data.get('genre', ())]
    )
    for itemId, data in item_data.items()
]

# Display first 5 item features
for item in item_features_raw[:5]:
    print(item)

('2', ['releaseDate:Jan-1995', 'genre:Action', 'genre:Adventure', 'genre:Thriller'])
('4', ['releaseDate:Jan-1995', 'genre:Action', 'genre:Comedy', 'genre:Drama'])
('17', ['releaseDate:Feb-1996', 'genre:Action', 'genre:Comedy', 'genre:Crime', 'genre:Horror', 'genre:Thriller'])
('21', ['releaseDate:Feb-1996', 'genre:Action', 'genre:Adventure', 'genre:Comedy', 'genre:Musical', 'genre:Thriller'])
('22', ['releaseDate:Feb-1996', 'genre:Action', 'genre:Drama', 'genre:War'])


In [34]:
# Load test item IDs from the json file saved
# previously from Knowledge Graph Method
with open('../experiments/test_ids.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# Extract test item IDs as strings. Downstream code matches them against
# item ids converted with .astype(str) and against the string ids in
# item_features_raw, so coerce explicitly: a numeric id in the JSON would
# otherwise match nothing and silently produce an empty test split.
test_item_ids = [str(item['movieId']) for item in data]
print(f"items on test set: {len(test_item_ids)}")
display(test_item_ids[:5])

items on test set: 30


['159', '458', '679', '128', '658']

In [None]:
# Split data into train and test sets for later evaluation
# with LightFM the same way as done in Knowledge Graph Method

# Set of held-out item ids for constant-time membership checks
test_item_id_set = set(test_item_ids)

# Boolean mask over interactions, computed once: True where the interaction
# involves a held-out (test) item. Item ids are compared as strings.
is_test_interaction = interaction_data['item_id'].astype(str).isin(test_item_id_set)

# Interaction data for training (excluding test items)
train_interactions_df = interaction_data[~is_test_interaction]
# Interaction data for testing (only test items)
test_interactions_df = interaction_data[is_test_interaction]

# Item side features for training (every item that is not a test item)
train_item_features = [item for item in item_features_raw if item[0] not in test_item_id_set]
# Item side features for testing (only testing items)
test_item_features = [item for item in item_features_raw if item[0] in test_item_id_set]

In [43]:
# Build lightfm Dataset
dataset = Dataset()

# Unique user and item ids (as strings) from the training interactions
train_user_ids = train_interactions_df['user_id'].astype(str).unique()
train_item_ids = train_interactions_df['item_id'].astype(str).unique()

# Unique feature labels from user features and training item features
user_feature_set = set(f for _, feats in user_features_raw for f in feats)
item_feature_set = set(f for _, feats in train_item_features for f in feats)

# Fit the id and feature mappings (a full fit, not fit_partial)
dataset.fit(
    users=train_user_ids,
    items=train_item_ids,
    user_features=user_feature_set,
    item_features=item_feature_set
)

# Build training matrices from (user_id, item_id, rating) triples.
# Zipping the columns avoids creating a Series per row as iterrows() does.
(interactions, weights) = dataset.build_interactions(
    list(zip(
        train_interactions_df['user_id'].astype(str),
        train_interactions_df['item_id'].astype(str),
        train_interactions_df['rating'],
    ))
)

# The feature builders raise ValueError for an id that was not passed to
# dataset.fit(), so keep only feature rows whose id is in the fitted mappings.
# This is a no-op when every user/item with features also has training
# interactions.
known_user_ids = set(train_user_ids)
known_item_ids = set(train_item_ids)

user_features = dataset.build_user_features(
    [(user_id, feats) for user_id, feats in user_features_raw if user_id in known_user_ids]
)
item_features = dataset.build_item_features(
    [(item_id, feats) for item_id, feats in train_item_features if item_id in known_item_ids]
)