In [4]:
!python --version

Python 3.9.21


In [1]:
import pandas as pd

# Load ratings data
ratings = pd.read_json('data/ratings.jsonl', lines=True)

# Load content data (item features such as genres and year)
content = pd.read_json('data/content.jsonl', lines=True)

# Display a sample of the data
print(ratings.head())
print(content.head())


KeyboardInterrupt: 

In [6]:
# Preprocess content features
content['Year'] = pd.to_numeric(content['Year'], errors='coerce')  # Ensure year is numeric

# Flatten item features into (item_id, feature_name) format
item_features_data = [
    (row['ItemId'], feature)
    for _, row in content.iterrows()
    for feature in (row['Genre'].split(', ') + [str(row['Year'])])  # Combine genre and year as features
]

# Check the format of item features data
print(item_features_data[:5])  # Display first 5 items with features



[('c9f0f895fb', 'Documentary'), ('c9f0f895fb', 'Short'), ('c9f0f895fb', '1894.0'), ('d3d9446802', 'Documentary'), ('d3d9446802', 'Short')]


In [7]:
# Collect all unique features from the content data
all_features = set()
for _, row in content.iterrows():
    # Add genres and year as features
    features = row['Genre'].split(', ') + [str(row['Year'])]
    all_features.update(features)

# Convert to a list
all_features = list(all_features)

print(f"Total unique features: {len(all_features)}")
print(all_features[:10])  # Display a sample


Total unique features: 156
['2020.0', '1976.0', '2015.0', 'Family', '1993.0', '1965.0', '1974.0', '1983.0', '1995.0', '2009.0']


In [8]:
from lightfm.data import Dataset

# Initialize the dataset
dataset = Dataset()

# Include all unique item IDs from both ratings and content datasets
all_item_ids = set(ratings['ItemId'].unique()).union(content['ItemId'].unique())

# Fit the dataset with all user and item IDs
dataset.fit(users=ratings['UserId'].unique(),
            items=list(all_item_ids),
            item_features=all_features)






In [9]:
# Build item features matrix
item_features_data = [
    (row['ItemId'], row['Genre'].split(', ') + [str(row['Year'])])
    for _, row in content.iterrows()
]

item_features_matrix = dataset.build_item_features(item_features_data, normalize=True)

# Check the shape of the matrix
print(f"Item features matrix shape: {item_features_matrix.shape}")


Item features matrix shape: (38012, 38168)


In [10]:
item_features_matrix

<38012x38168 sparse matrix of type '<class 'numpy.float32'>'
	with 160606 stored elements in Compressed Sparse Row format>

In [11]:
from lightfm import LightFM

# Build interactions matrix from ratings data
(interactions, _) = dataset.build_interactions(
    [(row['UserId'], row['ItemId'], row['Rating']) for _, row in ratings.iterrows()]
)

In [12]:
import os

os.cpu_count()

4

: 

In [None]:
from tqdm import tqdm  # For a progress bar

# Initialize the LightFM model
model = LightFM(loss='warp')

model.fit(interactions, item_features=item_features_matrix, epochs=10, num_threads=os.cpu_count(), verbose=True)
print(f"Epoch {epoch + 1}/{num_epochs} completed.")
    
print("Model training completed.")

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]