In [2]:
!python --version

Python 3.9.21


In [3]:
import pandas as pd

# Load ratings data
ratings = pd.read_json('data/ratings.jsonl', lines=True)

# Load content data (item features such as genres and year)
content = pd.read_json('data/content.jsonl', lines=True)

# Display a sample of the data
print(ratings.head())
print(content.head())


       UserId      ItemId           Timestamp  Rating
0  c4ca4238a0  91766eac45 2013-10-05 22:00:50       8
1  c81e728d9d  5c739554f7 2013-08-17 16:26:38       9
2  c81e728d9d  48f6d7ce7c 2013-08-17 13:28:27       8
3  c81e728d9d  e9318d627a 2013-06-15 15:38:09       1
4  a87ff679a2  17e6357973 2014-01-31 23:27:59       8
       ItemId                                          Title  Year      Rated  \
0  c9f0f895fb         Edison Kinetoscopic Record of a Sneeze  1894        N/A   
1  d3d9446802                            Leaving the Factory  1895  Not Rated   
2  c20ad4d76f                         The Arrival of a Train  1896  Not Rated   
3  8e296a067a  The Oxford and Cambridge University Boat Race  1895        N/A   
4  54229abfcf                         The House of the Devil  1896  Not Rated   

      Released Runtime               Genre                        Director  \
0  09 Jan 1894   1 min  Documentary, Short            William K.L. Dickson   
1  22 Mar 1895   1 min  Documenta

In [4]:
# Preprocess content features
content['Year'] = pd.to_numeric(content['Year'], errors='coerce')  # Ensure year is numeric

# Flatten item features into (item_id, feature_name) format
item_features_data = [
    (row['ItemId'], feature)
    for _, row in content.iterrows()
    for feature in (row['Genre'].split(', ') + [str(row['Year'])])  # Combine genre and year as features
]

# Check the format of item features data
print(item_features_data[:5])  # Display first 5 items with features



[('c9f0f895fb', 'Documentary'), ('c9f0f895fb', 'Short'), ('c9f0f895fb', '1894.0'), ('d3d9446802', 'Documentary'), ('d3d9446802', 'Short')]


In [5]:
# Collect all unique features from the content data
all_features = set()
for _, row in content.iterrows():
    # Add genres and year as features
    features = row['Genre'].split(', ') + [str(row['Year'])]
    all_features.update(features)

# Convert to a list
all_features = list(all_features)

print(f"Total unique features: {len(all_features)}")
print(all_features[:10])  # Display a sample


Total unique features: 156
['1962.0', '1954.0', 'Drama', '1966.0', '1926.0', 'Crime', 'Talk-Show', '2007.0', '2012.0', '1913.0']


In [6]:
from lightfm.data import Dataset

# Initialize the dataset
dataset = Dataset()

# Include all unique item IDs from both ratings and content datasets
all_item_ids = set(ratings['ItemId'].unique()).union(content['ItemId'].unique())

# Fit the dataset with all user and item IDs
dataset.fit(users=ratings['UserId'].unique(),
            items=list(all_item_ids),
            item_features=all_features)






In [7]:
# Build item features matrix
item_features_data = [
    (row['ItemId'], row['Genre'].split(', ') + [str(row['Year'])])
    for _, row in content.iterrows()
]

item_features_matrix = dataset.build_item_features(item_features_data, normalize=True)

# Check the shape of the matrix
print(f"Item features matrix shape: {item_features_matrix.shape}")


Item features matrix shape: (38012, 38168)


In [8]:
item_features_matrix

<38012x38168 sparse matrix of type '<class 'numpy.float32'>'
	with 160606 stored elements in Compressed Sparse Row format>

In [9]:
from lightfm import LightFM

# Build interactions matrix from ratings data
(interactions, _) = dataset.build_interactions(
    [(row['UserId'], row['ItemId'], row['Rating']) for _, row in ratings.iterrows()]
)

: 

In [None]:
from tqdm import tqdm  # For a progress bar

# Initialize the LightFM model
model = LightFM(loss='warp')

model.fit_partial(interactions, item_features=item_features_matrix, epochs=10, num_threads=1, verbose=True)
print(f"Epoch {epoch + 1}/{num_epochs} completed.")
    
print("Model training completed.")

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]