In [27]:
import pandas as pd
import numpy as np
from lightfm import LightFM

## 1. Data Pre-Proceesing

### A- User Interactions

In [28]:
ratings_df = pd.read_csv("Data/Data_LightFm/ratings.csv")

In [29]:
ratings_df["userId"] = ratings_df["userId"].astype("int32")
ratings_df["movieId"] = ratings_df["movieId"].astype("int32")
ratings_df["rating"] = ratings_df["rating"].astype("float32")

In [30]:
#ratings_df.drop(["timestamp"], axis=1, inplace=True)
ratings_df.head(2)

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5


In [31]:
ratings_df.values

array([[1.00000e+00, 2.00000e+00, 3.50000e+00],
       [1.00000e+00, 2.90000e+01, 3.50000e+00],
       [1.00000e+00, 3.20000e+01, 3.50000e+00],
       ...,
       [1.38493e+05, 6.96440e+04, 3.00000e+00],
       [1.38493e+05, 7.02860e+04, 5.00000e+00],
       [1.38493e+05, 7.16190e+04, 2.50000e+00]])

### B- Movie Features

In [32]:
movies_df = pd.read_csv("Data/Data_LightFm/movies.csv")

In [33]:
movies_df.head(2)

Unnamed: 0,movieId,avg_movie_rating,title,movie_youth_rate,movie_popularity_rate,Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,3.92124,toy story,0.83871,0.738297,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,3.211977,jumanji,0.83871,0.330446,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
movies_df = movies_df.round(2)

In [35]:
movies_df.columns

Index(['movieId', 'avg_movie_rating', 'title', 'movie_youth_rate',
       'movie_popularity_rate', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'War', 'Western'],
      dtype='object')

In [36]:
movies_df.drop(["avg_movie_rating","movie_youth_rate","movie_popularity_rate"], inplace=True, axis=1)

In [37]:
movie_features_matrix = pd.DataFrame(
    {
        "movieId": movies_df["movieId"],
        "features": movies_df[movies_df.columns.drop("movieId")].to_dict(
            orient="records"
        ),
    }
)
movie_features_matrix.head(2)

Unnamed: 0,movieId,features
0,1,"{'title': 'toy story', 'Action': 0, 'Adventure..."
1,2,"{'title': 'jumanji', 'Action': 0, 'Adventure':..."


In [38]:
movie_features_matrix["features"] = movie_features_matrix["features"].apply(
    lambda x: list(f"{k}:{v}" for k, v in x.items())
)

In [39]:
movie_features = (
    movie_features_matrix["features"].explode().reset_index(drop=True).unique()
)
movie_features

array(['title:toy story', 'Action:0', 'Adventure:1', ...,
       'title:feuer eis  dosenbier', 'title:the pirates',
       'title:rentun ruusu'], dtype=object)

In [40]:
movie_features_matrix=list(zip(movie_features_matrix.movieId, movie_features_matrix.features))
movie_features_matrix

[(1,
  ['title:toy story',
   'Action:0',
   'Adventure:1',
   'Animation:1',
   'Children:1',
   'Comedy:1',
   'Crime:0',
   'Documentary:0',
   'Drama:0',
   'Fantasy:1',
   'Film-Noir:0',
   'Horror:0',
   'IMAX:0',
   'Musical:0',
   'Mystery:0',
   'Romance:0',
   'Sci-Fi:0',
   'Thriller:0',
   'War:0',
   'Western:0']),
 (2,
  ['title:jumanji',
   'Action:0',
   'Adventure:1',
   'Animation:0',
   'Children:1',
   'Comedy:0',
   'Crime:0',
   'Documentary:0',
   'Drama:0',
   'Fantasy:1',
   'Film-Noir:0',
   'Horror:0',
   'IMAX:0',
   'Musical:0',
   'Mystery:0',
   'Romance:0',
   'Sci-Fi:0',
   'Thriller:0',
   'War:0',
   'Western:0']),
 (3,
  ['title:grumpier old men',
   'Action:0',
   'Adventure:0',
   'Animation:0',
   'Children:0',
   'Comedy:1',
   'Crime:0',
   'Documentary:0',
   'Drama:0',
   'Fantasy:0',
   'Film-Noir:0',
   'Horror:0',
   'IMAX:0',
   'Musical:0',
   'Mystery:0',
   'Romance:1',
   'Sci-Fi:0',
   'Thriller:0',
   'War:0',
   'Western:0']),
 (4,


---

## 2. Dataset preparation

In [41]:
from lightfm.data import Dataset

dataset = Dataset()

In [42]:
dataset.fit(
    users=ratings_df["userId"],
    items=ratings_df["movieId"],
    item_features=movie_features,
)

In [43]:
dataset.model_dimensions()

(138493, 52358)

In [44]:
dataset.item_features_shape()

(26689, 52358)

### A- Interactions

In [45]:
interactions, weights = dataset.build_interactions(ratings_df.values)

In [46]:
from lightfm.cross_validation import random_train_test_split

train_interactions, test_interactions = random_train_test_split(
    interactions, test_percentage=0.2, random_state=42
)
train_weights, test_weights = random_train_test_split(
    weights, test_percentage=0.2, random_state=42
)

In [47]:
train_interactions, test_interactions

(<138493x26689 sparse matrix of type '<class 'numpy.int32'>'
 	with 15999532 stored elements in COOrdinate format>,
 <138493x26689 sparse matrix of type '<class 'numpy.int32'>'
 	with 3999883 stored elements in COOrdinate format>)

In [48]:
train_weights, test_weights

(<138493x26689 sparse matrix of type '<class 'numpy.float32'>'
 	with 15999532 stored elements in COOrdinate format>,
 <138493x26689 sparse matrix of type '<class 'numpy.float32'>'
 	with 3999883 stored elements in COOrdinate format>)

### B- Movies Features

In [49]:
processed_movie_features = dataset.build_item_features(movie_features_matrix)

---

## 3. Model

In [50]:
model = LightFM(loss="warp", learning_schedule="adagrad", random_state=42)
model.fit(
    interactions=train_interactions,
    sample_weight=train_weights,
    item_features=processed_movie_features,
    epochs=50,
    num_threads=14,
    verbose=True,
)

Epoch:  18%|█▊        | 9/50 [06:53<29:02, 42.51s/it]

---

## 4. Evaulation

In [None]:
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

train_roc_auc = auc_score(
    model, train_interactions, item_features=processed_movie_features, num_threads=14
).mean()
test_roc_auc = auc_score(
    model, test_interactions, item_features=processed_movie_features, num_threads=14
).mean()

train_precision = precision_at_k(
    model, train_interactions, item_features=processed_movie_features, num_threads=14
).mean()
test_precision = precision_at_k(
    model,
    test_interactions,
    train_interactions=train_interactions,
    item_features=processed_movie_features,
    k=15,
    num_threads=14,
).mean()

train_recall = recall_at_k(
    model, train_interactions, item_features=processed_movie_features, num_threads=14
).mean()
test_recall = recall_at_k(
    model,
    test_interactions,
    train_interactions=train_interactions,
    k=15,
    item_features=processed_movie_features,
    num_threads=14,
).mean()

In [None]:
print(f"Train ROC AUC: {train_roc_auc:.2f}\tTest ROC AUC: {test_roc_auc:.2f}")
print(f"Train Precision: {train_precision:.2f}\tTest Precision: {test_precision:.2f}")
print(f"Train Recall: {train_recall:.2f}\tTest Recall: {test_recall:.2f}")

Train ROC AUC: 0.99	Test ROC AUC: 0.98
Train Precision: 0.47	Test Precision: 0.24
Train Recall: 0.08	Test Recall: 0.18
