# Examples of using explicit features in implicit.ALS model with RecTools

Some models allow using explicit user (sex, age, etc.) and item (genre, year, ...) features.



- Building ALS model
- Adding features to model
- Advanced feature usage

In [2]:
import os

import numpy as np
import pandas as pd

from implicit.als import AlternatingLeastSquares

from rectools import Columns
from rectools.dataset import Dataset, SparseFeatures, DenseFeatures, IdMap, Interactions
from rectools.metrics import (
    MAP,
    MeanInvUserFreq,
    calc_metrics,
)
from rectools.models import ImplicitALSWrapperModel

In [3]:
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

## Load data

In [4]:
%%time
!wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip

--2022-07-28 11:32:59--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5,6M) [application/zip]
Saving to: ‘ml-1m.zip.5’


2022-07-28 11:33:01 (3,88 MB/s) - ‘ml-1m.zip.5’ saved [5917549/5917549]

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         
CPU times: user 41.1 ms, sys: 26.6 ms, total: 67.7 ms
Wall time: 2.45 s


In [5]:
%%time
ratings = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    engine="python",  # Because of 2-chars separators
    header=None,
    names=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime],
)
print(ratings.shape)
ratings.head()

(1000209, 4)
CPU times: user 4.01 s, sys: 177 ms, total: 4.19 s
Wall time: 4.21 s


Unnamed: 0,user_id,item_id,weight,datetime
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
ratings["datetime"] = pd.to_datetime(ratings["datetime"] * 10 ** 9)
ratings["datetime"].min(), ratings["datetime"].max()

(Timestamp('2000-04-25 23:05:32'), Timestamp('2003-02-28 17:49:50'))

In [7]:
%%time
movies = pd.read_csv(
    "ml-1m/movies.dat",
    sep="::",
    engine="python",  # Because of 2-chars separators
    header=None,
    names=[Columns.Item, "title", "genres"],
)
print(movies.shape)
movies.head()

(3883, 3)
CPU times: user 11.5 ms, sys: 1.6 ms, total: 13.1 ms
Wall time: 12.1 ms


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
%%time
users = pd.read_csv(
    "ml-1m/users.dat",
    sep="::",
    engine="python",  # Because of 2-chars separators
    header=None,
    names=[Columns.User, "sex", "age", "occupation", "zip_code"],
)
print(users.shape)
users.head()

(6040, 5)
CPU times: user 22.8 ms, sys: 3.26 ms, total: 26 ms
Wall time: 25.1 ms


Unnamed: 0,user_id,sex,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## Split by train / test and build model without features

For correct model comparison it's better to use cross-valiation, but for simplicity here we are splitting only once

In [9]:
split_dt = pd.Timestamp("2003-01-01")
df_train = ratings.loc[ratings["datetime"] < split_dt]
df_test = ratings.loc[ratings["datetime"] >= split_dt]

In [10]:
metrics = {"MAP": MAP(10), "Novelty": MeanInvUserFreq(10)}

In [11]:
dataset = Dataset.construct(df_train)

In [12]:
def make_base_model():
    # Need to create new base model every time to use same random initializations
    return AlternatingLeastSquares(factors=32, random_state=42, num_threads=4)

In [13]:
%%time
model = ImplicitALSWrapperModel(make_base_model())
model.fit(dataset)
recos = model.recommend(
    users=df_test[Columns.User].unique(),
    dataset=dataset,
    k=10,
    filter_viewed=True,
)
calc_metrics(metrics, recos, df_test, df_train)

CPU times: user 29.1 s, sys: 12.6 s, total: 41.7 s
Wall time: 7.66 s


{'MAP': 0.017172445629322065, 'Novelty': 2.5387145697434614}

## Prepare features

There are 2 kind of features: categorical that are represented as sparse and numerical that are represented as dense.

Sparse is much more popular, even if you have dense features it's often better to binarize them.

Here we have mostly categorical features, only 2 numerical:
- user age, but it's binarized already;
- movie year, we'll binarize it now.

We represent user and item features as flatten dataframes.

### User features

In [14]:
users.isna().sum()

user_id       0
sex           0
age           0
occupation    0
zip_code      0
dtype: int64

In [15]:
users.nunique()

user_id       6040
sex              2
age              7
occupation      21
zip_code      3439
dtype: int64

In [16]:
# Select only users that present in 'ratings' table
users = users.loc[users["user_id"].isin(ratings["user_id"])].copy()

There are too many zip codes, we will not use them now because methods of using features that are available for now work badly with big number of features

In [17]:
# For 3 features generate common flatten table with its values
# Here all features have 1 value per user
# But there can be more than 1 value of feature per user (item) if feature is categorical
user_features_frames = []
for feature in ["sex", "age", "occupation"]:
    feature_frame = users.reindex(columns=["user_id", feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,1,F,sex
1,2,M,sex
2,3,M,sex
3,4,M,sex
4,5,M,sex


### Item features

Here we will use movie genre and year

In [18]:
movies.isna().sum()

item_id    0
title      0
genres     0
dtype: int64

In [19]:
# Select only items that present in 'ratings' table
movies = movies.loc[movies["item_id"].isin(ratings["item_id"])].copy()

#### Genre

In [20]:
# Explode genres to flatten table
movies["genre"] = movies["genres"].str.split("|")
genre_feature = movies[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

Unnamed: 0,id,value,feature
0,1,Animation,genre
0,1,Children's,genre
0,1,Comedy,genre
1,2,Adventure,genre
1,2,Children's,genre


#### Year

In [21]:
# Binarize year to 10 bins and use it as categorica feature
movies["year"] = movies["title"].str.extract(r"\((\d{4})\)").astype(int)
_, bins = pd.qcut(movies["year"], 10, retbins=True)
labels = bins[:-1]
print(labels)

[1919. 1959. 1977. 1986. 1991. 1994. 1995. 1996. 1997. 1999.]


In [22]:
year_feature = pd.DataFrame(
    {
        "id": movies["item_id"],
        "value": pd.cut(movies["year"], bins=bins, labels=labels),
        "feature": "year",
    }
)
year_feature.head()

Unnamed: 0,id,value,feature
0,1,1994.0,year
1,2,1994.0,year
2,3,1994.0,year
3,4,1994.0,year
4,5,1994.0,year


#### Combine

In [23]:
item_features = pd.concat((genre_feature, year_feature))

## Build model with features

There are 2 ways to use features in ALS that implemented in RecTools: 'separately' and 'together'.

Both methods work with dense features. But we prepared data for sparse features because it's more convenient here.
In model sparse matrix will be converted to dense. Be carefull with big datasets, limit the number of features.

**Note:** Training model with features is available for CPU and GPU (as well as training native ALS without features). It is managed by `use_gpu` parameter in `implicit.als.AlternatingLeastSquares`.

### Attempt to use all features

In [24]:
%%time
dataset = Dataset.construct(
    interactions_df=df_train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "occupation"],
    item_features_df=item_features,
    cat_item_features=["year", "genre"],  # If we didn't binarize year, we wouldn't set it here
)

for fit_features_together in (True, False):
    model = ImplicitALSWrapperModel(make_base_model(), fit_features_together=fit_features_together)
    model.fit(dataset)
    recos = model.recommend(
        users=df_test[Columns.User].unique(),
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, df_test, df_train)
    print(f"Fit features together: {fit_features_together}.  Metrics: {metric_values}")



Fit features together: True.  Metrics: {'MAP': 0.01388652631210521, 'Novelty': 2.5681515162134265}
Fit features together: False.  Metrics: {'MAP': 0.01998701037744824, 'Novelty': 2.2928462029811394}
CPU times: user 1min 54s, sys: 42.5 s, total: 2min 36s
Wall time: 27.5 s


Here we can see decreased MAP for joint feature fitting and increased MAP for separate fitting.

Let's analyze which features have greater influence.

**Note:** We get warning because for methods of using features that are implemented for now features must be represented as dense array, so be careful and do not use big number of features.

### Attempt to use only item features

In [25]:
%%time
dataset = Dataset.construct(
    interactions_df=df_train,
    item_features_df=item_features,
    cat_item_features=["year", "genre"],
)

for fit_features_together in (True, False):
    model = ImplicitALSWrapperModel(make_base_model(), fit_features_together=fit_features_together)
    model.fit(dataset)
    recos = model.recommend(
        users=df_test[Columns.User].unique(),
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, df_test, df_train)
    print(f"Fit features together: {fit_features_together}.  Metrics: {metric_values}")

Fit features together: True.  Metrics: {'MAP': 0.01520886994621532, 'Novelty': 2.5870053665186767}
Fit features together: False.  Metrics: {'MAP': 0.014805021434661019, 'Novelty': 2.8426244093501594}
CPU times: user 1min 43s, sys: 34.9 s, total: 2min 18s
Wall time: 22.9 s


Here we see decreased MAP values.

### Attempt to use only user features

In [26]:
%%time
dataset = Dataset.construct(
    interactions_df=df_train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "occupation"],
)

for fit_features_together in (True, False):
    model = ImplicitALSWrapperModel(make_base_model(), fit_features_together=fit_features_together)
    model.fit(dataset)
    recos = model.recommend(
        users=df_test[Columns.User].unique(),
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, df_test, df_train)
    print(f"Fit features together: {fit_features_together}.  Metrics: {metric_values}")

Fit features together: True.  Metrics: {'MAP': 0.013806802876374652, 'Novelty': 2.506039447145849}
Fit features together: False.  Metrics: {'MAP': 0.02285698694879273, 'Novelty': 1.9870892804731672}
CPU times: user 1min 46s, sys: 40.2 s, total: 2min 27s
Wall time: 30.3 s


Here we see that user features increase MAP a lot if fit separately, but Novelty decreases.

### Attempt to use only user features with increased features weight

In [27]:
%%time
dataset = Dataset.construct(
    interactions_df=df_train.eval("weight = weight / 10"),  # decrease interactions weight => increase features weight
    user_features_df=user_features,
    cat_user_features=["sex", "age", "occupation"],
)
for fit_features_together in (True, False):
    model = ImplicitALSWrapperModel(make_base_model(), fit_features_together=fit_features_together)
    model.fit(dataset)
    recos = model.recommend(
        users=df_test[Columns.User].unique(),
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, df_test, df_train)
    print(f"Fit features together: {fit_features_together}.  Metrics: {metric_values}")



Fit features together: True.  Metrics: {'MAP': 0.01676492598017664, 'Novelty': 2.2798496191085693}
Fit features together: False.  Metrics: {'MAP': 0.016931541817397328, 'Novelty': 2.069402889295814}
CPU times: user 1min 29s, sys: 28 s, total: 1min 57s
Wall time: 21.1 s


If we use much bigger weight for features compared to weight of interactions, values of metrics become more similar.

## Advanced features usage

In [28]:
# Prepare explicit_id <-> implicit_id mapping
id_map = IdMap.from_values(["u1", "u2", "u3"])
display(id_map.to_internal)
display(id_map.to_external)

u1    0
u2    1
u3    2
dtype: int64

0    u1
1    u2
2    u3
dtype: object

### Sparse features

When using `Dataset.construct` with features, we call `SparseFeatures.from_flatten`.

All features are converted to CSR matrix.

In [29]:
features_df = pd.DataFrame(
    [
        ["u1", "feature_1", "x"],
        ["u1", "feature_1", "y"],
        ["u1", "feature_2", 123],
        ["u2", "feature_2", 123],
        ["u3", "feature_2", 150],
        ["u3", "feature_1", "x"],
    ],
    columns=["id", "feature", "value"],
)

In [30]:
# Categorical features are converted to one-hot encoded format
sf = SparseFeatures.from_flatten(features_df, id_map, cat_features=["feature_1", "feature_2"])
print(sf.names)
sf.values.toarray()

(('feature_1', 'x'), ('feature_1', 'y'), ('feature_2', 123), ('feature_2', 150))


array([[1., 1., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 1.]], dtype=float32)

In [31]:
# Non-categorical features remain 'as is'
sf = SparseFeatures.from_flatten(features_df, id_map, cat_features=["feature_1"])
print(sf.names)
sf.values.toarray()

(('feature_2', '__is_direct_feature'), ('feature_1', 'x'), ('feature_1', 'y'))


array([[123.,   1.,   1.],
       [123.,   0.,   0.],
       [150.,   1.,   0.]], dtype=float32)

**Important:** All non-numeric features must be categorical

In [32]:
# If you want to increase feature weight you can use 'weight' column
features_weighted_df = features_df.copy()
features_weighted_df["weight"] = 1
features_weighted_df.loc[[0, 1, 2], "weight"] = 2
sf = SparseFeatures.from_flatten(features_weighted_df, id_map, cat_features=["feature_1"])
print(sf.names)
sf.values.toarray()

(('feature_2', '__is_direct_feature'), ('feature_1', 'x'), ('feature_1', 'y'))


array([[246.,   2.,   2.],
       [123.,   0.,   0.],
       [150.,   1.,   0.]], dtype=float32)

### Dense features

If you have features in 'classic' format, you can use `DenseFeatures`.

When creating dataset with `Dataset.construct` use parameter `make_dense_user_features` (`make_dense_item_features`), in this case `DenseFeatures.from_dataframe` will be used.

All features are saved 'as is'.

**Important:** Use only numeric features.

**Important:** You must set features for all objects (users or features). If you do not have some feature for some user (item) then use any method (zero, mean value, etc.) to fill it.

In [33]:
features_df = pd.DataFrame(
    [
        ["u1", 10, 0.5, 22],
        ["u2", 202, 0, 2.5],
        ["u3", 0.01, 1, 10],
    ],
    columns=["id", "feature_1", "feature_2", "feature_3"],
)

In [34]:
dense_features = DenseFeatures.from_dataframe(features_df, id_map)
print(dense_features.names)
dense_features.values

('feature_1', 'feature_2', 'feature_3')


array([[1.00e+01, 5.00e-01, 2.20e+01],
       [2.02e+02, 0.00e+00, 2.50e+00],
       [1.00e-02, 1.00e+00, 1.00e+01]], dtype=float32)

### Building dataset with manually created features

If you want, you can create features manually and then build dataset with it

In [35]:
# Prepare id maps
user_id_map = IdMap.from_values(["u1", "u2", "u3"])
item_id_map = IdMap.from_values(["i1", "i2"])
display(user_id_map.to_internal)
display(item_id_map.to_internal)

u1    0
u2    1
u3    2
dtype: int64

i1    0
i2    1
dtype: int64

In [36]:
# Prepare interactions
interactions_df = pd.DataFrame(
    {
        Columns.User: ["u1", "u1", "u2"],
        Columns.Item: ["i1", "i2", "i1"],
        Columns.Weight: 1,
        Columns.Datetime: 1,
    }
)
interactions = Interactions.from_raw(interactions_df, user_id_map, item_id_map)
interactions.df

Unnamed: 0,user_id,item_id,weight,datetime
0,0,0,1.0,1970-01-01 00:00:00.000000001
1,0,1,1.0,1970-01-01 00:00:00.000000001
2,1,0,1.0,1970-01-01 00:00:00.000000001


In [37]:
# Prepare user features
user_features_df = pd.DataFrame(
    [
        ["u1", "feature_1", "x"],
        ["u1", "feature_1", "y"],
        ["u1", "feature_2", 123],
        ["u2", "feature_2", 123],
        ["u3", "feature_2", 150],
        ["u3", "feature_1", "x"],
    ],
    columns=["id", "feature", "value"],
)
user_features = SparseFeatures.from_flatten(user_features_df, user_id_map, cat_features=["feature_1"])
user_features.values.toarray()

array([[123.,   1.,   1.],
       [123.,   0.,   0.],
       [150.,   1.,   0.]], dtype=float32)

In [38]:
# Prepare item features
item_features_df = pd.DataFrame(
    [
        ["i1", 10, 0.5, 22],
        ["i2", 202, 0, 2.5],
    ],
    columns=["id", "feature_1", "feature_2", "feature_3"],
)
item_features = DenseFeatures.from_dataframe(item_features_df, item_id_map)
item_features.values

array([[ 10. ,   0.5,  22. ],
       [202. ,   0. ,   2.5]], dtype=float32)

**Note:** `"u3"` not in `interactions_df` but we can add it manually to dataset (and features) using `IdMap`. It's not possible if you use `Dataset.construct`.

In [39]:
Dataset(
    user_id_map=user_id_map,
    item_id_map=item_id_map,
    interactions=interactions,
    user_features=user_features,
    item_features=item_features,
)

Dataset(user_id_map=IdMap(to_internal=u1    0
u2    1
u3    2
dtype: int64), item_id_map=IdMap(to_internal=i1    0
i2    1
dtype: int64), interactions=Interactions(df=   user_id  item_id  weight                      datetime
0        0        0     1.0 1970-01-01 00:00:00.000000001
1        0        1     1.0 1970-01-01 00:00:00.000000001
2        1        0     1.0 1970-01-01 00:00:00.000000001), user_features=SparseFeatures(values=<3x3 sparse matrix of type '<class 'numpy.float32'>'
	with 6 stored elements in Compressed Sparse Row format>, names=(('feature_2', '__is_direct_feature'), ('feature_1', 'x'), ('feature_1', 'y'))), item_features=DenseFeatures(values=array([[ 10. ,   0.5,  22. ],
       [202. ,   0. ,   2.5]], dtype=float32), names=('feature_1', 'feature_2', 'feature_3')))