In [45]:
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset



In [46]:
# Load users with searches and products
train = pd.read_csv("./dataset/train_LFM.csv")
test = pd.read_csv("./dataset/test_LFM.csv")

# Warning, not using relevant information that is relevant only wrt search AND property together such as the distance user-property, or better price than competitor...
# Not using: position, price_usd, orig_destination_distance, random_bool, booking_bool, comp_inv, comp_rate_percent_diff, interaction
user_src_feats = [
    "site_id", "visitor_location_country_id", "srch_destination_id", 
    "srch_length_of_stay", "srch_adults_count", "srch_children_count", "srch_room_count", 
    "srch_saturday_night_bool", #"srch_query_affinity_score"
    ]
prop_feats = ["prop_country_id", "prop_starrating", "prop_review_score", 
              "prop_brand_bool", "prop_location_score1", "prop_location_score2", 
              "prop_log_historical_price", "price_usd", "promotion_flag"
              ]

In [47]:
prop_ids = train["prop_id"].unique()
src_ids = train["srch_id"].unique()

In [48]:
prop_features = []
src_features = []

for idx, prop_id in enumerate(prop_ids[:2000]):
    if idx%5000 == 0:
        print("Prop percentage: "+str(100*idx/len(prop_ids))+"%")

    current_prop = train[train["prop_id"] == prop_id]
    features = {
        "prop_country_id" : current_prop["prop_country_id"].mean(),
        "prop_starrating" : current_prop["prop_starrating"].mean(),
        "prop_review_score" : current_prop["prop_review_score"].mean() if pd.notnull(current_prop["prop_review_score"].mean()) else train["prop_review_score"].mean(),
        "prop_brand_bool" : current_prop["prop_brand_bool"].mean(), 
        "prop_location_score1" : current_prop["prop_location_score1"].mean(), 
        "prop_location_score2" : current_prop["prop_location_score2"].mean() if pd.notnull(current_prop["prop_location_score2"].mean()) else train["prop_location_score2"].mean(),
        "prop_log_historical_price" : current_prop["prop_log_historical_price"].mean(), 
        "price_usd" : current_prop["price_usd"].mean(), 
        "promotion_flag" : current_prop["promotion_flag"].mean()
    }
    prop_features.append((prop_id, features))
    
for idx, src_id in enumerate(src_ids[:2000]):
    if idx%5000 == 0:
        print("Srch percentage: "+str(100*idx/len(src_ids))+"%")

    current_src = train[train["srch_id"] == src_id]
    features = {
        "visitor_location_country_id" : current_src["visitor_location_country_id"].iloc[0], 
        "srch_destination_id" : current_src["srch_destination_id"].iloc[0],
        "srch_length_of_stay" : current_src["srch_length_of_stay"].iloc[0], 
        "srch_adults_count" : current_src["srch_adults_count"].iloc[0], 
        "srch_children_count" : current_src["srch_children_count"].iloc[0],
        "srch_room_count" : current_src["srch_room_count"].iloc[0], 
        "srch_saturday_night_bool" : current_src["srch_saturday_night_bool"].iloc[0], 
        #"srch_query_affinity_score" : current_src["srch_query_affinity_score"].iloc[0] if pd.notnull(current_src["srch_query_affinity_score"].iloc[0]) else current_src["srch_query_affinity_score"].mean()
    }
    src_features.append((src_id, features))

Prop percentage: 0.0%
Srch percentage: 0.0%


In [49]:
interaction_list = []

interaction_df = train.groupby(["srch_id", "prop_id"])["interaction"].first().reset_index()
for tuple in interaction_df.itertuples():
    interaction_list.append((tuple.srch_id, tuple.prop_id, tuple.interaction))

In [50]:
# Initialize the dataset
dataset = Dataset()

# Fit the dataset with user and item ids and features
dataset.fit(
    users=src_ids,
    items=prop_ids,
    user_features=[
        "visitor_location_country_id",
        "srch_destination_id",
        "srch_length_of_stay",
        "srch_adults_count",
        "srch_children_count",
        "srch_room_count",
        "srch_saturday_night_bool",
        #"srch_query_affinity_score",
    ],
    item_features=[
        "prop_country_id",
        "prop_starrating",
        "prop_review_score",
        "prop_brand_bool",
        "prop_location_score1",
        "prop_location_score2",
        "prop_log_historical_price",
        "price_usd",
        "promotion_flag",
    ],
)

(interactions, interactions_weights) = dataset.build_interactions(interaction_list)
src_feat_matrix = dataset.build_user_features(src_features)
prop_feat_matrix = dataset.build_item_features(prop_features)


In [51]:
# Initialize and train the model
latent_feats = 30
training_epochs = 30

model = LightFM(loss='warp', no_components=latent_feats)
model.fit(interactions, user_features=src_feat_matrix, item_features=prop_feat_matrix, epochs=training_epochs, num_threads=8, verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


<lightfm.lightfm.LightFM at 0x10a086e20>

In [53]:
from scipy.sparse import csr_matrix

# Predicting the score for a user-item pair
user_id = 1
prop_id = 3309

user_feats = None
for key, val in src_features:
    if key == user_id:
        user_feats = list(val.values())
        break

prop_feats = None
for key, val in prop_features:
    if key == prop_id:
        prop_feats = list(val.values())
        break

# Convert feature lists to sparse matrices
# user_feats_matrix = csr_matrix(user_feats).reshape(1, -1)
# prop_feats_matrix = csr_matrix(prop_feats).reshape(1, -1)

current_user_feats = [user_id, {
        "visitor_location_country_id":user_feats[0],
        "srch_destination_id":user_feats[1],
        "srch_length_of_stay":user_feats[2],
        "srch_adults_count":user_feats[3],
        "srch_children_count":user_feats[4],
        "srch_room_count":user_feats[5],
        "srch_saturday_night_bool":user_feats[6],
        #"srch_query_affinity_score",
    }]
current_user_feats = dataset.build_user_features([current_user_feats])

score = model.predict(user_ids=np.array([user_id]), item_ids=np.array([prop_id]), user_features=current_user_feats)
print(f"Predicted score for user {user_id} and item {user_id}: {score[0]}")

# Evaluating the model
from lightfm.evaluation import precision_at_k, auc_score

train_precision = precision_at_k(model, interactions, k=5, user_features=user_features_matrix, item_features=item_features_matrix).mean()
train_auc = auc_score(model, interactions, user_features=user_features_matrix, item_features=item_features_matrix).mean()

print(f'Train precision at k: {train_precision:.2f}')
print(f'Train AUC score: {train_auc:.2f}')


Predicted score for user 1 and item 1: 0.5076165199279785


KeyboardInterrupt: 